l2/main.py

"""Bootstrap compiler for the L2 language.

This file now contains working scaffolding for:

* Parsing definitions, literals, and ordinary word references.
* Respecting immediate/macro words so syntax can be rewritten on the fly.
* Emitting NASM-compatible x86-64 assembly with explicit data and return stacks.
* Driving the toolchain via ``nasm`` + ``ld``.
"""

from __future__ import annotations

import bisect
import os
import re
import sys
from pathlib import Path
TYPE_CHECKING = False
if TYPE_CHECKING:
    from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Set, Union, Tuple

try:  # lazy optional import; required for compile-time :asm execution
    from keystone import Ks, KsError, KS_ARCH_X86, KS_MODE_64
except Exception:  # pragma: no cover - optional dependency
    Ks = None
    KsError = Exception
    KS_ARCH_X86 = KS_MODE_64 = None

# Pre-compiled regex patterns used by JIT and BSS code
_RE_REL_PAT = re.compile(r'\[rel\s+(\w+)\]')
_RE_LABEL_PAT = re.compile(r'^(\.\w+|\w+):')
_RE_BSS_PERSISTENT = re.compile(r'persistent:\s*resb\s+(\d+)')
_RE_NEWLINE = re.compile('\n')
# Blanking asm bodies before tokenization: the tokenizer doesn't need asm
# content (the parser extracts it from the original source via byte offsets).
# This removes ~75% of tokens for asm-heavy programs like game_of_life.
_RE_ASM_BODY = re.compile(r'(:asm\b[^{]*\{)([^}]*)(})')
_ASM_BLANK_TBL = str.maketrans({chr(i): ' ' for i in range(128) if i != 10})
def _blank_asm_bodies(source: str) -> str:
    return _RE_ASM_BODY.sub(lambda m: m.group(1) + m.group(2).translate(_ASM_BLANK_TBL) + m.group(3), source)
DEFAULT_MACRO_EXPANSION_LIMIT = 256
_SOURCE_PATH = Path("<source>")

_struct_mod = None
def _get_struct():
    global _struct_mod
    if _struct_mod is None:
        import struct as _s
        _struct_mod = _s
    return _struct_mod


class Diagnostic:
    """Structured error/warning with optional source context and suggestions."""
    __slots__ = ('level', 'message', 'path', 'line', 'column', 'length', 'hint', 'suggestion')

    def __init__(
        self,
        level: str,
        message: str,
        path: Optional[Path] = None,
        line: int = 0,
        column: int = 0,
        length: int = 0,
        hint: str = "",
        suggestion: str = "",
    ) -> None:
        self.level = level       # "error", "warning", "note"
        self.message = message
        self.path = path
        self.line = line
        self.column = column
        self.length = length
        self.hint = hint
        self.suggestion = suggestion

    def format(self, *, color: bool = True) -> str:
        """Format the diagnostic in Rust-style with source context."""
        _RED = "\033[1;31m" if color else ""
        _YELLOW = "\033[1;33m" if color else ""
        _BLUE = "\033[1;34m" if color else ""
        _CYAN = "\033[1;36m" if color else ""
        _BOLD = "\033[1m" if color else ""
        _DIM = "\033[2m" if color else ""
        _RST = "\033[0m" if color else ""

        level_color = _RED if self.level == "error" else (_YELLOW if self.level == "warning" else _BLUE)
        parts: List[str] = []
        parts.append(f"{level_color}{self.level}{_RST}{_BOLD}: {self.message}{_RST}")

        if self.path and self.line > 0:
            loc = f"{self.path}:{self.line}"
            if self.column > 0:
                loc += f":{self.column}"
            parts.append(f"  {_BLUE}-->{_RST} {loc}")

            # Try to show the source line
            try:
                src_lines = self.path.read_text(encoding="utf-8", errors="ignore").splitlines()
                if 0 < self.line <= len(src_lines):
                    src_line = src_lines[self.line - 1]
                    line_no = str(self.line)
                    pad = " " * len(line_no)
                    parts.append(f"  {_BLUE}{pad} |{_RST}")
                    parts.append(f"  {_BLUE}{line_no} |{_RST} {src_line}")
                    if self.column > 0:
                        caret_len = max(1, self.length) if self.length else 1
                        arrow = " " * (self.column - 1) + level_color + "^" * caret_len + _RST
                        parts.append(f"  {_BLUE}{pad} |{_RST} {arrow}")
                    if self.hint:
                        parts.append(f"  {_BLUE}{pad} |{_RST} {_CYAN}= note: {self.hint}{_RST}")
                    if self.suggestion:
                        parts.append(f"  {_BLUE}{pad} |{_RST}")
                        parts.append(f"  {_BLUE}{pad} = {_CYAN}help{_RST}: {self.suggestion}")
            except Exception:
                pass

        elif self.hint:
            parts.append(f"  {_CYAN}= note: {self.hint}{_RST}")

        return "\n".join(parts)

    def __str__(self) -> str:
        if self.path and self.line > 0:
            return f"{self.level}: {self.message} at {self.path}:{self.line}:{self.column}"
        return f"{self.level}: {self.message}"


class ParseError(Exception):
    """Raised when the source stream cannot be parsed."""
    def __init__(self, message: str = "", *, diagnostic: Optional[Diagnostic] = None) -> None:
        self.diagnostic = diagnostic
        super().__init__(message)


class CompileError(Exception):
    """Raised when IR cannot be turned into assembly."""
    def __init__(self, message: str = "", *, diagnostic: Optional[Diagnostic] = None) -> None:
        self.diagnostic = diagnostic
        super().__init__(message)


class CompileTimeError(ParseError):
    """Raised when a compile-time word fails with context."""


# ---------------------------------------------------------------------------
# Tokenizer / Reader
# ---------------------------------------------------------------------------


class Token:
    __slots__ = ('lexeme', 'line', 'column', 'start', 'end', 'expansion_depth')

    def __init__(self, lexeme: str, line: int, column: int, start: int, end: int, expansion_depth: int = 0) -> None:
        self.lexeme = lexeme
        self.line = line
        self.column = column
        self.start = start
        self.end = end
        self.expansion_depth = expansion_depth

    def __repr__(self) -> str:  # pragma: no cover - debug helper
        return f"Token({self.lexeme!r}@{self.line}:{self.column})"


class SourceLocation:
    __slots__ = ('path', 'line', 'column')

    def __init__(self, path: Path, line: int, column: int) -> None:
        self.path = path
        self.line = line
        self.column = column

_SourceLocation_new = SourceLocation.__new__
_SourceLocation_cls = SourceLocation

def _make_loc(path: Path, line: int, column: int) -> SourceLocation:
    loc = _SourceLocation_new(_SourceLocation_cls)
    loc.path = path
    loc.line = line
    loc.column = column
    return loc

_READER_REGEX_CACHE: Dict[frozenset, "re.Pattern[str]"] = {}

_STACK_EFFECT_PAREN_RE = re.compile(r'\(([^)]*--[^)]*)\)')
_STACK_EFFECT_BARE_RE = re.compile(r'#\s*(\S+(?:\s+\S+)*?)\s+--\s')

def _parse_stack_effect_comment(source: str, word_token_start: int) -> Optional[int]:
    """Extract the input count from a stack-effect comment near a 'word' token.

    Looks for ``# ... (a b -- c)`` or ``# a b -- c`` on the same line as
    *word_token_start* or on the immediately preceding line.  Returns the
    number of inputs (names before ``--``) or *None* if no effect comment
    is found.
    """
    # Find the line containing the word token
    line_start = source.rfind('\n', 0, word_token_start)
    line_start = 0 if line_start == -1 else line_start + 1
    line_end = source.find('\n', word_token_start)
    if line_end == -1:
        line_end = len(source)
    lines_to_check = [source[line_start:line_end]]
    if line_start > 0:
        prev_end = line_start - 1
        prev_start = source.rfind('\n', 0, prev_end)
        prev_start = 0 if prev_start == -1 else prev_start + 1
        lines_to_check.append(source[prev_start:prev_end])

    for line in lines_to_check:
        if '#' not in line or '--' not in line:
            continue
        # Prefer parenthesized effect: # text (a b -- c)
        m = _STACK_EFFECT_PAREN_RE.search(line)
        if m:
            parts = m.group(1).split('--')
            inputs_part = parts[0].strip()
            return len(inputs_part.split()) if inputs_part else 0
        # Bare effect on same line as word: # a b -- c
        m = _STACK_EFFECT_BARE_RE.search(line)
        if m:
            return len(m.group(1).split())
    return None

class Reader:
    """Default reader; users can swap implementations at runtime."""

    def __init__(self) -> None:
        self.line = 1
        self.column = 0
        self.custom_tokens: Set[str] = {"(", ")", "{", "}", ";", ",", "[", "]"}
        self._token_order: List[str] = sorted(self.custom_tokens, key=len, reverse=True)
        self._single_char_tokens: Set[str] = {t for t in self.custom_tokens if len(t) == 1}
        self._multi_char_tokens: List[str] = [t for t in self._token_order if len(t) > 1]
        self._multi_first_chars: Set[str] = {t[0] for t in self._multi_char_tokens}

    def add_tokens(self, tokens: Iterable[str]) -> None:
        updated = False
        for tok in tokens:
            if not tok:
                continue
            if tok not in self.custom_tokens:
                self.custom_tokens.add(tok)
                updated = True
        if updated:
            self._token_order = sorted(self.custom_tokens, key=len, reverse=True)
            self._single_char_tokens = {t for t in self.custom_tokens if len(t) == 1}
            self._multi_char_tokens = [t for t in self._token_order if len(t) > 1]
            self._multi_first_chars = {t[0] for t in self._multi_char_tokens}
            self._token_re = None  # invalidate cached regex
            self._multi_char_tokens = [t for t in self._token_order if len(t) > 1]
            self._multi_first_chars = {t[0] for t in self._multi_char_tokens}

    def add_token_chars(self, chars: str) -> None:
        self.add_tokens(chars)

    def _build_token_re(self) -> "re.Pattern[str]":
        """Build a compiled regex for the current token set."""
        cache_key = frozenset(self.custom_tokens)
        cached = _READER_REGEX_CACHE.get(cache_key)
        if cached is not None:
            return cached
        singles_escaped = ''.join(re.escape(t) for t in sorted(self._single_char_tokens))
        # Word pattern: any non-delimiter char, or ; followed by alpha (;end is one token)
        if ';' in self._single_char_tokens:
            word_part = rf'(?:[^\s#"{singles_escaped}]|;(?=[a-zA-Z]))+'
        else:
            word_part = rf'[^\s#"{singles_escaped}]+'
        # Multi-char tokens (longest first)
        multi_part = ''
        if self._multi_char_tokens:
            multi_escaped = '|'.join(re.escape(t) for t in self._multi_char_tokens)
            multi_part = rf'|{multi_escaped}'
        pattern = (
            rf'"(?:[^"\\]|\\.)*"?'       # string literal (possibly unterminated)
            rf'|#[^\n]*'                  # comment
            rf'{multi_part}'              # multi-char tokens (if any)
            rf'|{word_part}'              # word
            rf'|[{singles_escaped}]'      # single-char tokens
        )
        compiled = re.compile(pattern)
        _READER_REGEX_CACHE[cache_key] = compiled
        return compiled

    def tokenize(self, source: str) -> List[Token]:
        # Lazily build/cache the token regex
        token_re = getattr(self, '_token_re', None)
        if token_re is None:
            token_re = self._build_token_re()
            self._token_re = token_re
        # Pre-compute line start offsets for O(1) amortized line/column lookup
        _line_starts = [0] + [m.end() for m in _RE_NEWLINE.finditer(source)]
        _n_lines = len(_line_starts)
        result: List[Token] = []
        _append = result.append
        _Token_new = Token.__new__
        _Token_cls = Token
        # Linear scan: tokens arrive in source order, so line index only advances
        _cur_li = 0
        _next_line_start = _line_starts[1] if _n_lines > 1 else 0x7FFFFFFFFFFFFFFF
        for m in token_re.finditer(source):
            start, end = m.span()
            fc = source[start]
            if fc == '#':
                continue  # skip comment
            text = source[start:end]
            if fc == '"':
                if end - start < 2 or source[end - 1] != '"':
                    raise ParseError("unterminated string literal")
            # Advance line index to find the correct line for this position
            while start >= _next_line_start:
                _cur_li += 1
                _next_line_start = _line_starts[_cur_li + 1] if _cur_li + 1 < _n_lines else 0x7FFFFFFFFFFFFFFF
            tok = _Token_new(_Token_cls)
            tok.lexeme = text
            tok.line = _cur_li + 1
            tok.column = start - _line_starts[_cur_li]
            tok.start = start
            tok.end = end
            tok.expansion_depth = 0
            _append(tok)
        # Update reader state to end-of-source position
        self.line = _n_lines
        self.column = len(source) - _line_starts[_n_lines - 1]
        return result


# ---------------------------------------------------------------------------
# Dictionary / Words
# ---------------------------------------------------------------------------


# Integer opcode constants for hot-path dispatch
OP_WORD = 0
OP_LITERAL = 1
OP_WORD_PTR = 2
OP_FOR_BEGIN = 3
OP_FOR_END = 4
OP_BRANCH_ZERO = 5
OP_JUMP = 6
OP_LABEL = 7
OP_LIST_BEGIN = 8
OP_LIST_END = 9
OP_LIST_LITERAL = 10
OP_OTHER = 11

_OP_STR_TO_INT = {
    "word": OP_WORD,
    "literal": OP_LITERAL,
    "word_ptr": OP_WORD_PTR,
    "for_begin": OP_FOR_BEGIN,
    "for_end": OP_FOR_END,
    "branch_zero": OP_BRANCH_ZERO,
    "jump": OP_JUMP,
    "label": OP_LABEL,
    "list_begin": OP_LIST_BEGIN,
    "list_end": OP_LIST_END,
    "list_literal": OP_LIST_LITERAL,
}


def _is_scalar_literal(node: Op) -> bool:
    return node._opcode == OP_LITERAL and not isinstance(node.data, str)


# Pre-computed peephole optimization data structures (avoids rebuilding per definition)
_PEEPHOLE_WORD_RULES: List[Tuple[Tuple[str, ...], Tuple[str, ...]]] = [
    # --- stack no-ops (cancellation) ---
    (("dup", "drop"), ()),
    (("swap", "swap"), ()),
    (("over", "drop"), ()),
    (("dup", "nip"), ()),
    (("2dup", "2drop"), ()),
    (("2swap", "2swap"), ()),
    (("rot", "rot", "rot"), ()),
    (("rot", "-rot"), ()),
    (("-rot", "rot"), ()),
    (("drop", "drop"), ("2drop",)),
    (("over", "over"), ("2dup",)),
    (("inc", "dec"), ()),
    (("dec", "inc"), ()),
    (("neg", "neg"), ()),
    (("not", "not"), ()),
    (("bitnot", "bitnot"), ()),
    (("bnot", "bnot"), ()),
    (("abs", "abs"), ("abs",)),
    # --- canonicalizations that merge into single ops ---
    (("swap", "drop"), ("nip",)),
    (("swap", "over"), ("tuck",)),
    (("swap", "nip"), ("drop",)),
    (("nip", "drop"), ("2drop",)),
    (("tuck", "drop"), ("swap",)),
    # --- commutative ops: swap before them is a no-op ---
    (("swap", "+"), ("+",)),
    (("swap", "*"), ("*",)),
    (("swap", "=="), ("==",)),
    (("swap", "!="), ("!=",)),
    (("swap", "band"), ("band",)),
    (("swap", "bor"), ("bor",)),
    (("swap", "bxor"), ("bxor",)),
    (("swap", "and"), ("and",)),
    (("swap", "or"), ("or",)),
    (("swap", "min"), ("min",)),
    (("swap", "max"), ("max",)),
    # --- dup + self-idempotent binary -> identity ---
    (("dup", "bor"), ()),
    (("dup", "band"), ()),
    (("dup", "bxor"), ("drop", "literal_0")),
    (("dup", "=="), ("drop", "literal_1")),
    (("dup", "-"), ("drop", "literal_0")),
]

_PEEPHOLE_PLACEHOLDER_RULES: Dict[Tuple[str, ...], Tuple[str, ...]] = {}
_PEEPHOLE_CLEAN_RULES: List[Tuple[Tuple[str, ...], Tuple[str, ...]]] = []
for _pat, _repl in _PEEPHOLE_WORD_RULES:
    if any(r.startswith("literal_") for r in _repl):
        _PEEPHOLE_PLACEHOLDER_RULES[_pat] = _repl
    else:
        _PEEPHOLE_CLEAN_RULES.append((_pat, _repl))

_PEEPHOLE_MAX_PAT_LEN = max(len(p) for p, _ in _PEEPHOLE_WORD_RULES) if _PEEPHOLE_WORD_RULES else 0

# Unified dict: pattern tuple -> replacement tuple (for O(1) lookup)
_PEEPHOLE_ALL_RULES: Dict[Tuple[str, ...], Tuple[str, ...]] = {}
for _pat, _repl in _PEEPHOLE_WORD_RULES:
    _PEEPHOLE_ALL_RULES[_pat] = _repl

# Which first-words have *any* rule (quick skip for non-matching heads)
_PEEPHOLE_FIRST_WORDS: Set[str] = {p[0] for p in _PEEPHOLE_ALL_RULES}

# Length-grouped rules indexed by first word for efficient matching
_PEEPHOLE_RULE_INDEX: Dict[str, List[Tuple[Tuple[str, ...], Tuple[str, ...]]]] = {}
for _pattern, _repl in _PEEPHOLE_CLEAN_RULES:
    _PEEPHOLE_RULE_INDEX.setdefault(_pattern[0], []).append((_pattern, _repl))

_PEEPHOLE_TERMINATORS = frozenset({OP_JUMP})

_PEEPHOLE_CANCEL_PAIRS = frozenset({
    ("not", "not"), ("neg", "neg"),
    ("bitnot", "bitnot"), ("bnot", "bnot"),
    ("inc", "dec"), ("dec", "inc"),
})
_PEEPHOLE_SHIFT_OPS = frozenset({"shl", "shr", "sar"})
_DEFAULT_CONTROL_WORDS = frozenset({"if", "else", "for", "while", "do"})


class Op:
    """Flat operation used for both compile-time execution and emission."""
    __slots__ = ('op', 'data', 'loc', '_word_ref', '_opcode')

    def __init__(self, op: str, data: Any = None, loc: Optional[SourceLocation] = None,
                 _word_ref: Optional[Word] = None, _opcode: int = OP_OTHER) -> None:
        self.op = op
        self.data = data
        self.loc = loc
        self._word_ref = _word_ref
        self._opcode = _OP_STR_TO_INT.get(op, OP_OTHER)


def _make_op(op: str, data: Any = None, loc: Optional[SourceLocation] = None) -> Op:
    """Fast Op constructor that avoids dict lookup for known opcodes."""
    node = Op.__new__(Op)
    node.op = op
    node.data = data
    node.loc = loc
    node._word_ref = None
    node._opcode = _OP_STR_TO_INT.get(op, OP_OTHER)
    return node


def _make_literal_op(data: Any, loc: Optional[SourceLocation] = None) -> Op:
    """Specialized Op constructor for 'literal' ops."""
    node = Op.__new__(Op)
    node.op = "literal"
    node.data = data
    node.loc = loc
    node._word_ref = None
    node._opcode = OP_LITERAL
    return node


def _make_word_op(data: str, loc: Optional[SourceLocation] = None) -> Op:
    """Specialized Op constructor for 'word' ops."""
    node = Op.__new__(Op)
    node.op = "word"
    node.data = data
    node.loc = loc
    node._word_ref = None
    node._opcode = OP_WORD
    return node


class Definition:
    __slots__ = ('name', 'body', 'immediate', 'compile_only', 'terminator', 'inline',
                 'stack_inputs', '_label_positions', '_for_pairs', '_begin_pairs',
                 '_words_resolved', '_merged_runs')

    def __init__(self, name: str, body: List[Op], immediate: bool = False,
                 compile_only: bool = False, terminator: str = "end", inline: bool = False,
                 stack_inputs: Optional[int] = None) -> None:
        self.name = name
        self.body = body
        self.immediate = immediate
        self.compile_only = compile_only
        self.terminator = terminator
        self.inline = inline
        self.stack_inputs = stack_inputs
        self._label_positions = None
        self._for_pairs = None
        self._begin_pairs = None
        self._words_resolved = False
        self._merged_runs = None


class AsmDefinition:
    __slots__ = ('name', 'body', 'immediate', 'compile_only', 'inline', 'effects', '_inline_lines')

    def __init__(self, name: str, body: str, immediate: bool = False,
                 compile_only: bool = False, inline: bool = False,
                 effects: Set[str] = None, _inline_lines: Optional[List[str]] = None) -> None:
        self.name = name
        self.body = body
        self.immediate = immediate
        self.compile_only = compile_only
        self.inline = inline
        self.effects = effects if effects is not None else set()
        self._inline_lines = _inline_lines


class Module:
    __slots__ = ('forms', 'variables', 'prelude', 'bss', 'cstruct_layouts')

    def __init__(self, forms: List[Any], variables: Dict[str, str] = None,
                 prelude: Optional[List[str]] = None, bss: Optional[List[str]] = None,
                 cstruct_layouts: Dict[str, CStructLayout] = None) -> None:
        self.forms = forms
        self.variables = variables if variables is not None else {}
        self.prelude = prelude
        self.bss = bss
        self.cstruct_layouts = cstruct_layouts if cstruct_layouts is not None else {}


class MacroDefinition:
    __slots__ = ('name', 'tokens', 'param_count')

    def __init__(self, name: str, tokens: List[str], param_count: int = 0) -> None:
        self.name = name
        self.tokens = tokens
        self.param_count = param_count


class StructField:
    __slots__ = ('name', 'offset', 'size')

    def __init__(self, name: str, offset: int, size: int) -> None:
        self.name = name
        self.offset = offset
        self.size = size


class CStructField:
    __slots__ = ('name', 'type_name', 'offset', 'size', 'align')

    def __init__(self, name: str, type_name: str, offset: int, size: int, align: int) -> None:
        self.name = name
        self.type_name = type_name
        self.offset = offset
        self.size = size
        self.align = align


class CStructLayout:
    __slots__ = ('name', 'size', 'align', 'fields')

    def __init__(self, name: str, size: int, align: int, fields: List[CStructField]) -> None:
        self.name = name
        self.size = size
        self.align = align
        self.fields = fields


class MacroContext:
    """Small facade exposed to Python-defined macros."""

    def __init__(self, parser: "Parser") -> None:
        self._parser = parser

    @property
    def parser(self) -> "Parser":
        return self._parser

    def next_token(self) -> Token:
        return self._parser.next_token()

    def peek_token(self) -> Optional[Token]:
        return self._parser.peek_token()

    def emit_literal(self, value: int) -> None:
        self._parser.emit_node(_make_literal_op(value))

    def emit_word(self, name: str) -> None:
        self._parser.emit_node(_make_word_op(name))

    def emit_node(self, node: Op) -> None:
        self._parser.emit_node(node)

    def inject_tokens(self, tokens: Sequence[str], template: Optional[Token] = None) -> None:
        if template is None:
            template = Token(lexeme="", line=0, column=0, start=0, end=0)
        generated = [
            Token(
                lexeme=lex,
                line=template.line,
                column=template.column,
                start=template.start,
                end=template.end,
            )
            for lex in tokens
        ]
        self.inject_token_objects(generated)

    def inject_token_objects(self, tokens: Sequence[Token]) -> None:
        self._parser.tokens[self._parser.pos:self._parser.pos] = list(tokens)

    def set_token_hook(self, handler: Optional[str]) -> None:
        self._parser.token_hook = handler

    def new_label(self, prefix: str) -> str:
        return self._parser._new_label(prefix)

    def most_recent_definition(self) -> Optional[Word]:
        return self._parser.most_recent_definition()


# Type aliases (only evaluated under TYPE_CHECKING)
MacroHandler = None  # Callable[[MacroContext], Optional[List[Op]]]
IntrinsicEmitter = None  # Callable[["FunctionEmitter"], None]


# Word effects ---------------------------------------------------------------


WORD_EFFECT_STRING_IO = "string-io"
_WORD_EFFECT_ALIASES: Dict[str, str] = {
    "string": WORD_EFFECT_STRING_IO,
    "strings": WORD_EFFECT_STRING_IO,
    "string-io": WORD_EFFECT_STRING_IO,
    "string_io": WORD_EFFECT_STRING_IO,
    "strings-io": WORD_EFFECT_STRING_IO,
    "strings_io": WORD_EFFECT_STRING_IO,
}


class Word:
    __slots__ = ('name', 'priority', 'immediate', 'definition', 'macro', 'intrinsic',
                 'macro_expansion', 'macro_params', 'compile_time_intrinsic',
                 'runtime_intrinsic', 'compile_only', 'compile_time_override',
                 'is_extern', 'extern_inputs', 'extern_outputs', 'extern_signature',
                 'extern_variadic', 'inline')

    def __init__(self, name: str, priority: int = 0, immediate: bool = False,
                 definition=None, macro=None, intrinsic=None,
                 macro_expansion=None, macro_params: int = 0,
                 compile_time_intrinsic=None, runtime_intrinsic=None,
                 compile_only: bool = False, compile_time_override: bool = False,
                 is_extern: bool = False, extern_inputs: int = 0, extern_outputs: int = 0,
                 extern_signature=None, extern_variadic: bool = False,
                 inline: bool = False) -> None:
        self.name = name
        self.priority = priority
        self.immediate = immediate
        self.definition = definition
        self.macro = macro
        self.intrinsic = intrinsic
        self.macro_expansion = macro_expansion
        self.macro_params = macro_params
        self.compile_time_intrinsic = compile_time_intrinsic
        self.runtime_intrinsic = runtime_intrinsic
        self.compile_only = compile_only
        self.compile_time_override = compile_time_override
        self.is_extern = is_extern
        self.extern_inputs = extern_inputs
        self.extern_outputs = extern_outputs
        self.extern_signature = extern_signature
        self.extern_variadic = extern_variadic
        self.inline = inline


_suppress_redefine_warnings = False


def _suppress_redefine_warnings_set(value: bool) -> None:
    global _suppress_redefine_warnings
    _suppress_redefine_warnings = value


class Dictionary:
    __slots__ = ('words', 'warn_callback')

    def __init__(self, words: Dict[str, Word] = None) -> None:
        self.words = words if words is not None else {}
        self.warn_callback: Optional[Callable] = None

    def register(self, word: Word) -> Word:
        existing = self.words.get(word.name)
        if existing is None:
            self.words[word.name] = word
            return word

        # Preserve existing intrinsic handlers unless explicitly replaced.
        if word.runtime_intrinsic is None and existing.runtime_intrinsic is not None:
            word.runtime_intrinsic = existing.runtime_intrinsic
        if word.compile_time_intrinsic is None and existing.compile_time_intrinsic is not None:
            word.compile_time_intrinsic = existing.compile_time_intrinsic

        if word.priority > existing.priority:
            self.words[word.name] = word
            sys.stderr.write(
                f"[note] word {word.name}: using priority {word.priority} over {existing.priority}\n"
            )
            return word

        if word.priority < existing.priority:
            sys.stderr.write(
                f"[note] word {word.name}: keeping priority {existing.priority}, ignored {word.priority}\n"
            )
            return existing

        # Same priority: allow replacing placeholder bootstrap words silently.
        if existing.definition is None and word.definition is not None:
            self.words[word.name] = word
            return word

        if not _suppress_redefine_warnings:
            if self.warn_callback is not None:
                self.warn_callback(word.name, word.priority)
            else:
                sys.stderr.write(f"[warn] redefining word {word.name} (priority {word.priority})\n")
        self.words[word.name] = word
        return word

    def lookup(self, name: str) -> Optional[Word]:
        return self.words.get(name)


# ---------------------------------------------------------------------------
# Parser
# ---------------------------------------------------------------------------


Context = None  # Union[Module, Definition] - only used in annotations


class Parser:
    EXTERN_DEFAULT_PRIORITY = 1

    def __init__(
        self,
        dictionary: Dictionary,
        reader: Optional[Reader] = None,
        *,
        macro_expansion_limit: int = DEFAULT_MACRO_EXPANSION_LIMIT,
    ) -> None:
        if macro_expansion_limit < 1:
            raise ValueError("macro_expansion_limit must be >= 1")
        self.dictionary = dictionary
        self.reader = reader or Reader()
        self.macro_expansion_limit = macro_expansion_limit
        self.tokens: List[Token] = []
        self._token_iter: Optional[Iterable[Token]] = None
        self._token_iter_exhausted = True
        self.pos = 0
        self.context_stack: List[Context] = []
        self.definition_stack: List[Tuple[Word, bool]] = []
        self.last_defined: Optional[Word] = None
        self.source: str = ""
        self.macro_recording: Optional[MacroDefinition] = None
        self.control_stack: List[Dict[str, str]] = []
        self.block_openers: Set[str] = {"word", "with", "for", "while", "begin"}
        self.control_overrides: Set[str] = set()
        self._warned_control_overrides: Set[str] = set()
        self.label_counter = 0
        self.token_hook: Optional[str] = None
        self._last_token: Optional[Token] = None
        self.variable_labels: Dict[str, str] = {}
        self.variable_words: Dict[str, str] = {}
        self.file_spans: List[FileSpan] = []
        self._span_starts: List[int] = []
        self._span_index_len: int = 0
        self._span_cache_idx: int = -1
        self.compile_time_vm = CompileTimeVM(self)
        self.custom_prelude: Optional[List[str]] = None
        self.custom_bss: Optional[List[str]] = None
        self.cstruct_layouts: Dict[str, CStructLayout] = {}
        self._pending_inline_definition: bool = False
        self._pending_priority: Optional[int] = None
        self.diagnostics: List[Diagnostic] = []
        self._max_errors: int = 20
        self._warnings_enabled: Set[str] = set()
        self._werror: bool = False

    def _rebuild_span_index(self) -> None:
        """Rebuild bisect index after file_spans changes."""
        self._span_starts: List[int] = [s.start_line for s in self.file_spans]
        self._span_index_len: int = len(self.file_spans)

    def location_for_token(self, token: Token) -> SourceLocation:
        spans = self.file_spans
        if not spans:
            return _make_loc(_SOURCE_PATH, token.line, token.column)
        if self._span_index_len != len(spans):
            self._rebuild_span_index()
            self._span_cache_idx = -1
        tl = token.line
        # Fast path: check cached span first (common for sequential access)
        ci = self._span_cache_idx
        if ci >= 0:
            span = spans[ci]
            if span.start_line <= tl < span.end_line:
                return _make_loc(span.path, span.local_start_line + (tl - span.start_line), token.column)
        span_starts = self._span_starts
        idx = bisect.bisect_right(span_starts, tl) - 1
        if idx >= 0:
            span = spans[idx]
            if tl < span.end_line:
                self._span_cache_idx = idx
                return _make_loc(span.path, span.local_start_line + (tl - span.start_line), token.column)
        return _make_loc(_SOURCE_PATH, tl, token.column)

    def _record_diagnostic(self, token: Optional[Token], message: str, *, level: str = "error", hint: str = "", suggestion: str = "") -> None:
        """Record a diagnostic and raise ParseError if too many errors."""
        loc = self.location_for_token(token) if token else _make_loc(_SOURCE_PATH, 0, 0)
        diag = Diagnostic(
            level=level, message=message,
            path=loc.path, line=loc.line, column=loc.column,
            length=len(token.lexeme) if token else 0,
            hint=hint, suggestion=suggestion,
        )
        self.diagnostics.append(diag)
        if level == "error" and sum(1 for d in self.diagnostics if d.level == "error") >= self._max_errors:
            raise ParseError(f"too many errors ({self._max_errors}), aborting", diagnostic=diag)

    def _warn(self, token: Optional[Token], category: str, message: str, *, hint: str = "", suggestion: str = "") -> None:
        """Record a warning if the category is enabled. Promotes to error under --Werror."""
        if "all" not in self._warnings_enabled and category not in self._warnings_enabled:
            return
        level = "error" if self._werror else "warning"
        self._record_diagnostic(token, message, level=level, hint=hint, suggestion=suggestion)

    def _skip_to_recovery_point(self) -> None:
        """Skip tokens until we reach a safe recovery point (end, ;, or top-level definition keyword)."""
        _recovery_keywords = {"word", "end", ";", ":asm", ":py", "extern", "macro"}
        depth = 0
        while self.pos < len(self.tokens):
            lex = self.tokens[self.pos].lexeme
            if lex == "word" or lex == ":asm" or lex == ":py":
                if depth == 0:
                    break  # Don't consume — let the main loop pick it up
                depth += 1
            elif lex == "end":
                if depth <= 1:
                    self.pos += 1
                    break
                depth -= 1
            elif lex == ";":
                self.pos += 1
                break
            elif lex == "extern" and depth == 0:
                break
            self.pos += 1
        # Reset state for recovery
        self.macro_recording = None
        self._pending_priority = None
        self._pending_inline_definition = False
        while self.definition_stack:
            self.definition_stack.pop()
        while len(self.context_stack) > 1:
            self.context_stack.pop()
        self.control_stack.clear()

    def inject_token_objects(self, tokens: Sequence[Token]) -> None:
        """Insert tokens at the current parse position."""
        self.tokens[self.pos:self.pos] = list(tokens)

    # Public helpers for macros ------------------------------------------------
    def next_token(self) -> Token:
        return self._consume()

    def peek_token(self) -> Optional[Token]:
        return None if self.pos >= len(self.tokens) else self.tokens[self.pos]

    def emit_node(self, node: Op) -> None:
        self._append_op(node)

    def most_recent_definition(self) -> Optional[Word]:
        return self.last_defined

    def allocate_variable(self, name: str) -> Tuple[str, str]:
        if name in self.variable_labels:
            label = self.variable_labels[name]
        else:
            base = sanitize_label(f"var_{name}")
            label = base
            suffix = 0
            existing = set(self.variable_labels.values())
            while label in existing:
                suffix += 1
                label = f"{base}_{suffix}"
            self.variable_labels[name] = label
        hidden_word = f"__with_{name}"
        self.variable_words[name] = hidden_word
        if self.dictionary.lookup(hidden_word) is None:
            word = Word(name=hidden_word)

            def _intrinsic(builder: FunctionEmitter, target: str = label) -> None:
                builder.push_label(target)

            word.intrinsic = _intrinsic

            # CT intrinsic: allocate a qword in CTMemory for this variable.
            # The address is lazily created on first use and cached.
            _ct_var_addrs: Dict[str, int] = {}

            def _ct_intrinsic(vm: CompileTimeVM, var_name: str = name) -> None:
                if var_name not in _ct_var_addrs:
                    _ct_var_addrs[var_name] = vm.memory.allocate(8)
                vm.push(_ct_var_addrs[var_name])

            word.compile_time_intrinsic = _ct_intrinsic
            word.runtime_intrinsic = _ct_intrinsic
            self.dictionary.register(word)
        return label, hidden_word

    def _handle_end_control(self) -> None:
        """Close one generic control frame pushed by compile-time words."""
        if not self.control_stack:
            raise ParseError("unexpected 'end' without matching block")

        entry = self.control_stack.pop()
        if not isinstance(entry, dict):
            raise ParseError("invalid control frame")

        close_ops = entry.get("close_ops")
        if close_ops is None:
            return
        if not isinstance(close_ops, list):
            raise ParseError("control frame field 'close_ops' must be a list")

        for spec in close_ops:
            op_name: Optional[str] = None
            data: Any = None
            if isinstance(spec, dict):
                candidate = spec.get("op")
                if isinstance(candidate, str):
                    op_name = candidate
                if "data" in spec:
                    data = spec["data"]
            elif isinstance(spec, (list, tuple)):
                if not spec:
                    raise ParseError("close_ops contains empty sequence")
                if isinstance(spec[0], str):
                    op_name = spec[0]
                data = spec[1] if len(spec) > 1 else None
            elif isinstance(spec, str):
                op_name = spec
            else:
                raise ParseError(f"invalid close op descriptor: {spec!r}")

            if not op_name:
                raise ParseError(f"close op missing valid 'op' name: {spec!r}")
            self._append_op(_make_op(op_name, data))

    # Parsing ------------------------------------------------------------------
    def parse(self, tokens: Iterable[Token], source: str) -> Module:
        self.tokens = tokens if isinstance(tokens, list) else list(tokens)
        self._token_iter = None
        self._token_iter_exhausted = True
        self.source = source
        self.pos = 0
        self.variable_labels = {}
        self.variable_words = {}
        self.cstruct_layouts = {}
        self.context_stack = [
            Module(
                forms=[],
                variables=self.variable_labels,
                cstruct_layouts=self.cstruct_layouts,
            )
        ]
        self.definition_stack.clear()
        self.last_defined = None
        self.control_stack = []
        self.label_counter = 0
        self.token_hook = None
        self._last_token = None
        self.custom_prelude = None
        self.custom_bss = None
        self._pending_inline_definition = False
        self._pending_priority = None

        _priority_keywords = {
            "word", ":asm", ":py", "extern", "inline", "priority",
        }

        # Sentinel values for dispatch actions
        _KW_LIST_BEGIN = 1
        _KW_LIST_END = 2
        _KW_WORD = 3
        _KW_END = 4
        _KW_ASM = 5
        _KW_PY = 6
        _KW_EXTERN = 7
        _KW_PRIORITY = 8
        _keyword_dispatch = {
            "[": _KW_LIST_BEGIN, "]": _KW_LIST_END, "word": _KW_WORD,
            "end": _KW_END, ":asm": _KW_ASM, ":py": _KW_PY,
            "extern": _KW_EXTERN, "priority": _KW_PRIORITY,
        }
        _kw_get = _keyword_dispatch.get

        _tokens = self.tokens
        try:
            while self.pos < len(_tokens):
              try:
                token = _tokens[self.pos]
                self.pos += 1
                self._last_token = token
                if self.token_hook and self._run_token_hook(token):
                    continue
                if self.macro_recording is not None:
                    if token.lexeme == ";":
                        self._finish_macro_recording(token)
                    else:
                        self.macro_recording.tokens.append(token.lexeme)
                    continue
                lexeme = token.lexeme
                if self._pending_priority is not None and lexeme not in _priority_keywords:
                    raise ParseError(
                        f"priority {self._pending_priority} must be followed by definition/extern"
                    )
                kw = _kw_get(lexeme)
                if kw is not None:
                    if kw == _KW_LIST_BEGIN:
                        self._handle_list_begin()
                    elif kw == _KW_LIST_END:
                        self._handle_list_end(token)
                    elif kw == _KW_WORD:
                        inline_def = self._consume_pending_inline()
                        self._begin_definition(token, terminator="end", inline=inline_def)
                    elif kw == _KW_END:
                        if self.control_stack:
                            self._handle_end_control()
                        elif self._try_end_definition(token):
                            pass
                        else:
                            raise ParseError(f"unexpected 'end' at {token.line}:{token.column}")
                    elif kw == _KW_ASM:
                        self._parse_asm_definition(token)
                        _tokens = self.tokens
                    elif kw == _KW_PY:
                        self._parse_py_definition(token)
                        _tokens = self.tokens
                    elif kw == _KW_EXTERN:
                        self._parse_extern(token)
                    elif kw == _KW_PRIORITY:
                        self._parse_priority_directive(token)
                    continue
                if self._try_handle_builtin_control(token):
                    continue
                if self._handle_token(token):
                    _tokens = self.tokens
              except CompileTimeError:
                raise
              except ParseError as _recov_exc:
                self._record_diagnostic(self._last_token, str(_recov_exc))
                self._skip_to_recovery_point()
                _tokens = self.tokens
                continue
        except CompileTimeError:
            raise
        except ParseError:
            raise
        except Exception as exc:
            tok = self._last_token
            if tok is None:
                raise ParseError(f"unexpected error during parse: {exc}") from None
            raise ParseError(
                f"unexpected error near '{tok.lexeme}' at {tok.line}:{tok.column}: {exc}"
            ) from None

        if self.macro_recording is not None:
            self._record_diagnostic(self._last_token, "unterminated macro definition (missing ';')")
        if self._pending_priority is not None:
            self._record_diagnostic(self._last_token, f"dangling priority {self._pending_priority} without following definition")

        if len(self.context_stack) != 1:
            self._record_diagnostic(self._last_token, "unclosed definition at EOF")
        if self.control_stack:
            self._record_diagnostic(self._last_token, "unclosed control structure at EOF")

        # If any errors were accumulated, raise with all diagnostics
        error_count = sum(1 for d in self.diagnostics if d.level == "error")
        if error_count > 0:
            raise ParseError(f"compilation failed with {error_count} error(s)")

        module = self.context_stack.pop()
        if not isinstance(module, Module):  # pragma: no cover - defensive
            raise ParseError("internal parser state corrupt")
        module.variables = dict(self.variable_labels)
        module.prelude = self.custom_prelude
        module.bss = self.custom_bss
        module.cstruct_layouts = dict(self.cstruct_layouts)
        return module

    def _handle_list_begin(self) -> None:
        label = self._new_label("list")
        self._append_op(_make_op("list_begin", label))
        self._push_control({"type": "list", "label": label})

    def _handle_list_end(self, token: Token) -> None:
        entry = self._pop_control(("list",))
        label = entry["label"]
        self._append_op(_make_op("list_end", label))

    def _should_use_custom_control(self, lexeme: str) -> bool:
        # Fast path: default parser controls unless explicitly overridden.
        if lexeme not in self.control_overrides:
            return False
        word = self.dictionary.lookup(lexeme)
        if word is None:
            return False
        return bool(word.immediate)

    def _warn_control_override(self, token: Token, lexeme: str) -> None:
        if lexeme in self._warned_control_overrides:
            return
        self._warned_control_overrides.add(lexeme)
        sys.stderr.write(
            f"[warn] default control structure ({lexeme}) has been overridden; using custom implementation\n"
        )

    def _try_handle_builtin_control(self, token: Token) -> bool:
        lexeme = token.lexeme
        if lexeme not in _DEFAULT_CONTROL_WORDS:
            return False
        if self._should_use_custom_control(lexeme):
            self._warn_control_override(token, lexeme)
            return False
        if lexeme == "if":
            self._handle_builtin_if(token)
            return True
        if lexeme == "else":
            self._handle_builtin_else(token)
            return True
        if lexeme == "for":
            self._handle_builtin_for(token)
            return True
        if lexeme == "while":
            self._handle_builtin_while(token)
            return True
        if lexeme == "do":
            self._handle_builtin_do(token)
            return True
        return False

    def _handle_builtin_if(self, token: Token) -> None:
        # Support shorthand `else <cond> if` by sharing the previous else-end label.
        if self.control_stack:
            top = self.control_stack[-1]
            if (
                top.get("type") == "else"
                and isinstance(top.get("line"), int)
                and top["line"] == token.line
            ):
                prev_else = self._pop_control(("else",))
                shared_end = prev_else.get("end")
                if not isinstance(shared_end, str):
                    shared_end = self._new_label("if_end")
                false_label = self._new_label("if_false")
                self._append_op(_make_op("branch_zero", false_label))
                self._push_control(
                    {
                        "type": "if",
                        "false": false_label,
                        "end": shared_end,
                        "close_ops": [
                            {"op": "label", "data": false_label},
                            {"op": "label", "data": shared_end},
                        ],
                        "line": token.line,
                        "column": token.column,
                    }
                )
                return

        false_label = self._new_label("if_false")
        self._append_op(_make_op("branch_zero", false_label))
        self._push_control(
            {
                "type": "if",
                "false": false_label,
                "end": None,
                "close_ops": [{"op": "label", "data": false_label}],
                "line": token.line,
                "column": token.column,
            }
        )

    def _handle_builtin_else(self, token: Token) -> None:
        entry = self._pop_control(("if",))
        false_label = entry.get("false")
        if not isinstance(false_label, str):
            raise ParseError("invalid if control frame")
        end_label = entry.get("end")
        if not isinstance(end_label, str):
            end_label = self._new_label("if_end")
        self._append_op(_make_op("jump", end_label))
        self._append_op(_make_op("label", false_label))
        self._push_control(
            {
                "type": "else",
                "end": end_label,
                "close_ops": [{"op": "label", "data": end_label}],
                "line": token.line,
                "column": token.column,
            }
        )

    def _handle_builtin_for(self, token: Token) -> None:
        loop_label = self._new_label("for_loop")
        end_label = self._new_label("for_end")
        frame = {"loop": loop_label, "end": end_label}
        self._append_op(_make_op("for_begin", dict(frame)))
        self._push_control(
            {
                "type": "for",
                "loop": loop_label,
                "end": end_label,
                "close_ops": [{"op": "for_end", "data": dict(frame)}],
                "line": token.line,
                "column": token.column,
            }
        )

    def _handle_builtin_while(self, token: Token) -> None:
        begin_label = self._new_label("begin")
        end_label = self._new_label("end")
        self._append_op(_make_op("label", begin_label))
        self._push_control(
            {
                "type": "while_open",
                "begin": begin_label,
                "end": end_label,
                "line": token.line,
                "column": token.column,
            }
        )

    def _handle_builtin_do(self, token: Token) -> None:
        entry = self._pop_control(("while_open",))
        begin_label = entry.get("begin")
        end_label = entry.get("end")
        if not isinstance(begin_label, str) or not isinstance(end_label, str):
            raise ParseError("invalid while control frame")
        self._append_op(_make_op("branch_zero", end_label))
        self._push_control(
            {
                "type": "while",
                "begin": begin_label,
                "end": end_label,
                "close_ops": [
                    {"op": "jump", "data": begin_label},
                    {"op": "label", "data": end_label},
                ],
                "line": token.line,
                "column": token.column,
            }
        )

    def _parse_priority_directive(self, token: Token) -> None:
        if self._eof():
            raise ParseError(f"priority value missing at {token.line}:{token.column}")
        value_tok = self._consume()
        try:
            value = int(value_tok.lexeme, 0)
        except ValueError:
            raise ParseError(
                f"invalid priority '{value_tok.lexeme}' at {value_tok.line}:{value_tok.column}"
            )
        self._pending_priority = value

    def _consume_pending_priority(self, *, default: int = 0) -> int:
        if self._pending_priority is None:
            return default
        value = self._pending_priority
        self._pending_priority = None
        return value

    # Internal helpers ---------------------------------------------------------

    def _parse_extern(self, token: Token) -> None:
        # extern <name> [inputs outputs]
        # OR
        # extern <ret_type> <name>(<args>)

        if self._eof():
            raise ParseError(f"extern missing name at {token.line}:{token.column}")

        priority = self._consume_pending_priority(default=self.EXTERN_DEFAULT_PRIORITY)
        first_token = self._consume()
        if self._try_parse_c_extern(first_token, priority=priority):
            return
        self._parse_legacy_extern(first_token, priority=priority)

    def _parse_legacy_extern(self, name_token: Token, *, priority: int = 0) -> None:
        name = name_token.lexeme
        candidate = Word(name=name, priority=priority)
        word = self.dictionary.register(candidate)
        if word is not candidate:
            return
        word.is_extern = True

        peek = self.peek_token()
        if peek is not None and peek.lexeme.isdigit():
            word.extern_inputs = int(self._consume().lexeme)
            peek = self.peek_token()
            if peek is not None and peek.lexeme.isdigit():
                word.extern_outputs = int(self._consume().lexeme)
            else:
                word.extern_outputs = 0
        else:
            word.extern_inputs = 0
            word.extern_outputs = 0

    def _try_parse_c_extern(self, first_token: Token, *, priority: int = 0) -> bool:
        saved_pos = self.pos
        prefix_tokens: List[str] = [first_token.lexeme]

        while True:
            if self._eof():
                self.pos = saved_pos
                return False
            lookahead = self._consume()
            if lookahead.lexeme == "(":
                break
            if lookahead.lexeme.isdigit():
                self.pos = saved_pos
                return False
            prefix_tokens.append(lookahead.lexeme)

        if not prefix_tokens:
            raise ParseError("extern missing return type/name before '('")

        name_lexeme = prefix_tokens.pop()
        if not _is_identifier(name_lexeme):
            prefix_name, suffix_name = _split_trailing_identifier(name_lexeme)
            if suffix_name is None:
                raise ParseError(f"extern expected identifier before '(' but got '{name_lexeme}'")
            name_lexeme = suffix_name
            if prefix_name:
                prefix_tokens.append(prefix_name)

        if not _is_identifier(name_lexeme):
            raise ParseError(f"extern expected identifier before '(' but got '{name_lexeme}'")

        ret_type = _normalize_c_type_tokens(prefix_tokens, allow_default=True)
        inputs, arg_types, variadic = self._parse_c_param_list()
        outputs = 0 if ret_type == "void" else 1
        self._register_c_extern(name_lexeme, inputs, outputs, arg_types, ret_type,
                                priority=priority, variadic=variadic)
        return True

    def _parse_c_param_list(self) -> Tuple[int, List[str], bool]:
        """Parse C-style parameter list. Returns (count, types, is_variadic)."""
        inputs = 0
        arg_types: List[str] = []
        variadic = False

        if self._eof():
            raise ParseError("extern unclosed '('")
        peek = self.peek_token()
        if peek.lexeme == ")":
            self._consume()
            return inputs, arg_types, False

        while True:
            # Check for ... (variadic)
            peek = self.peek_token()
            if peek is not None and peek.lexeme == "...":
                self._consume()
                variadic = True
                if self._eof():
                    raise ParseError("extern unclosed '(' after '...'")
                closing = self._consume()
                if closing.lexeme != ")":
                    raise ParseError("expected ')' after '...' in extern parameter list")
                break
            lexemes = self._collect_c_param_lexemes()
            arg_type = _normalize_c_type_tokens(lexemes, allow_default=False)
            if arg_type == "void" and inputs == 0:
                if self._eof():
                    raise ParseError("extern unclosed '(' after 'void'")
                closing = self._consume()
                if closing.lexeme != ")":
                    raise ParseError("expected ')' after 'void' in extern parameter list")
                return 0, [], False
            inputs += 1
            arg_types.append(arg_type)
            if self._eof():
                raise ParseError("extern unclosed '('")
            separator = self._consume()
            if separator.lexeme == ")":
                break
            if separator.lexeme != ",":
                raise ParseError(
                    f"expected ',' or ')' in extern parameter list, got '{separator.lexeme}'"
                )
        return inputs, arg_types, variadic

    def _collect_c_param_lexemes(self) -> List[str]:
        lexemes: List[str] = []
        while True:
            if self._eof():
                raise ParseError("extern unclosed '('")
            peek = self.peek_token()
            if peek.lexeme in (",", ")"):
                break
            lexemes.append(self._consume().lexeme)

        if not lexemes:
            raise ParseError("missing parameter type in extern declaration")

        if len(lexemes) > 1 and _is_identifier(lexemes[-1]):
            lexemes.pop()
            return lexemes

        prefix, suffix = _split_trailing_identifier(lexemes[-1])
        if suffix is not None:
            if prefix:
                lexemes[-1] = prefix
            else:
                lexemes.pop()
        return lexemes

    def _register_c_extern(
        self,
        name: str,
        inputs: int,
        outputs: int,
        arg_types: List[str],
        ret_type: str,
        *,
        priority: int = 0,
        variadic: bool = False,
    ) -> None:
        candidate = Word(name=name, priority=priority)
        word = self.dictionary.register(candidate)
        if word is not candidate:
            return
        word.is_extern = True
        word.extern_inputs = inputs
        word.extern_outputs = outputs
        word.extern_signature = (arg_types, ret_type)
        word.extern_variadic = variadic

    def _handle_token(self, token: Token) -> bool:
        """Handle a token. Returns True if the token list was modified (macro expansion)."""
        lexeme = token.lexeme
        first = lexeme[0]
        # Fast-path: inline integer literal parse (most common literal type)
        if first.isdigit() or first == '-' or first == '+':
            try:
                value = int(lexeme, 0)
                self._append_op(_make_literal_op(value))
                return False
            except ValueError:
                pass
            # Fall through to float/string check
            if self._try_literal(token):
                return False
        elif first == '"' or first == '.':
            if self._try_literal(token):
                return False

        if first == '&':
            target_name = lexeme[1:]
            if not target_name:
                raise ParseError(f"missing word name after '&' at {token.line}:{token.column}")
            self._append_op(_make_op("word_ptr", target_name))
            return False

        word = self.dictionary.words.get(lexeme)
        if word is not None:
            if word.macro_expansion is not None:
                args = self._collect_macro_args(word.macro_params)
                self._inject_macro_tokens(word, token, args)
                return True
            if word.immediate:
                if word.macro:
                    produced = word.macro(MacroContext(self))
                    if produced:
                        for node in produced:
                            self._append_op(node)
                else:
                    self._execute_immediate_word(word)
                return False

        self._append_op(_make_word_op(lexeme))
        return False

    def _execute_immediate_word(self, word: Word) -> None:
        try:
            self.compile_time_vm.invoke(word)
        except CompileTimeError:
            raise
        except ParseError:
            raise
        except Exception as exc:  # pragma: no cover - defensive
            raise CompileTimeError(f"compile-time word '{word.name}' failed: {exc}") from None

    def _handle_macro_recording(self, token: Token) -> bool:
        if self.macro_recording is None:
            return False
        if token.lexeme == ";":
            self._finish_macro_recording(token)
        else:
            self.macro_recording.tokens.append(token.lexeme)
        return True

    def _maybe_expand_macro(self, token: Token) -> bool:
        word = self.dictionary.lookup(token.lexeme)
        if word and word.macro_expansion is not None:
            args = self._collect_macro_args(word.macro_params)
            self._inject_macro_tokens(word, token, args)
            return True
        return False

    def _inject_macro_tokens(self, word: Word, token: Token, args: List[str]) -> None:
        next_depth = token.expansion_depth + 1
        if next_depth > self.macro_expansion_limit:
            raise ParseError(
                f"macro expansion depth limit ({self.macro_expansion_limit}) exceeded while expanding '{word.name}'"
            )
        replaced: List[str] = []
        for lex in word.macro_expansion or []:
            if lex.startswith("$"):
                idx = int(lex[1:])
                if idx < 0 or idx >= len(args):
                    raise ParseError(f"macro {word.name} missing argument for {lex}")
                replaced.append(args[idx])
            else:
                replaced.append(lex)
        insertion = [
            Token(
                lexeme=lex,
                line=token.line,
                column=token.column,
                start=token.start,
                end=token.end,
                expansion_depth=next_depth,
            )
            for lex in replaced
        ]
        self.tokens[self.pos:self.pos] = insertion

    def _collect_macro_args(self, count: int) -> List[str]:
        args: List[str] = []
        for _ in range(count):
            if self._eof():
                raise ParseError("macro invocation missing arguments")
            args.append(self._consume().lexeme)
        return args

    def _start_macro_recording(self, name: str, param_count: int) -> None:
        if self.macro_recording is not None:
            raise ParseError("nested macro definitions are not supported")
        self.macro_recording = MacroDefinition(name=name, tokens=[], param_count=param_count)

    def _finish_macro_recording(self, token: Token) -> None:
        if self.macro_recording is None:
            raise ParseError(f"unexpected ';' closing a macro at {token.line}:{token.column}")
        macro_def = self.macro_recording
        self.macro_recording = None
        word = Word(name=macro_def.name)
        word.macro_expansion = list(macro_def.tokens)
        word.macro_params = macro_def.param_count
        self.dictionary.register(word)

    def _push_control(self, entry: Dict[str, str]) -> None:
        if "line" not in entry or "column" not in entry:
            tok = self._last_token
            if tok is not None:
                entry = dict(entry)
                entry["line"] = tok.line
                entry["column"] = tok.column
        self.control_stack.append(entry)

    def _pop_control(self, expected: Tuple[str, ...]) -> Dict[str, str]:
        if not self.control_stack:
            raise ParseError("control stack underflow")
        entry = self.control_stack.pop()
        if entry.get("type") not in expected:
            tok = self._last_token
            location = ""
            if tok is not None:
                location = f" at {tok.line}:{tok.column} near '{tok.lexeme}'"
            origin = ""
            if "line" in entry and "column" in entry:
                origin = f" (opened at {entry['line']}:{entry['column']})"
            raise ParseError(f"mismatched control word '{entry.get('type')}'" + origin + location)
        return entry

    def _new_label(self, prefix: str) -> str:
        label = f"L_{prefix}_{self.label_counter}"
        self.label_counter += 1
        return label

    def _run_token_hook(self, token: Token) -> bool:
        if not self.token_hook:
            return False
        hook_word = self.dictionary.lookup(self.token_hook)
        if hook_word is None:
            raise ParseError(f"token hook '{self.token_hook}' not defined")
        self.compile_time_vm.invoke_with_args(hook_word, [token])
        # Convention: hook leaves handled flag on stack (int truthy means consumed)
        handled = self.compile_time_vm.pop()
        return bool(handled)

    def _try_end_definition(self, token: Token) -> bool:
        if len(self.context_stack) <= 1:
            return False
        ctx = self.context_stack[-1]
        if not isinstance(ctx, Definition):
            return False
        if ctx.terminator != token.lexeme:
            return False
        self._end_definition(token)
        return True

    def _consume_pending_inline(self) -> bool:
        pending = self._pending_inline_definition
        self._pending_inline_definition = False
        return pending

    def _begin_definition(self, token: Token, terminator: str = "end", inline: bool = False) -> None:
        if self._eof():
            raise ParseError(
                f"definition name missing after '{token.lexeme}' at {token.line}:{token.column}"
            )
        name_token = self._consume()
        priority = self._consume_pending_priority()
        definition = Definition(
            name=name_token.lexeme,
            body=[],
            terminator=terminator,
            inline=inline,
            stack_inputs=_parse_stack_effect_comment(self.source, token.start),
        )
        self.context_stack.append(definition)
        candidate = Word(name=definition.name, priority=priority)
        candidate.definition = definition
        candidate.inline = inline
        active_word = self.dictionary.register(candidate)
        is_active = active_word is candidate
        self.definition_stack.append((candidate, is_active))

    def _end_definition(self, token: Token) -> None:
        if len(self.context_stack) <= 1:
            raise ParseError(f"unexpected '{token.lexeme}' at {token.line}:{token.column}")
        ctx = self.context_stack.pop()
        if not isinstance(ctx, Definition):
            raise ParseError(f"'{token.lexeme}' can only close definitions")
        if ctx.terminator != token.lexeme:
            raise ParseError(
                f"definition '{ctx.name}' expects terminator '{ctx.terminator}' but got '{token.lexeme}'"
            )
        word, is_active = self.definition_stack.pop()
        if not is_active:
            return
        ctx.immediate = word.immediate
        ctx.compile_only = word.compile_only
        ctx.inline = word.inline
        if word.compile_only or word.immediate:
            word.compile_time_override = True
            word.compile_time_intrinsic = None
        module = self.context_stack[-1]
        if not isinstance(module, Module):
            raise ParseError("nested definitions are not supported yet")
        module.forms.append(ctx)
        self.last_defined = word

    def _parse_effect_annotations(self) -> List[str]:
        """Parse a '(effects ...)' clause that follows a :asm name."""
        open_tok = self._consume()
        if open_tok.lexeme != "(":  # pragma: no cover - defensive
            raise ParseError("internal parser error: effect clause must start with '('")
        tokens: List[Token] = []
        while True:
            if self._eof():
                raise ParseError("unterminated effect clause in asm definition")
            tok = self._consume()
            if tok.lexeme == ")":
                break
            tokens.append(tok)
        if not tokens:
            raise ParseError("effect clause must include 'effect' or 'effects'")
        keyword = tokens.pop(0)
        if keyword.lexeme.lower() not in {"effect", "effects"}:
            raise ParseError(
                f"effect clause must start with 'effect' or 'effects', got '{keyword.lexeme}'"
            )
        effect_names: List[str] = []
        for tok in tokens:
            if tok.lexeme == ",":
                continue
            normalized = tok.lexeme.lower().replace("_", "-")
            canonical = _WORD_EFFECT_ALIASES.get(normalized)
            if canonical is None:
                raise ParseError(
                    f"unknown effect '{tok.lexeme}' at {tok.line}:{tok.column}"
                )
            if canonical not in effect_names:
                effect_names.append(canonical)
        if not effect_names:
            raise ParseError("effect clause missing effect names")
        return effect_names

    def _parse_asm_definition(self, token: Token) -> None:
        if self._eof():
            raise ParseError(f"definition name missing after ':asm' at {token.line}:{token.column}")
        inline_def = self._consume_pending_inline()
        name_token = self._consume()
        effect_names: Optional[List[str]] = None
        if not self._eof():
            next_token = self.peek_token()
            if next_token is not None and next_token.lexeme == "(":
                effect_names = self._parse_effect_annotations()
        brace_token = self._consume()
        if brace_token.lexeme != "{":
            raise ParseError(f"expected '{{' after asm name at {brace_token.line}:{brace_token.column}")
        block_start = brace_token.end
        block_end: Optional[int] = None
        # Scan for closing brace directly via list indexing (avoid method-call overhead)
        _tokens = self.tokens
        _tlen = len(_tokens)
        _pos = self.pos
        while _pos < _tlen:
            nt = _tokens[_pos]
            _pos += 1
            if nt.lexeme == "}":
                block_end = nt.start
                break
        self.pos = _pos
        if block_end is None:
            raise ParseError("missing '}' to terminate asm body")
        asm_body = self.source[block_start:block_end]
        priority = self._consume_pending_priority()
        definition = AsmDefinition(name=name_token.lexeme, body=asm_body, inline=inline_def)
        if effect_names is not None:
            definition.effects = set(effect_names)
        candidate = Word(name=definition.name, priority=priority)
        candidate.definition = definition
        if inline_def:
            candidate.inline = True
        word = self.dictionary.register(candidate)
        if word is candidate:
            definition.immediate = word.immediate
            definition.compile_only = word.compile_only
        module = self.context_stack[-1]
        if not isinstance(module, Module):
            raise ParseError("asm definitions must be top-level forms")
        if word is candidate:
            module.forms.append(definition)
            self.last_defined = word
        if self._eof():
            raise ParseError("asm definition missing terminator ';'")
        terminator = self._consume()
        if terminator.lexeme != ";":
            raise ParseError(f"expected ';' after asm definition at {terminator.line}:{terminator.column}")

    def _parse_py_definition(self, token: Token) -> None:
        if self._eof():
            raise ParseError(f"definition name missing after ':py' at {token.line}:{token.column}")
        name_token = self._consume()
        brace_token = self._consume()
        if brace_token.lexeme != "{":
            raise ParseError(f"expected '{{' after py name at {brace_token.line}:{brace_token.column}")
        block_start = brace_token.end
        block_end: Optional[int] = None
        _tokens = self.tokens
        _tlen = len(_tokens)
        _pos = self.pos
        while _pos < _tlen:
            nt = _tokens[_pos]
            _pos += 1
            if nt.lexeme == "}":
                block_end = nt.start
                break
        self.pos = _pos
        if block_end is None:
            raise ParseError("missing '}' to terminate py body")
        import textwrap
        py_body = textwrap.dedent(self.source[block_start:block_end])
        priority = self._consume_pending_priority()
        candidate = Word(name=name_token.lexeme, priority=priority)
        word = self.dictionary.register(candidate)
        if word is not candidate:
            if self._eof():
                raise ParseError("py definition missing terminator ';'")
            terminator = self._consume()
            if terminator.lexeme != ";":
                raise ParseError(f"expected ';' after py definition at {terminator.line}:{terminator.column}")
            return
        namespace = self._py_exec_namespace()
        try:
            exec(py_body, namespace)
        except Exception as exc:  # pragma: no cover - user code
            raise ParseError(f"python macro body for '{word.name}' raised: {exc}") from exc
        macro_fn = namespace.get("macro")
        intrinsic_fn = namespace.get("intrinsic")
        if macro_fn is None and intrinsic_fn is None:
            raise ParseError("python definition must define 'macro' or 'intrinsic'")
        if macro_fn is not None:
            word.macro = macro_fn
            word.immediate = True
        if intrinsic_fn is not None:
            word.intrinsic = intrinsic_fn
        if self._eof():
            raise ParseError("py definition missing terminator ';'")
        terminator = self._consume()
        if terminator.lexeme != ";":
            raise ParseError(f"expected ';' after py definition at {terminator.line}:{terminator.column}")

    def _py_exec_namespace(self) -> Dict[str, Any]:
        return dict(PY_EXEC_GLOBALS)

    def _append_op(self, node: Op) -> None:
        if node.loc is None:
            tok = self._last_token
            if tok is not None:
                # Inlined fast path of location_for_token
                spans = self.file_spans
                if spans:
                    if self._span_index_len != len(spans):
                        self._rebuild_span_index()
                        self._span_cache_idx = -1
                    tl = tok.line
                    ci = self._span_cache_idx
                    if ci >= 0:
                        span = spans[ci]
                        if span.start_line <= tl < span.end_line:
                            node.loc = _make_loc(span.path, span.local_start_line + (tl - span.start_line), tok.column)
                        else:
                            node.loc = self._location_for_token_slow(tok, tl)
                    else:
                        node.loc = self._location_for_token_slow(tok, tl)
                else:
                    node.loc = _make_loc(_SOURCE_PATH, tok.line, tok.column)
        target = self.context_stack[-1]
        if target.__class__ is Definition:
            target.body.append(node)
        else:
            target.forms.append(node)

    def _location_for_token_slow(self, token: Token, tl: int) -> SourceLocation:
        """Slow path for location_for_token: bisect lookup."""
        span_starts = self._span_starts
        idx = bisect.bisect_right(span_starts, tl) - 1
        if idx >= 0:
            span = self.file_spans[idx]
            if tl < span.end_line:
                self._span_cache_idx = idx
                return _make_loc(span.path, span.local_start_line + (tl - span.start_line), token.column)
        return _make_loc(_SOURCE_PATH, tl, token.column)

    def _try_literal(self, token: Token) -> bool:
        lexeme = token.lexeme
        first = lexeme[0] if lexeme else '\0'
        if first.isdigit() or first == '-' or first == '+':
            try:
                value = int(lexeme, 0)
                self._append_op(_make_literal_op(value))
                return True
            except ValueError:
                pass

        # Try float
        if first.isdigit() or first == '-' or first == '+' or first == '.':
            try:
                if "." in lexeme or "e" in lexeme.lower():
                    value = float(lexeme)
                    self._append_op(_make_literal_op(value))
                    return True
            except ValueError:
                pass

        if first == '"':
            string_value = _parse_string_literal(token)
            if string_value is not None:
                self._append_op(_make_literal_op(string_value))
                return True

        return False

    def _consume(self) -> Token:
        pos = self.pos
        if pos >= len(self.tokens):
            raise ParseError("unexpected EOF")
        self.pos = pos + 1
        return self.tokens[pos]

    def _eof(self) -> bool:
        return self.pos >= len(self.tokens)

    def _ensure_tokens(self, upto: int) -> None:
        if self._token_iter_exhausted:
            return
        if self._token_iter is None:
            self._token_iter_exhausted = True
            return
        while len(self.tokens) <= upto and not self._token_iter_exhausted:
            try:
                next_tok = next(self._token_iter)
            except StopIteration:
                self._token_iter_exhausted = True
                break
            self.tokens.append(next_tok)


# ---------------------------------------------------------------------------
# Compile-time VM helpers
# ---------------------------------------------------------------------------


def _to_i64(v: int) -> int:
    """Truncate to signed 64-bit integer (matching x86-64 register semantics)."""
    v = v & 0xFFFFFFFFFFFFFFFF
    if v >= 0x8000000000000000:
        v -= 0x10000000000000000
    return v


class _CTVMJump(Exception):
    """Raised by the ``jmp`` intrinsic to transfer control in _execute_nodes."""

    def __init__(self, target_ip: int) -> None:
        self.target_ip = target_ip


class _CTVMReturn(Exception):
    """Raised to return from the current word frame in _execute_nodes."""


class _CTVMExit(Exception):
    """Raised by the ``exit`` intrinsic to stop compile-time execution."""

    def __init__(self, code: int = 0) -> None:
        self.code = code


class CTMemory:
    """Managed memory for the compile-time VM.

    Uses ctypes buffers with real process addresses so that ``c@``, ``c!``,
    ``@``, ``!`` can operate on them directly via ``ctypes.from_address``.

    String literals are slab-allocated from a contiguous data section so that
    ``data_start``/``data_end`` bracket them correctly for ``print``'s range
    check.
    """

    PERSISTENT_SIZE = 64  # matches default BSS ``persistent: resb 64``
    PRINT_BUF_SIZE = 128   # matches ``PRINT_BUF_BYTES``
    DATA_SECTION_SIZE = 4 * 1024 * 1024  # 4 MB slab for string literals

    def __init__(self, persistent_size: int = 0) -> None:
        import ctypes as _ctypes
        globals().setdefault('ctypes', _ctypes)
        self._buffers: List[Any] = []  # prevent GC of ctypes objects
        self._string_cache: Dict[str, Tuple[int, int]] = {}  # cache string literals

        # Persistent BSS region (for ``mem`` word)
        actual_persistent = persistent_size if persistent_size > 0 else self.PERSISTENT_SIZE
        self._persistent = ctypes.create_string_buffer(actual_persistent)
        self._persistent_size = actual_persistent
        self._buffers.append(self._persistent)
        self.persistent_addr: int = ctypes.addressof(self._persistent)

        # print_buf region (for words that use ``[rel print_buf]``)
        self._print_buf = ctypes.create_string_buffer(self.PRINT_BUF_SIZE)
        self._buffers.append(self._print_buf)
        self.print_buf_addr: int = ctypes.addressof(self._print_buf)

        # Data section – contiguous slab for string literals so that
        # data_start..data_end consistently brackets all of them.
        self._data_section = ctypes.create_string_buffer(self.DATA_SECTION_SIZE)
        self._buffers.append(self._data_section)
        self.data_start: int = ctypes.addressof(self._data_section)
        self.data_end: int = self.data_start + self.DATA_SECTION_SIZE
        self._data_offset: int = 0

        # sys_argc / sys_argv – populated by invoke()
        self._sys_argc = ctypes.c_int64(0)
        self._buffers.append(self._sys_argc)
        self.sys_argc_addr: int = ctypes.addressof(self._sys_argc)

        self._sys_argv_ptrs: Optional[ctypes.Array[Any]] = None
        self._sys_argv = ctypes.c_int64(0)  # qword holding pointer to argv array
        self._buffers.append(self._sys_argv)
        self.sys_argv_addr: int = ctypes.addressof(self._sys_argv)

    # -- argv helpers ------------------------------------------------------

    def setup_argv(self, args: List[str]) -> None:
        """Populate sys_argc / sys_argv from *args*."""
        self._sys_argc.value = len(args)
        # Build null-terminated C string array
        argv_bufs: List[Any] = []
        for arg in args:
            encoded = arg.encode("utf-8") + b"\x00"
            buf = ctypes.create_string_buffer(encoded, len(encoded))
            self._buffers.append(buf)
            argv_bufs.append(buf)
        # pointer array (+ NULL sentinel)
        arr_type = ctypes.c_int64 * (len(args) + 1)
        self._sys_argv_ptrs = arr_type()
        for i, buf in enumerate(argv_bufs):
            self._sys_argv_ptrs[i] = ctypes.addressof(buf)
        self._sys_argv_ptrs[len(args)] = 0
        self._buffers.append(self._sys_argv_ptrs)
        self._sys_argv.value = ctypes.addressof(self._sys_argv_ptrs)

    # -- allocation --------------------------------------------------------

    def allocate(self, size: int) -> int:
        """Allocate a zero-filled region, return its real address.
        Adds padding to mimic real mmap which always gives full pages."""
        if size <= 0:
            size = 1
        buf = ctypes.create_string_buffer(size + 16)  # padding for null terminators
        addr = ctypes.addressof(buf)
        self._buffers.append(buf)
        return addr

    def store_string(self, s: str) -> Tuple[int, int]:
        """Store a UTF-8 string in the data section slab.  Returns ``(addr, length)``.
        Caches immutable string literals to avoid redundant allocations."""
        cached = self._string_cache.get(s)
        if cached is not None:
            return cached
        encoded = s.encode("utf-8")
        needed = len(encoded) + 1  # null terminator
        aligned = (needed + 7) & ~7  # 8-byte align
        if self._data_offset + aligned > self.DATA_SECTION_SIZE:
            raise RuntimeError("CT data section overflow")
        addr = self.data_start + self._data_offset
        ctypes.memmove(addr, encoded, len(encoded))
        ctypes.c_uint8.from_address(addr + len(encoded)).value = 0  # null terminator
        self._data_offset += aligned
        result = (addr, len(encoded))
        self._string_cache[s] = result
        return result

    # -- low-level access --------------------------------------------------

    @staticmethod
    def read_byte(addr: int) -> int:
        return ctypes.c_uint8.from_address(addr).value

    @staticmethod
    def write_byte(addr: int, value: int) -> None:
        ctypes.c_uint8.from_address(addr).value = value & 0xFF

    @staticmethod
    def read_qword(addr: int) -> int:
        return ctypes.c_int64.from_address(addr).value

    @staticmethod
    def write_qword(addr: int, value: int) -> None:
        ctypes.c_int64.from_address(addr).value = _to_i64(value)

    @staticmethod
    def read_bytes(addr: int, length: int) -> bytes:
        return ctypes.string_at(addr, length)


class CompileTimeVM:
    NATIVE_STACK_SIZE = 8 * 1024 * 1024  # 8 MB per native stack

    def __init__(self, parser: Parser) -> None:
        self.parser = parser
        self.dictionary = parser.dictionary
        self.stack: List[Any] = []
        self.return_stack: List[Any] = []
        self.loop_stack: List[Dict[str, Any]] = []
        self._handles = _CTHandleTable()
        self.call_stack: List[str] = []
        # Runtime-faithful execution state — lazily allocated on first use
        self._memory: Optional[CTMemory] = None
        self.runtime_mode: bool = False
        self._list_capture_stack: List[Any] = []  # for list_begin/list_end (int depth or native r12 addr)
        self._ct_executed: Set[str] = set()  # words already executed at CT
        # Native stack state (used only in runtime_mode)
        self.r12: int = 0  # data stack pointer (grows downward)
        self.r13: int = 0  # return stack pointer (grows downward)
        self._native_data_stack: Optional[Any] = None   # ctypes buffer
        self._native_data_top: int = 0
        # REPL persistent state
        self._repl_initialized: bool = False
        self._repl_libs: List[str] = []
        self._native_return_stack: Optional[Any] = None  # ctypes buffer
        self._native_return_top: int = 0
        # JIT cache: word name -> ctypes callable
        self._jit_cache: Dict[str, Any] = {}
        self._jit_code_pages: List[Any] = []  # keep mmap pages alive
        # Pre-allocated output structs for JIT calls (lazily allocated)
        self._jit_out2: Optional[Any] = None
        self._jit_out2_addr: int = 0
        self._jit_out4: Optional[Any] = None
        self._jit_out4_addr: int = 0
        # BSS symbol table for JIT patching
        self._bss_symbols: Dict[str, int] = {}
        # dlopen handles for C extern support
        self._dl_handles: List[Any] = []  # ctypes.CDLL handles
        self._dl_func_cache: Dict[str, Any] = {}  # name -> ctypes callable
        self._ct_libs: List[str] = []  # library names from -l flags
        self._ctypes_struct_cache: Dict[str, Any] = {}
        self.current_location: Optional[SourceLocation] = None
        # Coroutine JIT support: save buffer for callee-saved regs (lazily allocated)
        self._jit_save_buf: Optional[Any] = None
        self._jit_save_buf_addr: int = 0

    @property
    def memory(self) -> CTMemory:
        if self._memory is None:
            self._memory = CTMemory()
        return self._memory

    @memory.setter
    def memory(self, value: CTMemory) -> None:
        self._memory = value

    def _ensure_jit_out(self) -> None:
        if self._jit_out2 is None:
            import ctypes as _ctypes
            globals().setdefault('ctypes', _ctypes)
            self._jit_out2 = (_ctypes.c_int64 * 2)()
            self._jit_out2_addr = _ctypes.addressof(self._jit_out2)
            self._jit_out4 = (_ctypes.c_int64 * 4)()
            self._jit_out4_addr = _ctypes.addressof(self._jit_out4)

    def _ensure_jit_save_buf(self) -> None:
        if self._jit_save_buf is None:
            self._jit_save_buf = (ctypes.c_int64 * 8)()
            self._jit_save_buf_addr = ctypes.addressof(self._jit_save_buf)

    @staticmethod
    def _is_coroutine_asm(body: str) -> bool:
        """Detect asm words that manipulate the x86 return stack (coroutine patterns).

        Heuristic: if the body pops rsi/rdi before any label (capturing the
        return address), it's a coroutine word.
        """
        for raw_line in body.splitlines():
            line = raw_line.strip()
            if not line or line.startswith(";"):
                continue
            if _RE_LABEL_PAT.match(line):
                break
            if line.startswith("pop "):
                reg = line.split()[1].rstrip(",")
                if reg in ("rsi", "rdi"):
                    return True
        return False

    def reset(self) -> None:
        self.stack.clear()
        self.return_stack.clear()
        self.loop_stack.clear()
        self._handles.clear()
        self.call_stack.clear()
        self._list_capture_stack.clear()
        self.r12 = 0
        self.r13 = 0
        self.current_location = None
        self._repl_initialized = False

    def invoke(self, word: Word, *, runtime_mode: bool = False, libs: Optional[List[str]] = None) -> None:
        self.reset()
        self._ensure_jit_out()
        prev_mode = self.runtime_mode
        self.runtime_mode = runtime_mode
        if runtime_mode:
            # Determine persistent size from BSS overrides if available.
            persistent_size = 0
            if self.parser.custom_bss:
                for bss_line in self.parser.custom_bss:
                    m = _RE_BSS_PERSISTENT.search(bss_line)
                    if m:
                        persistent_size = int(m.group(1))
            self.memory = CTMemory(persistent_size)  # fresh memory per invocation
            self.memory.setup_argv(sys.argv)

            # Allocate native stacks
            self._native_data_stack = ctypes.create_string_buffer(self.NATIVE_STACK_SIZE)
            self._native_data_top = ctypes.addressof(self._native_data_stack) + self.NATIVE_STACK_SIZE
            self.r12 = self._native_data_top  # empty, grows downward

            self._native_return_stack = ctypes.create_string_buffer(self.NATIVE_STACK_SIZE)
            self._native_return_top = ctypes.addressof(self._native_return_stack) + self.NATIVE_STACK_SIZE
            self.r13 = self._native_return_top  # empty, grows downward

            # BSS symbol table for JIT [rel SYMBOL] patching
            self._bss_symbols = {
                "data_start": self.memory.data_start,
                "data_end": self.memory.data_start + self.memory._data_offset if self.memory._data_offset else self.memory.data_end,
                "print_buf": self.memory.print_buf_addr,
                "print_buf_end": self.memory.print_buf_addr + CTMemory.PRINT_BUF_SIZE,
                "persistent": self.memory.persistent_addr,
                "persistent_end": self.memory.persistent_addr + self.memory._persistent_size,
                "sys_argc": self.memory.sys_argc_addr,
                "sys_argv": self.memory.sys_argv_addr,
            }

            # JIT cache is per-invocation (addresses change)
            self._jit_cache = {}
            self._jit_code_pages = []

            # dlopen libraries for C extern support
            self._dl_handles = []
            self._dl_func_cache = {}
            all_libs = list(self._ct_libs)
            if libs:
                for lib in libs:
                    if lib not in all_libs:
                        all_libs.append(lib)
            for lib_name in all_libs:
                self._dlopen(lib_name)

            # Deep word chains need extra Python stack depth.
            old_limit = sys.getrecursionlimit()
            if old_limit < 10000:
                sys.setrecursionlimit(10000)
        try:
            self._call_word(word)
        except _CTVMExit:
            pass  # graceful exit from CT execution
        finally:
            self.runtime_mode = prev_mode
            # Clear JIT cache; code pages are libc mmap'd and we intentionally
            # leak them — the OS reclaims them at process exit.
            self._jit_cache.clear()
            self._jit_code_pages.clear()
            self._dl_func_cache.clear()
            self._dl_handles.clear()

    def invoke_with_args(self, word: Word, args: Sequence[Any]) -> None:
        self.reset()
        for value in args:
            self.push(value)
        self._call_word(word)

    def invoke_repl(self, word: Word, *, libs: Optional[List[str]] = None) -> None:
        """Execute *word* in runtime mode, preserving stack/memory across calls.

        On the first call (or after ``reset()``), allocates native stacks and
        memory.  Subsequent calls reuse the existing state so values left on
        the data stack persist between REPL evaluations.
        """
        self._ensure_jit_out()
        prev_mode = self.runtime_mode
        self.runtime_mode = True

        if not self._repl_initialized:
            persistent_size = 0
            if self.parser.custom_bss:
                for bss_line in self.parser.custom_bss:
                    m = _RE_BSS_PERSISTENT.search(bss_line)
                    if m:
                        persistent_size = int(m.group(1))
            self.memory = CTMemory(persistent_size)
            self.memory.setup_argv(sys.argv)

            self._native_data_stack = ctypes.create_string_buffer(self.NATIVE_STACK_SIZE)
            self._native_data_top = ctypes.addressof(self._native_data_stack) + self.NATIVE_STACK_SIZE
            self.r12 = self._native_data_top

            self._native_return_stack = ctypes.create_string_buffer(self.NATIVE_STACK_SIZE)
            self._native_return_top = ctypes.addressof(self._native_return_stack) + self.NATIVE_STACK_SIZE
            self.r13 = self._native_return_top

            self._bss_symbols = {
                "data_start": self.memory.data_start,
                "data_end": self.memory.data_start + self.memory._data_offset if self.memory._data_offset else self.memory.data_end,
                "print_buf": self.memory.print_buf_addr,
                "print_buf_end": self.memory.print_buf_addr + CTMemory.PRINT_BUF_SIZE,
                "persistent": self.memory.persistent_addr,
                "persistent_end": self.memory.persistent_addr + self.memory._persistent_size,
                "sys_argc": self.memory.sys_argc_addr,
                "sys_argv": self.memory.sys_argv_addr,
            }
            self._jit_cache = {}
            self._jit_code_pages = []
            self._dl_handles = []
            self._dl_func_cache = {}
            all_libs = list(self._ct_libs)
            if libs:
                for lib in libs:
                    if lib not in all_libs:
                        all_libs.append(lib)
            for lib_name in all_libs:
                self._dlopen(lib_name)

            old_limit = sys.getrecursionlimit()
            if old_limit < 10000:
                sys.setrecursionlimit(10000)
            self._repl_initialized = True
            self._repl_libs = list(libs or [])
        else:
            # Subsequent call — open any new libraries not yet loaded
            if libs:
                for lib in libs:
                    if lib not in self._repl_libs:
                        self._dlopen(lib)
                        self._repl_libs.append(lib)

        # Clear transient state but keep stacks and memory
        self.call_stack.clear()
        self.loop_stack.clear()
        self._list_capture_stack.clear()
        self.current_location = None
        # JIT cache must be cleared because word definitions change between
        # REPL evaluations (re-parsed each time).
        self._jit_cache.clear()

        try:
            self._call_word(word)
        except _CTVMExit:
            pass
        finally:
            self.runtime_mode = prev_mode

    def repl_stack_values(self) -> List[int]:
        """Return current native data stack contents (bottom to top)."""
        if not self._repl_initialized or self.r12 >= self._native_data_top:
            return []
        values = []
        addr = self._native_data_top - 8
        while addr >= self.r12:
            values.append(CTMemory.read_qword(addr))
            addr -= 8
        return values

    def push(self, value: Any) -> None:
        if self.runtime_mode:
            self.r12 -= 8
            if isinstance(value, float):
                bits = _get_struct().unpack("q", _get_struct().pack("d", value))[0]
                CTMemory.write_qword(self.r12, bits)
            else:
                CTMemory.write_qword(self.r12, _to_i64(int(value)))
        else:
            self.stack.append(value)

    def pop(self) -> Any:
        if self.runtime_mode:
            if self.r12 >= self._native_data_top:
                raise ParseError("compile-time stack underflow")
            val = CTMemory.read_qword(self.r12)
            self.r12 += 8
            return val
        if not self.stack:
            raise ParseError("compile-time stack underflow")
        return self.stack.pop()

    def _resolve_handle(self, value: Any) -> Any:
        if isinstance(value, int):
            for delta in (0, -1, 1):
                candidate = value + delta
                if candidate in self._handles.objects:
                    obj = self._handles.objects[candidate]
                    self._handles.objects[value] = obj
                    return obj
            # Occasionally a raw object id can appear on the stack; recover it if we still
            # hold the object reference.
            for obj in self._handles.objects.values():
                if id(obj) == value:
                    self._handles.objects[value] = obj
                    return obj
        return value

    def peek(self) -> Any:
        if self.runtime_mode:
            if self.r12 >= self._native_data_top:
                raise ParseError("compile-time stack underflow")
            return CTMemory.read_qword(self.r12)
        if not self.stack:
            raise ParseError("compile-time stack underflow")
        return self.stack[-1]

    def pop_int(self) -> int:
        if self.runtime_mode:
            return self.pop()  # already returns int from native stack
        value = self.pop()
        if isinstance(value, bool):
            return int(value)
        if not isinstance(value, int):
            raise ParseError(f"expected integer on compile-time stack, got {type(value).__name__}: {value!r}")
        return value

    # -- return stack helpers (native r13 in runtime_mode) -----------------

    def push_return(self, value: int) -> None:
        if self.runtime_mode:
            self.r13 -= 8
            CTMemory.write_qword(self.r13, _to_i64(value))
        else:
            self.return_stack.append(value)

    def pop_return(self) -> int:
        if self.runtime_mode:
            val = CTMemory.read_qword(self.r13)
            self.r13 += 8
            return val
        return self.return_stack.pop()

    def peek_return(self) -> int:
        if self.runtime_mode:
            return CTMemory.read_qword(self.r13)
        return self.return_stack[-1]

    def poke_return(self, value: int) -> None:
        """Overwrite top of return stack."""
        if self.runtime_mode:
            CTMemory.write_qword(self.r13, _to_i64(value))
        else:
            self.return_stack[-1] = value

    def return_stack_empty(self) -> bool:
        if self.runtime_mode:
            return self.r13 >= self._native_return_top
        return len(self.return_stack) == 0

    # -- native stack depth ------------------------------------------------

    def native_stack_depth(self) -> int:
        """Number of items on data stack (runtime_mode only)."""
        return (self._native_data_top - self.r12) // 8

    def pop_str(self) -> str:
        value = self._resolve_handle(self.pop())
        if not isinstance(value, str):
            raise ParseError("expected string on compile-time stack")
        return value

    def pop_list(self) -> List[Any]:
        value = self._resolve_handle(self.pop())
        if not isinstance(value, list):
            known = value in self._handles.objects if isinstance(value, int) else False
            handles_size = len(self._handles.objects)
            handle_keys = list(self._handles.objects.keys())
            raise ParseError(
                f"expected list on compile-time stack, got {type(value).__name__} value={value!r} known_handle={known} handles={handles_size}:{handle_keys!r} stack={self.stack!r}"
            )
        return value

    def pop_token(self) -> Token:
        value = self._resolve_handle(self.pop())
        if not isinstance(value, Token):
            raise ParseError("expected token on compile-time stack")
        return value

    # -- dlopen / C extern support -----------------------------------------

    def _dlopen(self, lib_name: str) -> None:
        """Open a shared library and append to _dl_handles."""
        import ctypes.util
        # Try as given first (handles absolute paths, "libc.so.6", etc.)
        candidates = [lib_name]
        # If given a static archive (.a), try .so from the same directory
        if lib_name.endswith(".a"):
            so_variant = lib_name[:-2] + ".so"
            candidates.append(so_variant)
        # Try lib<name>.so
        if not lib_name.startswith("lib") and "." not in lib_name:
            candidates.append(f"lib{lib_name}.so")
        # Use ctypes.util.find_library for short names like "m", "c"
        found = ctypes.util.find_library(lib_name)
        if found:
            candidates.append(found)
        for candidate in candidates:
            try:
                handle = ctypes.CDLL(candidate, use_errno=True)
                self._dl_handles.append(handle)
                return
            except OSError:
                continue
        # Not fatal — the library may not be needed at CT

    _CTYPE_MAP: Optional[Dict[str, Any]] = None

    @classmethod
    def _get_ctype_map(cls) -> Dict[str, Any]:
        if cls._CTYPE_MAP is None:
            import ctypes
            cls._CTYPE_MAP = {
                "int": ctypes.c_int,
                "int8_t": ctypes.c_int8,
                "uint8_t": ctypes.c_uint8,
                "int16_t": ctypes.c_int16,
                "uint16_t": ctypes.c_uint16,
                "int32_t": ctypes.c_int32,
                "uint32_t": ctypes.c_uint32,
                "long": ctypes.c_long,
                "long long": ctypes.c_longlong,
                "int64_t": ctypes.c_int64,
                "unsigned int": ctypes.c_uint,
                "unsigned long": ctypes.c_ulong,
                "unsigned long long": ctypes.c_ulonglong,
                "uint64_t": ctypes.c_uint64,
                "size_t": ctypes.c_size_t,
                "ssize_t": ctypes.c_ssize_t,
                "char": ctypes.c_char,
                "char*": ctypes.c_void_p,
                "void*": ctypes.c_void_p,
                "double": ctypes.c_double,
                "float": ctypes.c_float,
            }
        return cls._CTYPE_MAP

    def _resolve_struct_ctype(self, struct_name: str) -> Any:
        cached = self._ctypes_struct_cache.get(struct_name)
        if cached is not None:
            return cached
        layout = self.parser.cstruct_layouts.get(struct_name)
        if layout is None:
            raise ParseError(f"unknown cstruct '{struct_name}' used in extern signature")
        fields = []
        for field in layout.fields:
            fields.append((field.name, self._resolve_ctype(field.type_name)))
        struct_cls = type(f"CTStruct_{sanitize_label(struct_name)}", (ctypes.Structure,), {"_fields_": fields})
        self._ctypes_struct_cache[struct_name] = struct_cls
        return struct_cls

    def _resolve_ctype(self, type_name: str) -> Any:
        """Map a C type name string to a ctypes type."""
        import ctypes
        t = _canonical_c_type_name(type_name)
        if t.endswith("*"):
            return ctypes.c_void_p
        if t.startswith("struct "):
            return self._resolve_struct_ctype(t[len("struct "):].strip())
        t = t.replace("*", "* ").replace("  ", " ").strip()
        ctype_map = self._get_ctype_map()
        if t in ctype_map:
            return ctype_map[t]
        # Default to c_long (64-bit on Linux x86-64)
        return ctypes.c_long

    def _dlsym(self, name: str) -> Any:
        """Look up a symbol across all dl handles, return a raw function pointer or None."""
        for handle in self._dl_handles:
            try:
                return getattr(handle, name)
            except AttributeError:
                continue
        return None

    def _call_extern_ct(self, word: Word) -> None:
        """Call an extern C function via dlsym/ctypes on the native stacks."""
        name = word.name

        # Special handling for exit — intercept it before doing anything
        if name == "exit":
            raise _CTVMExit()

        func = self._dl_func_cache.get(name)
        if func is None:
            raw = self._dlsym(name)
            if raw is None:
                raise ParseError(f"extern '{name}' not found in any loaded library")

            signature = word.extern_signature
            inputs = word.extern_inputs
            outputs = word.extern_outputs

            if signature:
                arg_types, ret_type = signature
                c_arg_types = [self._resolve_ctype(t) for t in arg_types]
                if ret_type == "void":
                    c_ret_type = None
                else:
                    c_ret_type = self._resolve_ctype(ret_type)
            else:
                # Legacy mode: assume all int64 args
                arg_types = []
                c_arg_types = [ctypes.c_int64] * inputs
                c_ret_type = ctypes.c_int64 if outputs > 0 else None

            # Configure the ctypes function object directly
            raw.restype = c_ret_type
            raw.argtypes = c_arg_types
            # Stash metadata for calling
            raw._ct_inputs = inputs
            raw._ct_outputs = outputs
            raw._ct_arg_types = c_arg_types
            raw._ct_ret_type = c_ret_type
            raw._ct_signature = signature
            func = raw
            self._dl_func_cache[name] = func

        inputs = func._ct_inputs
        outputs = func._ct_outputs
        arg_types = list(func._ct_signature[0]) if func._ct_signature else []

        # For variadic externs, the TOS value is the extra arg count
        # (consumed by the compiler, not passed to C).
        va_extra = 0
        if getattr(word, "extern_variadic", False):
            va_extra = int(self.pop())
            inputs += va_extra
            for _ in range(va_extra):
                arg_types.append("long")
            # Update ctypes argtypes to include the variadic args
            func.argtypes = list(func._ct_arg_types) + [ctypes.c_int64] * va_extra

        # Pop arguments off the native data stack (right-to-left / reverse order)
        raw_args = []
        for i in range(inputs):
            raw_args.append(self.pop())
        raw_args.reverse()

        # Convert arguments to proper ctypes values
        call_args = []
        for i, raw in enumerate(raw_args):
            arg_type = _canonical_c_type_name(arg_types[i]) if i < len(arg_types) else None
            if arg_type in ("float", "double"):
                # Reinterpret the int64 bits as a double (matching the language's convention)
                raw_int = _to_i64(int(raw))
                double_val = _get_struct().unpack("d", _get_struct().pack("q", raw_int))[0]
                call_args.append(double_val)
            elif arg_type is not None and arg_type.startswith("struct ") and not arg_type.endswith("*"):
                struct_name = arg_type[len("struct "):].strip()
                struct_ctype = self._resolve_struct_ctype(struct_name)
                call_args.append(struct_ctype.from_address(int(raw)))
            else:
                call_args.append(int(raw))

        result = func(*call_args)

        if outputs > 0 and result is not None:
            ret_type = _canonical_c_type_name(func._ct_signature[1]) if func._ct_signature else None
            if ret_type in ("float", "double"):
                int_bits = _get_struct().unpack("q", _get_struct().pack("d", float(result)))[0]
                self.push(int_bits)
            elif ret_type is not None and ret_type.startswith("struct "):
                struct_name = ret_type[len("struct "):].strip()
                layout = self.parser.cstruct_layouts.get(struct_name)
                if layout is None:
                    raise ParseError(f"unknown cstruct '{struct_name}' used in extern return type")
                out_ptr = self.memory.allocate(layout.size)
                ctypes.memmove(out_ptr, ctypes.byref(result), layout.size)
                self.push(out_ptr)
            else:
                self.push(int(result))

    def _call_word(self, word: Word) -> None:
        self.call_stack.append(word.name)
        try:
            definition = word.definition
            # In runtime_mode, prefer runtime_intrinsic (for exit/jmp/syscall
            # and __with_* variables).  All other :asm words run as native JIT.
            if self.runtime_mode and word.runtime_intrinsic is not None:
                word.runtime_intrinsic(self)
                return
            prefer_definition = word.compile_time_override or (isinstance(definition, Definition) and (word.immediate or word.compile_only))
            if not prefer_definition and word.compile_time_intrinsic is not None:
                word.compile_time_intrinsic(self)
                return
            # C extern words: call via dlopen/dlsym in runtime_mode
            if self.runtime_mode and getattr(word, "is_extern", False):
                self._call_extern_ct(word)
                return
            if definition is None:
                raise ParseError(f"word '{word.name}' has no compile-time definition")
            if isinstance(definition, AsmDefinition):
                if self.runtime_mode:
                    self._run_jit(word)
                else:
                    self._run_asm_definition(word)
                return
            # Whole-word JIT for regular definitions in runtime mode
            if self.runtime_mode and isinstance(definition, Definition):
                ck = f"__defn_jit_{word.name}"
                jf = self._jit_cache.get(ck)
                if jf is None and ck + "_miss" not in self._jit_cache:
                    jf = self._compile_definition_jit(word)
                    if jf is not None:
                        self._jit_cache[ck] = jf
                        self._jit_cache[ck + "_addr"] = ctypes.cast(jf, ctypes.c_void_p).value
                    else:
                        self._jit_cache[ck + "_miss"] = True
                if jf is not None:
                    out = self._jit_out2
                    jf(self.r12, self.r13, self._jit_out2_addr)
                    self.r12 = out[0]
                    self.r13 = out[1]
                    return
            self._execute_nodes(definition.body, _defn=definition)
        except CompileTimeError:
            raise
        except (_CTVMJump, _CTVMExit, _CTVMReturn):
            raise
        except ParseError as exc:
            raise CompileTimeError(f"{exc}\ncompile-time stack: {' -> '.join(self.call_stack)}") from None
        except Exception as exc:
            raise CompileTimeError(
                f"compile-time failure in '{word.name}': {exc}\ncompile-time stack: {' -> '.join(self.call_stack)}"
            ) from None
        finally:
            self.call_stack.pop()

    # -- Native JIT execution (runtime_mode) --------------------------------

    _JIT_FUNC_TYPE: Optional[Any] = None

    def _run_jit(self, word: Word) -> None:
        """JIT-compile (once) and execute an :asm word on the native r12/r13 stacks."""
        func = self._jit_cache.get(word.name)
        if func is None:
            func = self._compile_jit(word)
            self._jit_cache[word.name] = func

        out = self._jit_out2
        func(self.r12, self.r13, self._jit_out2_addr)
        self.r12 = out[0]
        self.r13 = out[1]

    def _compile_jit(self, word: Word) -> Any:
        """Assemble an :asm word into executable memory and return a ctypes callable."""
        if Ks is None:
            raise ParseError("keystone-engine is required for JIT execution")
        definition = word.definition
        if not isinstance(definition, AsmDefinition):
            raise ParseError(f"word '{word.name}' has no asm body")
        asm_body = definition.body.strip("\n")
        is_coro = self._is_coroutine_asm(asm_body)

        bss = self._bss_symbols

        # Build wrapper
        lines: List[str] = []
        if is_coro:
            self._ensure_jit_save_buf()
            sb = self._jit_save_buf_addr
            # Use register-indirect addressing: x86-64 mov [disp],reg only
            # supports 32-bit displacement -- sb is a 64-bit heap address.
            lines.extend([
                "_ct_entry:",
                f"    mov rax, {sb}",        # load save buffer base
                "    mov [rax], rbx",
                "    mov [rax + 8], r12",
                "    mov [rax + 16], r13",
                "    mov [rax + 24], r14",
                "    mov [rax + 32], r15",
                "    mov [rax + 40], rdx",   # output ptr
                "    mov r12, rdi",
                "    mov r13, rsi",
                # Replace return address with trampoline
                "    pop rcx",
                "    mov [rax + 48], rcx",   # save ctypes return addr
                "    lea rcx, [rip + _ct_trampoline]",
                "    push rcx",
            ])
        else:
            # Standard wrapper: save callee-saved regs on stack
            lines.extend([
                "_ct_entry:",
                "    push rbx",
                "    push r12",
                "    push r13",
                "    push r14",
                "    push r15",
                "    sub rsp, 16",       # align + room for output ptr
                "    mov [rsp], rdx",    # save output-struct pointer
                "    mov r12, rdi",      # data stack
                "    mov r13, rsi",      # return stack
            ])

        # Patch asm body
        # Collect dot-prefixed local labels and build rename map for Keystone
        _local_labels: Set[str] = set()
        for raw_line in asm_body.splitlines():
            line = raw_line.strip()
            lm = _RE_LABEL_PAT.match(line)
            if lm and lm.group(1).startswith('.'):
                _local_labels.add(lm.group(1))

        for raw_line in asm_body.splitlines():
            line = raw_line.strip()
            if not line or line.startswith(";"):
                continue
            if line.startswith("extern"):
                continue  # strip extern declarations
            if line == "ret" and not is_coro:
                line = "jmp _ct_save"

            # Rename dot-prefixed local labels to Keystone-compatible names
            for lbl in _local_labels:
                line = re.sub(rf'(?<!\w){re.escape(lbl)}(?=\s|:|,|$|\]|\))',
                              '_jl' + lbl[1:], line)

            # Patch [rel SYMBOL] -> concrete address
            m = _RE_REL_PAT.search(line)
            if m and m.group(1) in bss:
                sym = m.group(1)
                addr = bss[sym]
                if line.lstrip().startswith("lea"):
                    # lea REG, [rel X] -> mov REG, addr
                    line = _RE_REL_PAT.sub(str(addr), line).replace("lea", "mov", 1)
                else:
                    # e.g. mov rax, [rel X] or mov byte [rel X], val
                    # Replace with push/mov-rax/substitute/pop trampoline
                    lines.append("    push rax")
                    lines.append(f"    mov rax, {addr}")
                    new_line = _RE_REL_PAT.sub("[rax]", line)
                    lines.append(f"    {new_line}")
                    lines.append("    pop rax")
                    continue
            # Convert NASM 'rel' to explicit rip-relative for Keystone
            if '[rel ' in line:
                line = line.replace('[rel ', '[rip + ')
            lines.append(f"    {line}")

        # Save/epilogue
        if is_coro:
            sb = self._jit_save_buf_addr
            lines.extend([
                "_ct_trampoline:",
                f"    mov rax, {sb}",        # reload save buffer base
                "    mov rcx, [rax + 40]",   # output ptr
                "    mov [rcx], r12",
                "    mov [rcx + 8], r13",
                "    mov rbx, [rax]",
                "    mov r12, [rax + 8]",
                "    mov r13, [rax + 16]",
                "    mov r14, [rax + 24]",
                "    mov r15, [rax + 32]",
                "    mov rcx, [rax + 48]",   # ctypes return addr
                "    push rcx",
                "    ret",
            ])
        else:
            lines.extend([
                "_ct_save:",
                "    mov rax, [rsp]",      # output-struct pointer
                "    mov [rax], r12",
                "    mov [rax + 8], r13",
                "    add rsp, 16",
                "    pop r15",
                "    pop r14",
                "    pop r13",
                "    pop r12",
                "    pop rbx",
                "    ret",
            ])

        ptr = self._jit_assemble_page(lines, word.name)
        if CompileTimeVM._JIT_FUNC_TYPE is None:
            CompileTimeVM._JIT_FUNC_TYPE = ctypes.CFUNCTYPE(None, ctypes.c_int64, ctypes.c_int64, ctypes.c_void_p)
        func = self._JIT_FUNC_TYPE(ptr)
        return func

    def _jit_assemble_page(self, lines: List[str], word_name: str) -> int:
        """Assemble lines into an RWX page and return its address."""
        def _norm(l: str) -> str:
            l = l.split(";", 1)[0].rstrip()
            for sz in ("qword", "dword", "word", "byte"):
                l = l.replace(f"{sz} [", f"{sz} ptr [")
            return l
        normalized = [_norm(l) for l in lines if _norm(l).strip()]
        ks = Ks(KS_ARCH_X86, KS_MODE_64)
        try:
            encoding, _ = ks.asm("\n".join(normalized))
        except KsError as exc:
            debug_txt = "\n".join(normalized)
            raise ParseError(
                f"JIT assembly failed for '{word_name}': {exc}\n--- asm ---\n{debug_txt}\n--- end ---"
            ) from exc
        if encoding is None:
            raise ParseError(f"JIT produced no code for '{word_name}'")
        code = bytes(encoding)
        page_size = max(len(code), 4096)
        _libc = ctypes.CDLL(None, use_errno=True)
        _libc.mmap.restype = ctypes.c_void_p
        _libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int,
                                ctypes.c_int, ctypes.c_int, ctypes.c_long]
        PROT_RWX = 0x1 | 0x2 | 0x4
        MAP_PRIVATE = 0x02
        MAP_ANONYMOUS = 0x20
        ptr = _libc.mmap(None, page_size, PROT_RWX,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)
        if ptr == ctypes.c_void_p(-1).value or ptr is None:
            raise RuntimeError(f"mmap failed for JIT code ({page_size} bytes)")
        ctypes.memmove(ptr, code, len(code))
        self._jit_code_pages.append((ptr, page_size))
        return ptr

    def _compile_raw_jit(self, word: Word) -> int:
        """Compile a word into executable memory without a wrapper.

        Returns the native code address (not a ctypes callable).
        For AsmDefinition: just the asm body (patched, no wrapper).
        For Definition: compiled body with ret (no entry/exit wrapper).
        """
        cache_key = f"__raw_jit_{word.name}"
        cached = self._jit_cache.get(cache_key)
        if cached is not None:
            return cached

        definition = word.definition
        bss = self._bss_symbols
        lines: List[str] = []

        if isinstance(definition, AsmDefinition):
            asm_body = definition.body.strip("\n")
            is_coro = self._is_coroutine_asm(asm_body)
            _local_labels: Set[str] = set()
            for raw_line in asm_body.splitlines():
                line = raw_line.strip()
                lm = _RE_LABEL_PAT.match(line)
                if lm and lm.group(1).startswith('.'):
                    _local_labels.add(lm.group(1))
            for raw_line in asm_body.splitlines():
                line = raw_line.strip()
                if not line or line.startswith(";") or line.startswith("extern"):
                    continue
                # Keep ret as-is (raw functions return normally)
                for lbl in _local_labels:
                    line = re.sub(rf'(?<!\w){re.escape(lbl)}(?=\s|:|,|$|\]|\))',
                                  '_jl' + lbl[1:], line)
                m = _RE_REL_PAT.search(line)
                if m and m.group(1) in bss:
                    sym = m.group(1)
                    addr = bss[sym]
                    if line.lstrip().startswith("lea"):
                        line = _RE_REL_PAT.sub(str(addr), line).replace("lea", "mov", 1)
                    else:
                        lines.append("    push rax")
                        lines.append(f"    mov rax, {addr}")
                        new_line = _RE_REL_PAT.sub("[rax]", line)
                        lines.append(f"    {new_line}")
                        lines.append("    pop rax")
                        continue
                if '[rel ' in line:
                    line = line.replace('[rel ', '[rip + ')
                lines.append(f"    {line}")
            lines.append("    ret")
        elif isinstance(definition, Definition):
            lines.extend(self._compile_raw_definition_lines(word, definition))
        else:
            raise ParseError(f"cannot raw-JIT word '{word.name}'")

        ptr = self._jit_assemble_page(lines, f"raw_{word.name}")
        self._jit_cache[cache_key] = ptr
        return ptr

    def _compile_raw_definition_lines(self, word: Word, defn: Definition) -> List[str]:
        """Compile a Definition body to raw JIT asm lines (no wrapper, just body + ret)."""
        self._resolve_words_in_body(defn)
        bss = self._bss_symbols
        body = defn.body
        lines: List[str] = []
        uid = id(defn)
        lc = [0]
        def _nl(prefix: str) -> str:
            lc[0] += 1
            return f"_rj{uid}_{prefix}_{lc[0]}"

        # Label maps
        label_map: Dict[str, str] = {}
        for node in body:
            if node._opcode == OP_LABEL:
                ln = str(node.data)
                if ln not in label_map:
                    label_map[ln] = _nl("lbl")

        for_map: Dict[int, Tuple[str, str]] = {}
        fstack: List[Tuple[int, str, str]] = []
        for idx, node in enumerate(body):
            if node._opcode == OP_FOR_BEGIN:
                bl, el = _nl("for_top"), _nl("for_end")
                fstack.append((idx, bl, el))
            elif node._opcode == OP_FOR_END:
                if fstack:
                    bi, bl, el = fstack.pop()
                    for_map[bi] = (bl, el)
                    for_map[idx] = (bl, el)

        ba_map: Dict[int, Tuple[str, str]] = {}
        bstack: List[Tuple[int, str, str]] = []
        for idx, node in enumerate(body):
            if node._opcode == OP_WORD and node._word_ref is None:
                nm = node.data
                if nm == "begin":
                    bl, al = _nl("begin"), _nl("again")
                    bstack.append((idx, bl, al))
                elif nm == "again":
                    if bstack:
                        bi, bl, al = bstack.pop()
                        ba_map[bi] = (bl, al)
                        ba_map[idx] = (bl, al)

        begin_rt: List[Tuple[str, str]] = []
        out2_addr = self._jit_out2_addr

        for idx, node in enumerate(body):
            opc = node._opcode
            if opc == OP_LITERAL:
                data = node.data
                if isinstance(data, str):
                    addr, length = self.memory.store_string(data)
                    lines.append("    sub r12, 16")
                    lines.append(f"    mov rax, {addr}")
                    lines.append("    mov [r12 + 8], rax")
                    if -0x80000000 <= length <= 0x7FFFFFFF:
                        lines.append(f"    mov qword [r12], {length}")
                    else:
                        lines.append(f"    mov rax, {length}")
                        lines.append("    mov [r12], rax")
                else:
                    val = int(data) & 0xFFFFFFFFFFFFFFFF
                    if val >= 0x8000000000000000:
                        val -= 0x10000000000000000
                    lines.append("    sub r12, 8")
                    if -0x80000000 <= val <= 0x7FFFFFFF:
                        lines.append(f"    mov qword [r12], {val}")
                    else:
                        lines.append(f"    mov rax, {val}")
                        lines.append("    mov [r12], rax")
            elif opc == OP_WORD:
                wref = node._word_ref
                if wref is None:
                    name = node.data
                    if name == "begin":
                        pair = ba_map.get(idx)
                        if pair:
                            begin_rt.append(pair)
                            lines.append(f"{pair[0]}:")
                    elif name == "again":
                        pair = ba_map.get(idx)
                        if pair:
                            lines.append(f"    jmp {pair[0]}")
                            lines.append(f"{pair[1]}:")
                            if begin_rt and begin_rt[-1] == pair:
                                begin_rt.pop()
                    elif name == "continue":
                        if begin_rt:
                            lines.append(f"    jmp {begin_rt[-1][0]}")
                    elif name == "exit":
                        lines.append("    ret")
                    continue

                wd = wref.definition
                if isinstance(wd, AsmDefinition):
                    if self._is_coroutine_asm(wd.body.strip("\n")):
                        # Coroutine asm: call raw JIT instead of inlining
                        raw_addr = self._compile_raw_jit(wref)
                        lines.append(f"    mov rax, {raw_addr}")
                        lines.append("    call rax")
                    else:
                        # Inline asm body
                        prefix = _nl(f"a{idx}")
                        _local_labels: Set[str] = set()
                        asm_txt = wd.body.strip("\n")
                        has_ret = False
                        for raw_line in asm_txt.splitlines():
                            ln = raw_line.strip()
                            lm = _RE_LABEL_PAT.match(ln)
                            if lm:
                                _local_labels.add(lm.group(1))
                            if ln == "ret":
                                has_ret = True
                        end_lbl = f"{prefix}_end" if has_ret else None
                        for raw_line in asm_txt.splitlines():
                            ln = raw_line.strip()
                            if not ln or ln.startswith(";") or ln.startswith("extern"):
                                continue
                            if ln == "ret":
                                lines.append(f"    jmp {end_lbl}")
                                continue
                            for lbl in _local_labels:
                                ln = re.sub(rf'(?<!\w){re.escape(lbl)}(?=\s|:|,|$|\]|\))',
                                            prefix + lbl, ln)
                            m = _RE_REL_PAT.search(ln)
                            if m and m.group(1) in bss:
                                sym = m.group(1)
                                addr = bss[sym]
                                if ln.lstrip().startswith("lea"):
                                    ln = _RE_REL_PAT.sub(str(addr), ln).replace("lea", "mov", 1)
                                else:
                                    lines.append("    push rax")
                                    lines.append(f"    mov rax, {addr}")
                                    new_ln = _RE_REL_PAT.sub("[rax]", ln)
                                    lines.append(f"    {new_ln}")
                                    lines.append("    pop rax")
                                    continue
                            if '[rel ' in ln:
                                ln = ln.replace('[rel ', '[')
                            lines.append(f"    {ln}")
                        if end_lbl is not None:
                            lines.append(f"{end_lbl}:")
                elif isinstance(wd, Definition):
                    # Call standard JIT'd sub-definition via output buffer
                    ck = f"__defn_jit_{wref.name}"
                    if ck not in self._jit_cache:
                        sub = self._compile_definition_jit(wref)
                        if sub is None:
                            # Can't JIT; fall back to raw JIT of the sub-word
                            raw_addr = self._compile_raw_jit(wref)
                            lines.append(f"    mov rax, {raw_addr}")
                            lines.append("    call rax")
                            continue
                        self._jit_cache[ck] = sub
                        self._jit_cache[ck + "_addr"] = ctypes.cast(sub, ctypes.c_void_p).value
                    func_addr = self._jit_cache.get(ck + "_addr")
                    if func_addr is None:
                        raise ParseError(f"raw JIT: missing JIT for '{wref.name}'")
                    lines.append("    mov rdi, r12")
                    lines.append("    mov rsi, r13")
                    lines.append(f"    mov rdx, {out2_addr}")
                    lines.append(f"    mov rax, {func_addr}")
                    lines.append("    call rax")
                    lines.append(f"    mov rax, {out2_addr}")
                    lines.append("    mov r12, [rax]")
                    lines.append("    mov r13, [rax + 8]")
                else:
                    raise ParseError(f"raw JIT: unsupported word '{wref.name}'")
            elif opc == OP_WORD_PTR:
                # Word pointer: push the raw JIT address of the target
                target_name = str(node.data)
                tw = self.dictionary.lookup(target_name)
                if tw is None:
                    raise ParseError(f"raw JIT: unknown word '{target_name}'")
                raw_addr = self._compile_raw_jit(tw)
                lines.append("    sub r12, 8")
                lines.append(f"    mov rax, {raw_addr}")
                lines.append("    mov [r12], rax")
            elif opc == OP_FOR_BEGIN:
                pair = for_map.get(idx)
                if pair is None:
                    raise ParseError("raw JIT: unmatched for")
                bl, el = pair
                lines.append("    mov rax, [r12]")
                lines.append("    add r12, 8")
                lines.append("    cmp rax, 0")
                lines.append(f"    jle {el}")
                lines.append("    sub r13, 8")
                lines.append("    mov [r13], rax")
                lines.append(f"{bl}:")
            elif opc == OP_FOR_END:
                pair = for_map.get(idx)
                if pair is None:
                    raise ParseError("raw JIT: unmatched for end")
                bl, el = pair
                lines.append("    dec qword [r13]")
                lines.append("    cmp qword [r13], 0")
                lines.append(f"    jg {bl}")
                lines.append("    add r13, 8")
                lines.append(f"{el}:")
            elif opc == OP_BRANCH_ZERO:
                ln = str(node.data)
                al = label_map.get(ln)
                if al is None:
                    raise ParseError("raw JIT: unknown branch target")
                lines.append("    mov rax, [r12]")
                lines.append("    add r12, 8")
                lines.append("    test rax, rax")
                lines.append(f"    jz {al}")
            elif opc == OP_JUMP:
                ln = str(node.data)
                al = label_map.get(ln)
                if al is None:
                    raise ParseError("raw JIT: unknown jump target")
                lines.append(f"    jmp {al}")
            elif opc == OP_LABEL:
                ln = str(node.data)
                al = label_map.get(ln)
                if al is None:
                    raise ParseError("raw JIT: unknown label")
                lines.append(f"{al}:")
            else:
                raise ParseError(f"raw JIT: unsupported opcode {opc} in '{word.name}'")

        lines.append("    ret")
        return lines

    # -- Whole-word JIT: compile Definition bodies to native code -----------

    def _compile_definition_jit(self, word: Word) -> Any:
        """JIT-compile a regular Definition body into native x86-64 code.

        Returns a ctypes callable or None if the definition cannot be JIT'd.
        """
        defn = word.definition
        if not isinstance(defn, Definition):
            return None
        if Ks is None:
            return None

        # Guard against infinite recursion (recursive words)
        compiling = getattr(self, "_djit_compiling", None)
        if compiling is None:
            compiling = set()
            self._djit_compiling = compiling
        if word.name in compiling:
            return None  # recursive word, can't JIT
        compiling.add(word.name)
        try:
            return self._compile_definition_jit_inner(word, defn)
        finally:
            compiling.discard(word.name)

    def _compile_definition_jit_inner(self, word: Word, defn: Definition) -> Any:
        # Ensure word references are resolved
        self._resolve_words_in_body(defn)

        body = defn.body
        bss = self._bss_symbols

        # Pre-scan: bail if any op is unsupported
        for node in body:
            opc = node._opcode
            if opc == OP_LITERAL:
                if isinstance(node.data, str):
                    return None
            elif opc == OP_WORD:
                wref = node._word_ref
                if wref is None:
                    name = node.data
                    if name not in ("begin", "again", "continue", "exit"):
                        return None
                elif wref.runtime_intrinsic is not None:
                    return None
                elif getattr(wref, "is_extern", False):
                    return None  # extern words need _call_extern_ct
                else:
                    wd = wref.definition
                    if wd is None:
                        return None
                    if not isinstance(wd, (AsmDefinition, Definition)):
                        return None
                    if isinstance(wd, Definition):
                        ck = f"__defn_jit_{wref.name}"
                        if ck not in self._jit_cache:
                            sub = self._compile_definition_jit(wref)
                            if sub is None:
                                return None
                            self._jit_cache[ck] = sub
                            self._jit_cache[ck + "_addr"] = ctypes.cast(sub, ctypes.c_void_p).value
            elif opc in (OP_FOR_BEGIN, OP_FOR_END, OP_BRANCH_ZERO, OP_JUMP, OP_LABEL):
                pass
            else:
                return None

        uid = id(defn)
        lc = [0]
        def _nl(prefix: str) -> str:
            lc[0] += 1
            return f"_dj{uid}_{prefix}_{lc[0]}"

        # Build label maps
        label_map: Dict[str, str] = {}
        for node in body:
            if node._opcode == OP_LABEL:
                ln = str(node.data)
                if ln not in label_map:
                    label_map[ln] = _nl("lbl")

        # For-loop pairing
        for_map: Dict[int, Tuple[str, str]] = {}
        fstack: List[Tuple[int, str, str]] = []
        for idx, node in enumerate(body):
            if node._opcode == OP_FOR_BEGIN:
                bl, el = _nl("for_top"), _nl("for_end")
                fstack.append((idx, bl, el))
            elif node._opcode == OP_FOR_END:
                if fstack:
                    bi, bl, el = fstack.pop()
                    for_map[bi] = (bl, el)
                    for_map[idx] = (bl, el)

        # begin/again pairing
        ba_map: Dict[int, Tuple[str, str]] = {}
        bstack: List[Tuple[int, str, str]] = []
        for idx, node in enumerate(body):
            if node._opcode == OP_WORD and node._word_ref is None:
                nm = node.data
                if nm == "begin":
                    bl, al = _nl("begin"), _nl("again")
                    bstack.append((idx, bl, al))
                elif nm == "again":
                    if bstack:
                        bi, bl, al = bstack.pop()
                        ba_map[bi] = (bl, al)
                        ba_map[idx] = (bl, al)

        lines: List[str] = []
        # Entry wrapper
        lines.extend([
            "_ct_entry:",
            "    push rbx",
            "    push r12",
            "    push r13",
            "    push r14",
            "    push r15",
            "    sub rsp, 16",
            "    mov [rsp], rdx",
            "    mov r12, rdi",
            "    mov r13, rsi",
        ])

        begin_rt: List[Tuple[str, str]] = []

        def _patch_asm_body(asm_body: str, prefix: str) -> List[str]:
            """Patch an asm body for inlining: uniquify labels, patch [rel]."""
            result: List[str] = []
            local_labels: Set[str] = set()
            has_ret = False
            for raw_line in asm_body.splitlines():
                line = raw_line.strip()
                lm = _RE_LABEL_PAT.match(line)
                if lm:
                    local_labels.add(lm.group(1))
                if line == "ret":
                    has_ret = True
            end_label = f"{prefix}_end" if has_ret else None
            for raw_line in asm_body.splitlines():
                line = raw_line.strip()
                if not line or line.startswith(";") or line.startswith("extern"):
                    continue
                if line == "ret":
                    result.append(f"    jmp {end_label}")
                    continue
                for label in local_labels:
                    line = re.sub(rf'(?<!\w){re.escape(label)}(?=\s|:|,|$|\]|\))', prefix + label, line)
                m = _RE_REL_PAT.search(line)
                if m and m.group(1) in bss:
                    sym = m.group(1)
                    addr = bss[sym]
                    if line.lstrip().startswith("lea"):
                        line = _RE_REL_PAT.sub(str(addr), line).replace("lea", "mov", 1)
                    else:
                        result.append("    push rax")
                        result.append(f"    mov rax, {addr}")
                        new_line = _RE_REL_PAT.sub("[rax]", line)
                        result.append(f"    {new_line}")
                        result.append("    pop rax")
                        continue
                # Convert NASM 'rel' to explicit rip-relative for Keystone
                if '[rel ' in line:
                    line = line.replace('[rel ', '[rip + ')
                result.append(f"    {line}")
            if end_label is not None:
                result.append(f"{end_label}:")
            return result

        for idx, node in enumerate(body):
            opc = node._opcode

            if opc == OP_LITERAL:
                val = int(node.data) & 0xFFFFFFFFFFFFFFFF
                if val >= 0x8000000000000000:
                    val -= 0x10000000000000000
                lines.append("    sub r12, 8")
                if -0x80000000 <= val <= 0x7FFFFFFF:
                    lines.append(f"    mov qword [r12], {val}")
                else:
                    lines.append(f"    mov rax, {val}")
                    lines.append("    mov [r12], rax")

            elif opc == OP_WORD:
                wref = node._word_ref
                if wref is None:
                    name = node.data
                    if name == "begin":
                        pair = ba_map.get(idx)
                        if pair:
                            begin_rt.append(pair)
                            lines.append(f"{pair[0]}:")
                    elif name == "again":
                        pair = ba_map.get(idx)
                        if pair:
                            lines.append(f"    jmp {pair[0]}")
                            lines.append(f"{pair[1]}:")
                            if begin_rt and begin_rt[-1] == pair:
                                begin_rt.pop()
                    elif name == "continue":
                        if begin_rt:
                            lines.append(f"    jmp {begin_rt[-1][0]}")
                    elif name == "exit":
                        if begin_rt:
                            pair = begin_rt.pop()
                            lines.append(f"    jmp {pair[1]}")
                        else:
                            lines.append("    jmp _ct_save")
                    continue

                wd = wref.definition
                if isinstance(wd, AsmDefinition):
                    prefix = _nl(f"a{idx}")
                    lines.extend(_patch_asm_body(wd.body.strip("\n"), prefix))
                elif isinstance(wd, Definition):
                    # Call JIT'd sub-definition
                    ck = f"__defn_jit_{wref.name}"
                    func_addr = self._jit_cache.get(ck + "_addr")
                    if func_addr is None:
                        return None  # should have been pre-compiled above
                    # Save & call: rdi=r12, rsi=r13, rdx=output_ptr
                    lines.append("    mov rax, [rsp]")
                    lines.append("    mov rdi, r12")
                    lines.append("    mov rsi, r13")
                    lines.append("    mov rdx, rax")
                    lines.append(f"    mov rax, {func_addr}")
                    lines.append("    call rax")
                    # Restore r12/r13 from output struct
                    lines.append("    mov rax, [rsp]")
                    lines.append("    mov r12, [rax]")
                    lines.append("    mov r13, [rax + 8]")

            elif opc == OP_FOR_BEGIN:
                pair = for_map.get(idx)
                if pair is None:
                    return None
                bl, el = pair
                lines.append("    mov rax, [r12]")
                lines.append("    add r12, 8")
                lines.append("    cmp rax, 0")
                lines.append(f"    jle {el}")
                lines.append("    sub r13, 8")
                lines.append("    mov [r13], rax")
                lines.append(f"{bl}:")

            elif opc == OP_FOR_END:
                pair = for_map.get(idx)
                if pair is None:
                    return None
                bl, el = pair
                lines.append("    dec qword [r13]")
                lines.append("    cmp qword [r13], 0")
                lines.append(f"    jg {bl}")
                lines.append("    add r13, 8")
                lines.append(f"{el}:")

            elif opc == OP_BRANCH_ZERO:
                ln = str(node.data)
                al = label_map.get(ln)
                if al is None:
                    return None
                lines.append("    mov rax, [r12]")
                lines.append("    add r12, 8")
                lines.append("    test rax, rax")
                lines.append(f"    jz {al}")

            elif opc == OP_JUMP:
                ln = str(node.data)
                al = label_map.get(ln)
                if al is None:
                    return None
                lines.append(f"    jmp {al}")

            elif opc == OP_LABEL:
                ln = str(node.data)
                al = label_map.get(ln)
                if al is None:
                    return None
                lines.append(f"{al}:")

        # Epilog
        lines.extend([
            "_ct_save:",
            "    mov rax, [rsp]",
            "    mov [rax], r12",
            "    mov [rax + 8], r13",
            "    add rsp, 16",
            "    pop r15",
            "    pop r14",
            "    pop r13",
            "    pop r12",
            "    pop rbx",
            "    ret",
        ])

        def _norm(l: str) -> str:
            l = l.split(";", 1)[0].rstrip()
            for sz in ("qword", "dword", "word", "byte"):
                l = l.replace(f"{sz} [", f"{sz} ptr [")
            return l
        normalized = [_norm(l) for l in lines if _norm(l).strip()]

        ks = Ks(KS_ARCH_X86, KS_MODE_64)
        try:
            encoding, _ = ks.asm("\n".join(normalized))
        except KsError:
            return None
        if encoding is None:
            return None

        code = bytes(encoding)
        page_size = max(len(code), 4096)
        _libc = ctypes.CDLL(None, use_errno=True)
        _libc.mmap.restype = ctypes.c_void_p
        _libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int,
                                ctypes.c_int, ctypes.c_int, ctypes.c_long]
        PROT_RWX = 0x1 | 0x2 | 0x4
        MAP_PRIVATE = 0x02
        MAP_ANONYMOUS = 0x20
        ptr = _libc.mmap(None, page_size, PROT_RWX,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)
        if ptr == ctypes.c_void_p(-1).value or ptr is None:
            return None
        ctypes.memmove(ptr, code, len(code))
        self._jit_code_pages.append((ptr, page_size))
        if CompileTimeVM._JIT_FUNC_TYPE is None:
            CompileTimeVM._JIT_FUNC_TYPE = ctypes.CFUNCTYPE(None, ctypes.c_int64, ctypes.c_int64, ctypes.c_void_p)
        return self._JIT_FUNC_TYPE(ptr)

    # -- Old non-runtime asm execution (kept for non-runtime CT mode) -------

    def _run_asm_definition(self, word: Word) -> None:
        definition = word.definition
        if Ks is None:
            raise ParseError("keystone is required for compile-time :asm execution; install keystone-engine")
        if not isinstance(definition, AsmDefinition):  # pragma: no cover - defensive
            raise ParseError(f"word '{word.name}' has no asm body")
        asm_body = definition.body.strip("\n")

        # Determine whether this asm expects string semantics via declared effects.
        string_mode = WORD_EFFECT_STRING_IO in definition.effects

        handles = self._handles

        non_int_data = any(not isinstance(v, int) for v in self.stack)
        non_int_return = any(not isinstance(v, int) for v in self.return_stack)

        # Collect all strings present on data and return stacks so we can point
        # puts() at a real buffer and pass its range check (data_start..data_end).
        strings: List[str] = []
        if string_mode:
            for v in self.stack + self.return_stack:
                if isinstance(v, str):
                    strings.append(v)
        data_blob = b""
        string_addrs: Dict[str, Tuple[int, int]] = {}
        if strings:
            offset = 0
            parts: List[bytes] = []
            seen: Dict[str, Tuple[int, int]] = {}
            for s in strings:
                if s in seen:
                    string_addrs[s] = seen[s]
                    continue
                encoded = s.encode("utf-8") + b"\x00"
                parts.append(encoded)
                addr = offset
                length = len(encoded) - 1
                seen[s] = (addr, length)
                string_addrs[s] = (addr, length)
                offset += len(encoded)
            data_blob = b"".join(parts)
        string_buffer: Optional[ctypes.Array[Any]] = None
        data_start = 0
        data_end = 0
        if data_blob:
            string_buffer = ctypes.create_string_buffer(data_blob)
            data_start = ctypes.addressof(string_buffer)
            data_end = data_start + len(data_blob)
            handles.refs.append(string_buffer)
            for s, (off, _len) in string_addrs.items():
                handles.objects[data_start + off] = s

        PRINT_BUF_BYTES = 128
        print_buffer = ctypes.create_string_buffer(PRINT_BUF_BYTES)
        handles.refs.append(print_buffer)
        print_buf = ctypes.addressof(print_buffer)

        wrapper_lines = []
        wrapper_lines.extend([
            "_ct_entry:",
            "    push rbx",
            "    push r12",
            "    push r13",
            "    push r14",
            "    push r15",
            "    mov r12, rdi",  # data stack pointer
            "    mov r13, rsi",  # return stack pointer
            "    mov r14, rdx",  # out ptr for r12
            "    mov r15, rcx",  # out ptr for r13
        ])
        if asm_body:
            patched_body = []
            # Build BSS symbol table for [rel X] -> concrete address substitution
            _bss_symbols: Dict[str, int] = {
                "data_start": data_start,
                "data_end": data_end,
                "print_buf": print_buf,
                "print_buf_end": print_buf + PRINT_BUF_BYTES,
            }
            if self.memory is not None:
                _bss_symbols.update({
                    "persistent": self.memory.persistent_addr,
                    "persistent_end": self.memory.persistent_addr + self.memory._persistent_size,
                })
            for line in asm_body.splitlines():
                line = line.strip()
                if line == "ret":
                    line = "jmp _ct_save"
                # Replace [rel SYMBOL] with concrete addresses
                m = _RE_REL_PAT.search(line)
                if m and m.group(1) in _bss_symbols:
                    sym = m.group(1)
                    addr = _bss_symbols[sym]
                    # lea REG, [rel X]  ->  mov REG, addr
                    if line.lstrip().startswith("lea"):
                        line = _RE_REL_PAT.sub(str(addr), line).replace("lea", "mov", 1)
                    else:
                        # For memory operands like mov byte [rel X], val
                        # replace [rel X] with [<addr>]
                        tmp_reg = "rax"
                        # Use a scratch register to hold the address
                        patched_body.append(f"push rax")
                        patched_body.append(f"mov rax, {addr}")
                        new_line = _RE_REL_PAT.sub("[rax]", line)
                        patched_body.append(new_line)
                        patched_body.append(f"pop rax")
                        continue
                # Convert NASM 'rel' to explicit rip-relative for Keystone
                if '[rel ' in line:
                    line = line.replace('[rel ', '[rip + ')
                patched_body.append(line)
            wrapper_lines.extend(patched_body)
        wrapper_lines.extend([
            "_ct_save:",
            "    mov [r14], r12",
            "    mov [r15], r13",
            "    pop r15",
            "    pop r14",
            "    pop r13",
            "    pop r12",
            "    pop rbx",
            "    ret",
        ])
        def _normalize_sizes(line: str) -> str:
            for size in ("qword", "dword", "word", "byte"):
                line = line.replace(f"{size} [", f"{size} ptr [")
            return line

        def _strip_comment(line: str) -> str:
            return line.split(";", 1)[0].rstrip()

        normalized_lines = []
        for raw in wrapper_lines:
            stripped = _strip_comment(raw)
            if not stripped.strip():
                continue
            normalized_lines.append(_normalize_sizes(stripped))
        ks = Ks(KS_ARCH_X86, KS_MODE_64)
        try:
            encoding, _ = ks.asm("\n".join(normalized_lines))
        except KsError as exc:
            debug_lines = "\n".join(normalized_lines)
            raise ParseError(
                f"keystone failed for word '{word.name}': {exc}\n--- asm ---\n{debug_lines}\n--- end asm ---"
            ) from exc
        if encoding is None:
            raise ParseError(
                f"keystone produced no code for word '{word.name}' (lines: {len(wrapper_lines)})"
            )

        code = bytes(encoding)
        import mmap
        code_buf = mmap.mmap(-1, len(code), prot=mmap.PROT_READ | mmap.PROT_WRITE | mmap.PROT_EXEC)
        code_buf.write(code)
        code_ptr = ctypes.addressof(ctypes.c_char.from_buffer(code_buf))
        func_type = ctypes.CFUNCTYPE(None, ctypes.c_uint64, ctypes.c_uint64, ctypes.c_uint64, ctypes.c_uint64)
        func = func_type(code_ptr)

        handles = self._handles

        def _marshal_stack(py_stack: List[Any]) -> Tuple[int, int, int, Any]:
            capacity = len(py_stack) + 16
            buffer = (ctypes.c_int64 * capacity)()
            base = ctypes.addressof(buffer)
            top = base + capacity * 8
            sp = top
            for value in py_stack:
                sp -= 8
                if isinstance(value, int):
                    ctypes.c_int64.from_address(sp).value = value
                elif isinstance(value, str):
                    if string_mode:
                        offset, strlen = string_addrs.get(value, (0, 0))
                        addr = data_start + offset if data_start else handles.store(value)
                        # puts expects (len, addr) with len on top
                        ctypes.c_int64.from_address(sp).value = addr
                        sp -= 8
                        ctypes.c_int64.from_address(sp).value = strlen
                    else:
                        ctypes.c_int64.from_address(sp).value = handles.store(value)
                else:
                    ctypes.c_int64.from_address(sp).value = handles.store(value)
            return sp, top, base, buffer

        # r12/r13 must point at the top element (or top of buffer if empty)
        buffers: List[Any] = []
        d_sp, d_top, d_base, d_buf = _marshal_stack(self.stack)
        buffers.append(d_buf)
        r_sp, r_top, r_base, r_buf = _marshal_stack(self.return_stack)
        buffers.append(r_buf)
        out_d = ctypes.c_uint64(0)
        out_r = ctypes.c_uint64(0)
        func(d_sp, r_sp, ctypes.addressof(out_d), ctypes.addressof(out_r))

        new_d = out_d.value
        new_r = out_r.value
        if not (d_base <= new_d <= d_top):
            raise ParseError(f"compile-time asm '{word.name}' corrupted data stack pointer")
        if not (r_base <= new_r <= r_top):
            raise ParseError(f"compile-time asm '{word.name}' corrupted return stack pointer")

        def _unmarshal_stack(sp: int, top: int, table: _CTHandleTable) -> List[Any]:
            if sp == top:
                return []
            values: List[Any] = []
            addr = top - 8
            while addr >= sp:
                raw = ctypes.c_int64.from_address(addr).value
                if raw in table.objects:
                    obj = table.objects[raw]
                    if isinstance(obj, str) and values and isinstance(values[-1], int):
                        # collapse (len, addr) pairs back into the original string
                        values.pop()
                        values.append(obj)
                    else:
                        values.append(obj)
                else:
                    values.append(raw)
                addr -= 8
            return values

        self.stack = _unmarshal_stack(new_d, d_top, handles)
        self.return_stack = _unmarshal_stack(new_r, r_top, handles)

    def _call_word_by_name(self, name: str) -> None:
        word = self.dictionary.lookup(name)
        if word is None:
            raise ParseError(f"unknown word '{name}' during compile-time execution")
        self._call_word(word)

    def _resolve_words_in_body(self, defn: Definition) -> None:
        """Pre-resolve word name -> Word objects on Op nodes (once per Definition)."""
        if defn._words_resolved:
            return
        lookup = self.dictionary.lookup
        for node in defn.body:
            if node._opcode == OP_WORD and node._word_ref is None:
                name = str(node.data)
                # Skip structural keywords that _execute_nodes handles inline
                if name not in ("begin", "again", "continue", "exit", "get_addr"):
                    ref = lookup(name)
                    if ref is not None:
                        node._word_ref = ref
        defn._words_resolved = True

    def _prepare_definition(self, defn: Definition) -> Tuple[Dict[str, int], Dict[int, int], Dict[int, int]]:
        """Return (label_positions, for_pairs, begin_pairs), cached on the Definition."""
        if defn._label_positions is None:
            lp, fp, bp = self._analyze_nodes(defn.body)
            defn._label_positions = lp
            defn._for_pairs = fp
            defn._begin_pairs = bp
        self._resolve_words_in_body(defn)
        if self.runtime_mode:
            # Merged JIT runs are a performance optimization, but have shown
            # intermittent instability on some environments. Keep them opt-in.
            if os.environ.get("L2_CT_MERGED_JIT", "0") == "1":
                if defn._merged_runs is None:
                    defn._merged_runs = self._find_mergeable_runs(defn)
            else:
                defn._merged_runs = {}
        return defn._label_positions, defn._for_pairs, defn._begin_pairs

    def _find_mergeable_runs(self, defn: Definition) -> Dict[int, Tuple[int, str]]:
        """Find consecutive runs of JIT-able asm word ops (length >= 2)."""
        runs: Dict[int, Tuple[int, str]] = {}
        body = defn.body
        n = len(body)
        i = 0
        while i < n:
            # Start of a potential run
            if body[i]._opcode == OP_WORD and body[i]._word_ref is not None:
                w = body[i]._word_ref
                if (w.runtime_intrinsic is None and isinstance(w.definition, AsmDefinition)
                        and not w.compile_time_override):
                    run_start = i
                    run_words = [w.name]
                    i += 1
                    while i < n and body[i]._opcode == OP_WORD and body[i]._word_ref is not None:
                        w2 = body[i]._word_ref
                        if (w2.runtime_intrinsic is None and isinstance(w2.definition, AsmDefinition)
                                and not w2.compile_time_override):
                            run_words.append(w2.name)
                            i += 1
                        else:
                            break
                    if len(run_words) >= 2:
                        key = f"__merged_{defn.name}_{run_start}_{i}"
                        runs[run_start] = (i, key)
                    continue
            i += 1
        return runs

    def _compile_merged_jit(self, words: List[Word], cache_key: str) -> Any:
        """Compile multiple asm word bodies into a single JIT function."""
        if Ks is None:
            raise ParseError("keystone-engine is required for JIT execution")

        bss = self._bss_symbols

        lines: List[str] = []
        # Entry wrapper (same as _compile_jit)
        lines.extend([
            "_ct_entry:",
            "    push rbx",
            "    push r12",
            "    push r13",
            "    push r14",
            "    push r15",
            "    sub rsp, 16",
            "    mov [rsp], rdx",
            "    mov r12, rdi",
            "    mov r13, rsi",
        ])

        # Append each word's asm body, with labels uniquified
        for word_idx, word in enumerate(words):
            defn = word.definition
            asm_body = defn.body.strip("\n")
            prefix = f"_m{word_idx}_"

            # Collect all labels in this asm body first
            local_labels: Set[str] = set()
            for raw_line in asm_body.splitlines():
                line = raw_line.strip()
                lm = _RE_LABEL_PAT.match(line)
                if lm:
                    local_labels.add(lm.group(1))

            for raw_line in asm_body.splitlines():
                line = raw_line.strip()
                if not line or line.startswith(";"):
                    continue
                if line.startswith("extern"):
                    continue
                if line == "ret":
                    # Last word: jmp to save; others: fall through
                    if word_idx < len(words) - 1:
                        continue  # just skip ret -> fall through
                    else:
                        line = "jmp _ct_save"

                # Replace all references to local labels with prefixed versions
                for label in local_labels:
                    # Use word-boundary replacement to avoid partial matches
                    line = re.sub(rf'(?<!\w){re.escape(label)}(?=\s|:|,|$|\]|\))', prefix + label, line)

                # Patch [rel SYMBOL] -> concrete address
                m = _RE_REL_PAT.search(line)
                if m and m.group(1) in bss:
                    sym = m.group(1)
                    addr = bss[sym]
                    if line.lstrip().startswith("lea"):
                        line = _RE_REL_PAT.sub(str(addr), line).replace("lea", "mov", 1)
                    else:
                        lines.append("    push rax")
                        lines.append(f"    mov rax, {addr}")
                        new_line = _RE_REL_PAT.sub("[rax]", line)
                        lines.append(f"    {new_line}")
                        lines.append("    pop rax")
                        continue
                # Convert NASM 'rel' to explicit rip-relative for Keystone
                if '[rel ' in line:
                    line = line.replace('[rel ', '[rip + ')
                lines.append(f"    {line}")

        # Save epilog
        lines.extend([
            "_ct_save:",
            "    mov rax, [rsp]",
            "    mov [rax], r12",
            "    mov [rax + 8], r13",
            "    add rsp, 16",
            "    pop r15",
            "    pop r14",
            "    pop r13",
            "    pop r12",
            "    pop rbx",
            "    ret",
        ])

        # Normalize for Keystone
        def _norm(l: str) -> str:
            l = l.split(";", 1)[0].rstrip()
            for sz in ("qword", "dword", "word", "byte"):
                l = l.replace(f"{sz} [", f"{sz} ptr [")
            return l
        normalized = [_norm(l) for l in lines if _norm(l).strip()]

        ks = Ks(KS_ARCH_X86, KS_MODE_64)
        try:
            encoding, _ = ks.asm("\n".join(normalized))
        except KsError as exc:
            debug_txt = "\n".join(normalized)
            raise ParseError(
                f"JIT merged assembly failed for '{cache_key}': {exc}\n--- asm ---\n{debug_txt}\n--- end ---"
            ) from exc
        if encoding is None:
            raise ParseError(f"JIT merged produced no code for '{cache_key}'")

        code = bytes(encoding)
        page_size = max(len(code), 4096)
        _libc = ctypes.CDLL(None, use_errno=True)
        _libc.mmap.restype = ctypes.c_void_p
        _libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int,
                                ctypes.c_int, ctypes.c_int, ctypes.c_long]
        PROT_RWX = 0x1 | 0x2 | 0x4
        MAP_PRIVATE = 0x02
        MAP_ANONYMOUS = 0x20
        ptr = _libc.mmap(None, page_size, PROT_RWX,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)
        if ptr == ctypes.c_void_p(-1).value or ptr is None:
            raise RuntimeError(f"mmap failed for merged JIT code ({page_size} bytes)")
        ctypes.memmove(ptr, code, len(code))
        self._jit_code_pages.append((ptr, page_size))
        return self._JIT_FUNC_TYPE(ptr)

    def _execute_nodes(self, nodes: Sequence[Op], *, _defn: Optional[Definition] = None) -> None:
        # Use cached analysis if we have one, else compute fresh
        if _defn is not None:
            label_positions, loop_pairs, begin_pairs = self._prepare_definition(_defn)
        else:
            label_positions, loop_pairs, begin_pairs = self._analyze_nodes(nodes)
        prev_loop_stack = self.loop_stack
        self.loop_stack = []
        begin_stack: List[Tuple[int, int]] = []

        # Local variable aliases for hot-path speedup
        _runtime_mode = self.runtime_mode
        _push = self.push
        _pop = self.pop
        _pop_int = self.pop_int
        _push_return = self.push_return
        _pop_return = self.pop_return
        _peek_return = self.peek_return
        _poke_return = self.poke_return
        _call_word = self._call_word
        _dict_lookup = self.dictionary.lookup

        # Hot JIT-call locals (avoid repeated attribute access)
        _jit_cache = self._jit_cache if _runtime_mode else None
        _jit_out2 = self._jit_out2 if _runtime_mode else None
        _jit_out2_addr = self._jit_out2_addr if _runtime_mode else 0
        _compile_jit = self._compile_jit if _runtime_mode else None
        _compile_merged = self._compile_merged_jit if _runtime_mode else None
        _AsmDef = AsmDefinition
        _merged_runs = (_defn._merged_runs if _defn is not None and _defn._merged_runs else None) if _runtime_mode else None

        # Inline ctypes for runtime mode — eliminates per-op function call overhead
        if _runtime_mode:
            _c_int64_at = ctypes.c_int64.from_address
            _I64_MASK = 0xFFFFFFFFFFFFFFFF
            _I64_SIGN = 0x8000000000000000
            _I64_WRAP = 0x10000000000000000
            _store_string = self.memory.store_string
        else:
            _c_int64_at = _I64_MASK = _I64_SIGN = _I64_WRAP = _store_string = None  # type: ignore[assignment]

        ip = 0
        prev_location = self.current_location
        # Local opcode constants (avoid global dict lookup)
        _OP_WORD = OP_WORD
        _OP_LITERAL = OP_LITERAL
        _OP_WORD_PTR = OP_WORD_PTR
        _OP_FOR_BEGIN = OP_FOR_BEGIN
        _OP_FOR_END = OP_FOR_END
        _OP_BRANCH_ZERO = OP_BRANCH_ZERO
        _OP_JUMP = OP_JUMP
        _OP_LABEL = OP_LABEL
        _OP_LIST_BEGIN = OP_LIST_BEGIN
        _OP_LIST_END = OP_LIST_END
        _OP_LIST_LITERAL = OP_LIST_LITERAL
        try:
            while ip < len(nodes):
                _node = nodes[ip]
                kind = _node._opcode

                if kind == _OP_WORD:
                    # Merged JIT run: call one combined function for N words
                    if _merged_runs is not None:
                        run_info = _merged_runs.get(ip)
                        if run_info is not None:
                            end_ip, cache_key = run_info
                            func = _jit_cache.get(cache_key)
                            if func is None:
                                hit_key = cache_key + "_hits"
                                hits = _jit_cache.get(hit_key, 0) + 1
                                _jit_cache[hit_key] = hits
                                if hits >= 2:
                                    run_words = [nodes[j]._word_ref for j in range(ip, end_ip)]
                                    func = _compile_merged(run_words, cache_key)
                                    _jit_cache[cache_key] = func
                            if func is not None:
                                func(self.r12, self.r13, _jit_out2_addr)
                                self.r12 = _jit_out2[0]
                                self.r13 = _jit_out2[1]
                                ip = end_ip
                                continue

                    # Fast path: pre-resolved word reference
                    word = _node._word_ref
                    if word is not None:
                        if _runtime_mode:
                            ri = word.runtime_intrinsic
                            if ri is not None:
                                self.call_stack.append(word.name)
                                try:
                                    ri(self)
                                except _CTVMJump as jmp:
                                    self.call_stack.pop()
                                    ip = jmp.target_ip
                                    continue
                                except _CTVMReturn:
                                    self.call_stack.pop()
                                    return
                                finally:
                                    if self.call_stack and self.call_stack[-1] == word.name:
                                        self.call_stack.pop()
                                ip += 1
                                continue
                            defn = word.definition
                            if isinstance(defn, _AsmDef):
                                wn = word.name
                                func = _jit_cache.get(wn)
                                if func is None:
                                    func = _compile_jit(word)
                                    _jit_cache[wn] = func
                                func(self.r12, self.r13, _jit_out2_addr)
                                self.r12 = _jit_out2[0]
                                self.r13 = _jit_out2[1]
                                ip += 1
                                continue
                            # Whole-word JIT for Definition bodies
                            ck = "__defn_jit_" + word.name
                            jf = _jit_cache.get(ck)
                            if jf is None and _jit_cache.get(ck + "_miss") is None:
                                jf = self._compile_definition_jit(word)
                                if jf is not None:
                                    _jit_cache[ck] = jf
                                    _jit_cache[ck + "_addr"] = ctypes.cast(jf, ctypes.c_void_p).value
                                else:
                                    _jit_cache[ck + "_miss"] = True
                            if jf is not None:
                                jf(self.r12, self.r13, _jit_out2_addr)
                                self.r12 = _jit_out2[0]
                                self.r13 = _jit_out2[1]
                                ip += 1
                                continue
                        self.current_location = _node.loc
                        try:
                            _call_word(word)
                        except _CTVMJump as jmp:
                            ip = jmp.target_ip
                            continue
                        except _CTVMReturn:
                            return
                        ip += 1
                        continue

                    # Structural keywords or unresolved words
                    name = _node.data
                    if name == "begin":
                        end_idx = begin_pairs.get(ip)
                        if end_idx is None:
                            raise ParseError("'begin' without matching 'again'")
                        begin_stack.append((ip, end_idx))
                        ip += 1
                        continue
                    if name == "again":
                        if not begin_stack or begin_stack[-1][1] != ip:
                            raise ParseError("'again' without matching 'begin'")
                        ip = begin_stack[-1][0] + 1
                        continue
                    if name == "continue":
                        if not begin_stack:
                            raise ParseError("'continue' outside begin/again loop")
                        ip = begin_stack[-1][0] + 1
                        continue
                    if name == "exit":
                        if begin_stack:
                            frame = begin_stack.pop()
                            ip = frame[1] + 1
                            continue
                        return
                    if _runtime_mode and name == "get_addr":
                        r12 = self.r12 - 8
                        _c_int64_at(r12).value = ip + 1
                        self.r12 = r12
                        ip += 1
                        continue
                    self.current_location = _node.loc
                    w = _dict_lookup(name)
                    if w is None:
                        raise ParseError(f"unknown word '{name}' during compile-time execution")
                    try:
                        _call_word(w)
                    except _CTVMJump as jmp:
                        ip = jmp.target_ip
                        continue
                    except _CTVMReturn:
                        return
                    ip += 1
                    continue

                if kind == _OP_LITERAL:
                    if _runtime_mode:
                        data = _node.data
                        if isinstance(data, str):
                            addr, length = _store_string(data)
                            r12 = self.r12 - 16
                            _c_int64_at(r12 + 8).value = addr
                            _c_int64_at(r12).value = length
                            self.r12 = r12
                        else:
                            r12 = self.r12 - 8
                            v = int(data) & _I64_MASK
                            if v >= _I64_SIGN:
                                v -= _I64_WRAP
                            _c_int64_at(r12).value = v
                            self.r12 = r12
                    else:
                        _push(_node.data)
                    ip += 1
                    continue

                if kind == _OP_FOR_END:
                    if _runtime_mode:
                        val = _c_int64_at(self.r13).value - 1
                        _c_int64_at(self.r13).value = val
                        if val > 0:
                            ip = self.loop_stack[-1] + 1
                            continue
                        self.r13 += 8
                    else:
                        if not self.loop_stack:
                            raise ParseError("'next' without matching 'for'")
                        val = _peek_return() - 1
                        _poke_return(val)
                        if val > 0:
                            ip = self.loop_stack[-1] + 1
                            continue
                        _pop_return()
                    self.loop_stack.pop()
                    ip += 1
                    continue

                if kind == _OP_FOR_BEGIN:
                    if _runtime_mode:
                        count = _c_int64_at(self.r12).value
                        self.r12 += 8
                        if count <= 0:
                            match = loop_pairs.get(ip)
                            if match is None:
                                raise ParseError("internal loop bookkeeping error")
                            ip = match + 1
                            continue
                        r13 = self.r13 - 8
                        v = count & _I64_MASK
                        if v >= _I64_SIGN:
                            v -= _I64_WRAP
                        _c_int64_at(r13).value = v
                        self.r13 = r13
                    else:
                        count = _pop_int()
                        if count <= 0:
                            match = loop_pairs.get(ip)
                            if match is None:
                                raise ParseError("internal loop bookkeeping error")
                            ip = match + 1
                            continue
                        _push_return(count)
                    self.loop_stack.append(ip)
                    ip += 1
                    continue

                if kind == _OP_BRANCH_ZERO:
                    if _runtime_mode:
                        condition = _c_int64_at(self.r12).value
                        self.r12 += 8
                        if condition == 0:
                            ip = label_positions.get(str(_node.data), -1)
                            if ip == -1:
                                raise ParseError(f"unknown label during compile-time execution")
                        else:
                            ip += 1
                    else:
                        condition = _pop()
                        if isinstance(condition, bool):
                            flag = condition
                        elif isinstance(condition, int):
                            flag = condition != 0
                        else:
                            raise ParseError("branch expects integer or boolean condition")
                        if not flag:
                            ip = label_positions.get(str(_node.data), -1)
                            if ip == -1:
                                raise ParseError(f"unknown label during compile-time execution")
                        else:
                            ip += 1
                    continue

                if kind == _OP_JUMP:
                    ip = label_positions.get(str(_node.data), -1)
                    if ip == -1:
                        raise ParseError(f"unknown label during compile-time execution")
                    continue

                if kind == _OP_LABEL:
                    ip += 1
                    continue

                if kind == _OP_WORD_PTR:
                    target_name = str(_node.data)
                    target_word = _dict_lookup(target_name)
                    if target_word is None:
                        raise ParseError(
                            f"unknown word '{target_name}' referenced by pointer during compile-time execution"
                        )
                    if _runtime_mode:
                        # Push native code address so asm can jmp/call it
                        addr = self._compile_raw_jit(target_word)
                        _push(addr)
                        # Store reverse mapping so _rt_jmp can resolve back to Word
                        self._handles.objects[addr] = target_word
                    else:
                        _push(self._handles.store(target_word))
                    ip += 1
                    continue

                if kind == _OP_LIST_BEGIN:
                    if _runtime_mode:
                        self._list_capture_stack.append(self.r12)
                    else:
                        self._list_capture_stack.append(len(self.stack))
                    ip += 1
                    continue

                if kind == _OP_LIST_LITERAL:
                    values = list(_node.data or [])
                    count = len(values)
                    buf_size = (count + 1) * 8
                    addr = self.memory.allocate(buf_size)
                    CTMemory.write_qword(addr, count)
                    for idx_item, val in enumerate(values):
                        CTMemory.write_qword(addr + 8 + idx_item * 8, int(val))
                    _push(addr)
                    ip += 1
                    continue

                if kind == _OP_LIST_END:
                    if not self._list_capture_stack:
                        raise ParseError("']' without matching '['")
                    saved = self._list_capture_stack.pop()
                    if _runtime_mode:
                        items: List[int] = []
                        ptr = saved - 8
                        while ptr >= self.r12:
                            items.append(_c_int64_at(ptr).value)
                            ptr -= 8
                        self.r12 = saved
                    else:
                        items = self.stack[saved:]
                        del self.stack[saved:]
                    count = len(items)
                    buf_size = (count + 1) * 8
                    addr = self.memory.allocate(buf_size)
                    CTMemory.write_qword(addr, count)
                    for idx_item, val in enumerate(items):
                        CTMemory.write_qword(addr + 8 + idx_item * 8, val)
                    _push(addr)
                    ip += 1
                    continue

                self.current_location = _node.loc
                raise ParseError(f"unsupported compile-time op (opcode={kind})")
        finally:
            self.current_location = prev_location
            self.loop_stack = prev_loop_stack

    def _analyze_nodes(self, nodes: Sequence[Op]) -> Tuple[Dict[str, int], Dict[int, int], Dict[int, int]]:
        """Single-pass analysis: returns (label_positions, for_pairs, begin_pairs)."""
        label_positions: Dict[str, int] = {}
        for_pairs: Dict[int, int] = {}
        begin_pairs: Dict[int, int] = {}
        for_stack: List[int] = []
        begin_stack: List[int] = []
        for idx, node in enumerate(nodes):
            opc = node._opcode
            if opc == OP_LABEL:
                label_positions[str(node.data)] = idx
            elif opc == OP_FOR_BEGIN:
                for_stack.append(idx)
            elif opc == OP_FOR_END:
                if not for_stack:
                    raise ParseError("'next' without matching 'for'")
                begin_idx = for_stack.pop()
                for_pairs[begin_idx] = idx
                for_pairs[idx] = begin_idx
            elif opc == OP_WORD:
                d = node.data
                if d == "begin":
                    begin_stack.append(idx)
                elif d == "again":
                    if not begin_stack:
                        raise ParseError("'again' without matching 'begin'")
                    begin_idx = begin_stack.pop()
                    begin_pairs[begin_idx] = idx
                    begin_pairs[idx] = begin_idx
        if for_stack:
            raise ParseError("'for' without matching 'next'")
        if begin_stack:
            raise ParseError("'begin' without matching 'again'")
        return label_positions, for_pairs, begin_pairs

    def _label_positions(self, nodes: Sequence[Op]) -> Dict[str, int]:
        positions: Dict[str, int] = {}
        for idx, node in enumerate(nodes):
            if node._opcode == OP_LABEL:
                positions[str(node.data)] = idx
        return positions

    def _for_pairs(self, nodes: Sequence[Op]) -> Dict[int, int]:
        stack: List[int] = []
        pairs: Dict[int, int] = {}
        for idx, node in enumerate(nodes):
            if node._opcode == OP_FOR_BEGIN:
                stack.append(idx)
            elif node._opcode == OP_FOR_END:
                if not stack:
                    raise ParseError("'next' without matching 'for'")
                begin_idx = stack.pop()
                pairs[begin_idx] = idx
                pairs[idx] = begin_idx
        if stack:
            raise ParseError("'for' without matching 'next'")
        return pairs

    def _begin_pairs(self, nodes: Sequence[Op]) -> Dict[int, int]:
        stack: List[int] = []
        pairs: Dict[int, int] = {}
        for idx, node in enumerate(nodes):
            if node._opcode == OP_WORD and node.data == "begin":
                stack.append(idx)
            elif node._opcode == OP_WORD and node.data == "again":
                if not stack:
                    raise ParseError("'again' without matching 'begin'")
                begin_idx = stack.pop()
                pairs[begin_idx] = idx
                pairs[idx] = begin_idx
        if stack:
            raise ParseError("'begin' without matching 'again'")
        return pairs

    def _jump_to_label(self, labels: Dict[str, int], target: str) -> int:
        if target not in labels:
            raise ParseError(f"unknown label '{target}' during compile-time execution")
        return labels[target]


# ---------------------------------------------------------------------------
# NASM Emitter
# ---------------------------------------------------------------------------


class Emission:
    __slots__ = ('text', 'data', 'bss')

    def __init__(self, text: List[str] = None, data: List[str] = None, bss: List[str] = None) -> None:
        self.text = text if text is not None else []
        self.data = data if data is not None else []
        self.bss = bss if bss is not None else []

    def snapshot(self) -> str:
        parts: List[str] = []
        if self.text:
            parts.extend(["section .text", *self.text])
        if self.data:
            parts.extend(["section .data", *self.data])
        if self.bss:
            parts.extend(["section .bss", *self.bss])
        parts.append("section .note.GNU-stack noalloc noexec nowrite")
        return "\n".join(parts)


class FunctionEmitter:
    """Utility for emitting per-word assembly."""

    def __init__(self, text: List[str], debug_enabled: bool = False) -> None:
        self.text = text
        self.debug_enabled = debug_enabled
        self._current_loc: Optional[SourceLocation] = None
        self._generated_debug_path = "<generated>"

    def _emit_line_directive(self, line: int, path: str, increment: int) -> None:
        escaped = path.replace("\\", "\\\\").replace('"', '\\"')
        self.text.append(f'%line {line}+{increment} "{escaped}"')

    def set_location(self, loc: Optional[SourceLocation]) -> None:
        if not self.debug_enabled:
            return
        if loc is None:
            if self._current_loc is None:
                return
            self._emit_line_directive(1, self._generated_debug_path, increment=1)
            self._current_loc = None
            return
        if self._current_loc == loc:
            return
        self._emit_line_directive(loc.line, str(loc.path), increment=0)
        self._current_loc = loc

    def emit(self, line: str) -> None:
        self.text.append(line)

    def comment(self, message: str) -> None:
        self.text.append(f"    ; {message}")

    def push_literal(self, value: int) -> None:
        _a = self.text.append
        _a(f"    ; push {value}")
        _a("    sub r12, 8")
        _a(f"    mov qword [r12], {value}")

    def push_float(self, label: str) -> None:
        _a = self.text.append
        _a(f"    ; push float from {label}")
        _a("    sub r12, 8")
        _a(f"    mov rax, [rel {label}]")
        _a("    mov [r12], rax")

    def push_label(self, label: str) -> None:
        _a = self.text.append
        _a(f"    ; push {label}")
        _a("    sub r12, 8")
        _a(f"    mov qword [r12], {label}")

    def push_from(self, register: str) -> None:
        _a = self.text.append
        _a("    sub r12, 8")
        _a(f"    mov [r12], {register}")

    def pop_to(self, register: str) -> None:
        _a = self.text.append
        _a(f"    mov {register}, [r12]")
        _a("    add r12, 8")


def _int_trunc_div(lhs: int, rhs: int) -> int:
    if rhs == 0:
        raise ZeroDivisionError("division by zero")
    quotient = abs(lhs) // abs(rhs)
    if (lhs < 0) ^ (rhs < 0):
        quotient = -quotient
    return quotient


def _int_trunc_mod(lhs: int, rhs: int) -> int:
    if rhs == 0:
        raise ZeroDivisionError("division by zero")
    return lhs - _int_trunc_div(lhs, rhs) * rhs


def _bool_to_int(value: bool) -> int:
    return 1 if value else 0


_FOLDABLE_WORDS: Dict[str, Tuple[int, Callable[..., int]]] = {
    "+": (2, lambda a, b: a + b),
    "-": (2, lambda a, b: a - b),
    "*": (2, lambda a, b: a * b),
    "/": (2, _int_trunc_div),
    "%": (2, _int_trunc_mod),
    "==": (2, lambda a, b: _bool_to_int(a == b)),
    "!=": (2, lambda a, b: _bool_to_int(a != b)),
    "<": (2, lambda a, b: _bool_to_int(a < b)),
    "<=": (2, lambda a, b: _bool_to_int(a <= b)),
    ">": (2, lambda a, b: _bool_to_int(a > b)),
    ">=": (2, lambda a, b: _bool_to_int(a >= b)),
    "not": (1, lambda a: _bool_to_int(a == 0)),
}


_sanitize_label_cache: Dict[str, str] = {}


def sanitize_label(name: str) -> str:
    # Keep the special `_start` label unchanged so the program entrypoint
    # remains a plain `_start` symbol expected by the linker.
    if name == "_start":
        _sanitize_label_cache[name] = name
        return name
    cached = _sanitize_label_cache.get(name)
    if cached is not None:
        return cached
    parts: List[str] = []
    for ch in name:
        if ch.isalnum() or ch == "_":
            parts.append(ch)
        else:
            parts.append(f"_{ord(ch):02x}")
    safe = "".join(parts) or "anon"
    if safe[0].isdigit():
        safe = "_" + safe
    # Prefix sanitized labels to avoid accidental collisions with
    # assembler pseudo-ops or common identifiers (e.g. `abs`). The
    # prefix is applied consistently so all emitted references using
    # `sanitize_label` remain correct.
    prefixed = f"w_{safe}"
    _sanitize_label_cache[name] = prefixed
    return prefixed


# Auto-inline asm bodies with at most this many instructions (excl. ret/blanks).
_ASM_AUTO_INLINE_THRESHOLD = 8

# Pre-compiled regexes for sanitizing symbol references in asm bodies.
_RE_ASM_CALL = re.compile(r"\bcall\s+([A-Za-z_][A-Za-z0-9_]*)\b")
_RE_ASM_GLOBAL = re.compile(r"\bglobal\s+([A-Za-z_][A-Za-z0-9_]*)\b")
_RE_ASM_EXTERN = re.compile(r"\bextern\s+([A-Za-z_][A-Za-z0-9_]*)\b")
_RE_ASM_CALL_EXTRACT = re.compile(r"call\s+(?:qword\s+)?(?:\[rel\s+([A-Za-z0-9_.$@]+)\]|([A-Za-z0-9_.$@]+))")


def _is_identifier(text: str) -> bool:
    if not text:
        return False
    first = text[0]
    if not (first.isalpha() or first == "_"):
        return False
    return all(ch.isalnum() or ch == "_" for ch in text)


_C_TYPE_IGNORED_QUALIFIERS = {
    "const",
    "volatile",
    "register",
    "restrict",
    "static",
    "extern",
    "_Atomic",
}

_C_FIELD_TYPE_ALIASES: Dict[str, str] = {
    "i8": "int8_t",
    "u8": "uint8_t",
    "i16": "int16_t",
    "u16": "uint16_t",
    "i32": "int32_t",
    "u32": "uint32_t",
    "i64": "int64_t",
    "u64": "uint64_t",
    "isize": "long",
    "usize": "size_t",
    "f32": "float",
    "f64": "double",
    "ptr": "void*",
}

_C_SCALAR_TYPE_INFO: Dict[str, Tuple[int, int, str]] = {
    "char": (1, 1, "INTEGER"),
    "signed char": (1, 1, "INTEGER"),
    "unsigned char": (1, 1, "INTEGER"),
    "short": (2, 2, "INTEGER"),
    "short int": (2, 2, "INTEGER"),
    "unsigned short": (2, 2, "INTEGER"),
    "unsigned short int": (2, 2, "INTEGER"),
    "int": (4, 4, "INTEGER"),
    "unsigned int": (4, 4, "INTEGER"),
    "int32_t": (4, 4, "INTEGER"),
    "uint32_t": (4, 4, "INTEGER"),
    "long": (8, 8, "INTEGER"),
    "unsigned long": (8, 8, "INTEGER"),
    "long long": (8, 8, "INTEGER"),
    "unsigned long long": (8, 8, "INTEGER"),
    "int64_t": (8, 8, "INTEGER"),
    "uint64_t": (8, 8, "INTEGER"),
    "size_t": (8, 8, "INTEGER"),
    "ssize_t": (8, 8, "INTEGER"),
    "void": (0, 1, "INTEGER"),
    "float": (4, 4, "SSE"),
    "double": (8, 8, "SSE"),
}


def _round_up(value: int, align: int) -> int:
    if align <= 1:
        return value
    return ((value + align - 1) // align) * align


def _canonical_c_type_name(type_name: str) -> str:
    text = " ".join(type_name.strip().split())
    if not text:
        return text
    text = _C_FIELD_TYPE_ALIASES.get(text, text)
    text = text.replace(" *", "*")
    return text


def _is_struct_type(type_name: str) -> bool:
    return _canonical_c_type_name(type_name).startswith("struct ")


def _c_type_size_align_class(
    type_name: str,
    cstruct_layouts: Dict[str, CStructLayout],
) -> Tuple[int, int, str, Optional[CStructLayout]]:
    t = _canonical_c_type_name(type_name)
    if not t:
        return 8, 8, "INTEGER", None
    if t.endswith("*"):
        return 8, 8, "INTEGER", None
    if t in _C_SCALAR_TYPE_INFO:
        size, align, cls = _C_SCALAR_TYPE_INFO[t]
        return size, align, cls, None
    if t.startswith("struct "):
        struct_name = t[len("struct "):].strip()
        layout = cstruct_layouts.get(struct_name)
        if layout is None:
            raise CompileError(
                f"unknown cstruct '{struct_name}' used in extern signature"
            )
        return layout.size, layout.align, "STRUCT", layout
    # Preserve backward compatibility for unknown scalar-ish names.
    return 8, 8, "INTEGER", None


def _merge_eightbyte_class(current: str, incoming: str) -> str:
    if current == "NO_CLASS":
        return incoming
    if current == incoming:
        return current
    if current == "INTEGER" or incoming == "INTEGER":
        return "INTEGER"
    return incoming


def _classify_struct_eightbytes(
    layout: CStructLayout,
    cstruct_layouts: Dict[str, CStructLayout],
    cache: Optional[Dict[str, Optional[List[str]]]] = None,
) -> Optional[List[str]]:
    if cache is None:
        cache = {}
    cached = cache.get(layout.name)
    if cached is not None or layout.name in cache:
        return cached

    if layout.size <= 0:
        cache[layout.name] = []
        return []
    if layout.size > 16:
        cache[layout.name] = None
        return None

    chunk_count = (layout.size + 7) // 8
    classes: List[str] = ["NO_CLASS"] * chunk_count

    for field in layout.fields:
        f_size, _, f_class, nested = _c_type_size_align_class(field.type_name, cstruct_layouts)
        if f_size == 0:
            continue
        if nested is not None:
            nested_classes = _classify_struct_eightbytes(nested, cstruct_layouts, cache)
            if nested_classes is None:
                cache[layout.name] = None
                return None
            base_chunk = field.offset // 8
            for idx, cls in enumerate(nested_classes):
                chunk = base_chunk + idx
                if chunk >= len(classes):
                    cache[layout.name] = None
                    return None
                classes[chunk] = _merge_eightbyte_class(classes[chunk], cls or "INTEGER")
            continue

        start_chunk = field.offset // 8
        end_chunk = (field.offset + f_size - 1) // 8
        if end_chunk >= len(classes):
            cache[layout.name] = None
            return None
        if f_class == "SSE" and start_chunk != end_chunk:
            cache[layout.name] = None
            return None
        for chunk in range(start_chunk, end_chunk + 1):
            classes[chunk] = _merge_eightbyte_class(classes[chunk], f_class)

    for idx, cls in enumerate(classes):
        if cls == "NO_CLASS":
            classes[idx] = "INTEGER"
    cache[layout.name] = classes
    return classes


def _split_trailing_identifier(text: str) -> Tuple[str, Optional[str]]:
    if not text:
        return text, None
    idx = len(text)
    while idx > 0 and (text[idx - 1].isalnum() or text[idx - 1] == "_"):
        idx -= 1
    if idx == 0 or idx == len(text):
        return text, None
    prefix = text[:idx]
    suffix = text[idx:]
    if any(not ch.isalnum() and ch != "_" for ch in prefix):
        return prefix, suffix
    return text, None


def _normalize_c_type_tokens(tokens: Sequence[str], *, allow_default: bool) -> str:
    pointer_count = 0
    parts: List[str] = []
    for raw in tokens:
        text = raw.strip()
        if not text:
            continue
        if set(text) == {"*"}:
            pointer_count += len(text)
            continue
        while text.startswith("*"):
            pointer_count += 1
            text = text[1:]
        while text.endswith("*"):
            pointer_count += 1
            text = text[:-1]
        if not text:
            continue
        if text in _C_TYPE_IGNORED_QUALIFIERS:
            continue
        parts.append(text)
    if not parts:
        if allow_default:
            base = "int"
        else:
            raise ParseError("expected C type before parameter name")
    else:
        base = " ".join(parts)
    return base + ("*" * pointer_count)


def _ctype_uses_sse(type_name: Optional[str]) -> bool:
    if type_name is None:
        return False
    base = type_name.rstrip("*")
    return base in {"float", "double"}


def _parse_string_literal(token: Token) -> Optional[str]:
    text = token.lexeme
    if len(text) < 2 or text[0] != '"' or text[-1] != '"':
        return None
    body = text[1:-1]
    result: List[str] = []
    idx = 0
    while idx < len(body):
        char = body[idx]
        if char != "\\":
            result.append(char)
            idx += 1
            continue
        idx += 1
        if idx >= len(body):
            raise ParseError(
                f"unterminated escape sequence in string literal at {token.line}:{token.column}"
            )
        escape = body[idx]
        idx += 1
        if escape == 'n':
            result.append("\n")
        elif escape == 't':
            result.append("\t")
        elif escape == 'r':
            result.append("\r")
        elif escape == '0':
            result.append("\0")
        elif escape == '"':
            result.append('"')
        elif escape == "\\":
            result.append("\\")
        else:
            raise ParseError(
                f"unsupported escape sequence '\\{escape}' in string literal at {token.line}:{token.column}"
            )
    return "".join(result)


class _CTHandleTable:
    """Keeps Python object references stable across compile-time asm calls."""

    def __init__(self) -> None:
        self.objects: Dict[int, Any] = {}
        self.refs: List[Any] = []
        self.string_buffers: List[ctypes.Array[Any]] = []

    def clear(self) -> None:
        self.objects.clear()
        self.refs.clear()
        self.string_buffers.clear()

    def store(self, value: Any) -> int:
        addr = id(value)
        self.refs.append(value)
        self.objects[addr] = value
        return addr


class Assembler:
    def __init__(
        self,
        dictionary: Dictionary,
        *,
        enable_constant_folding: bool = True,
        enable_static_list_folding: bool = True,
        enable_peephole_optimization: bool = True,
        enable_loop_unroll: bool = True,
        enable_auto_inline: bool = True,
        enable_extern_type_check: bool = True,
        enable_stack_check: bool = True,
        loop_unroll_threshold: int = 8,
        verbosity: int = 0,
    ) -> None:
        self.dictionary = dictionary
        self._string_literals: Dict[str, Tuple[str, int]] = {}
        self._float_literals: Dict[float, str] = {}
        self._data_section: Optional[List[str]] = None
        self._inline_stack: List[str] = []
        self._inline_counter: int = 0
        self._unroll_counter: int = 0
        self._emit_stack: List[str] = []
        self._cstruct_layouts: Dict[str, CStructLayout] = {}
        self._export_all_defs: bool = False
        self.enable_constant_folding = enable_constant_folding
        self.enable_static_list_folding = enable_static_list_folding
        self.enable_peephole_optimization = enable_peephole_optimization
        self.enable_loop_unroll = enable_loop_unroll
        self.enable_auto_inline = enable_auto_inline
        self.enable_extern_type_check = enable_extern_type_check
        self.enable_stack_check = enable_stack_check
        self.loop_unroll_threshold = loop_unroll_threshold
        self.verbosity = verbosity
        self._last_cfg_definitions: List[Definition] = []
        self._need_cfg: bool = False

    def _copy_definition_for_cfg(self, definition: Definition) -> Definition:
        return Definition(
            name=definition.name,
            body=[_make_op(node.op, node.data, node.loc) for node in definition.body],
            immediate=definition.immediate,
            compile_only=definition.compile_only,
            terminator=definition.terminator,
            inline=definition.inline,
        )

    def _format_cfg_op(self, node: Op) -> str:
        kind = node._opcode
        data = node.data
        if kind == OP_LITERAL:
            return f"push {data!r}"
        if kind == OP_WORD:
            return str(data)
        if kind == OP_WORD_PTR:
            return f"&{data}"
        if kind == OP_BRANCH_ZERO:
            return "branch_zero"
        if kind == OP_JUMP:
            return "jump"
        if kind == OP_LABEL:
            return f".{data}:"
        if kind == OP_FOR_BEGIN:
            return "for"
        if kind == OP_FOR_END:
            return "end  (for)"
        if kind == OP_LIST_BEGIN:
            return "list_begin"
        if kind == OP_LIST_END:
            return "list_end"
        if kind == OP_LIST_LITERAL:
            return f"list_literal {data}"
        return f"{node.op}" if data is None else f"{node.op} {data!r}"

    @staticmethod
    def _dot_html_escape(text: str) -> str:
        return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;").replace('"', "&quot;")

    @staticmethod
    def _dot_escape(text: str) -> str:
        return text.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")

    @staticmethod
    def _dot_id(text: str) -> str:
        return re.sub(r"[^A-Za-z0-9_]", "_", text)

    def _cfg_loc_str(self, node: Op) -> str:
        if node.loc is None:
            return ""
        return f"{node.loc.path.name}:{node.loc.line}"

    def _definition_cfg_blocks_and_edges(self, definition: Definition) -> Tuple[List[Tuple[int, int]], List[Tuple[int, int, str]]]:
        nodes = definition.body
        if not nodes:
            return [], []

        label_positions = self._cfg_label_positions(nodes)
        for_pairs = self._for_pairs(nodes)

        leaders: Set[int] = {0}

        def add_leader(idx: int) -> None:
            if 0 <= idx < len(nodes):
                leaders.add(idx)

        for idx, node in enumerate(nodes):
            kind = node._opcode
            if kind == OP_LABEL:
                leaders.add(idx)
            elif kind == OP_BRANCH_ZERO:
                target = label_positions.get(str(node.data))
                if target is not None:
                    add_leader(target)
                add_leader(idx + 1)
            elif kind == OP_JUMP:
                target = label_positions.get(str(node.data))
                if target is not None:
                    add_leader(target)
                add_leader(idx + 1)
            elif kind == OP_FOR_BEGIN:
                end_idx = for_pairs.get(idx)
                if end_idx is not None:
                    add_leader(end_idx + 1)
                add_leader(idx + 1)
            elif kind == OP_FOR_END:
                begin_idx = for_pairs.get(idx)
                if begin_idx is not None:
                    add_leader(begin_idx + 1)
                add_leader(idx + 1)

        ordered = sorted(leaders)
        blocks: List[Tuple[int, int]] = []
        for i, start in enumerate(ordered):
            end = ordered[i + 1] if i + 1 < len(ordered) else len(nodes)
            if start < end:
                blocks.append((start, end))

        block_by_ip: Dict[int, int] = {}
        for block_idx, (start, end) in enumerate(blocks):
            for ip in range(start, end):
                block_by_ip[ip] = block_idx

        edges: List[Tuple[int, int, str]] = []

        def add_edge(src_block: int, target_ip: int, label: str) -> None:
            if target_ip < 0 or target_ip >= len(nodes):
                return
            dst_block = block_by_ip.get(target_ip)
            if dst_block is None:
                return
            edges.append((src_block, dst_block, label))

        for block_idx, (_start, end) in enumerate(blocks):
            last_ip = end - 1
            node = nodes[last_ip]
            kind = node._opcode

            if kind == OP_BRANCH_ZERO:
                target = label_positions.get(str(node.data))
                if target is not None:
                    add_edge(block_idx, target, "zero")
                add_edge(block_idx, last_ip + 1, "nonzero")
                continue

            if kind == OP_JUMP:
                target = label_positions.get(str(node.data))
                if target is not None:
                    add_edge(block_idx, target, "jmp")
                continue

            if kind == OP_FOR_BEGIN:
                end_idx = for_pairs.get(last_ip)
                add_edge(block_idx, last_ip + 1, "enter")
                if end_idx is not None:
                    add_edge(block_idx, end_idx + 1, "empty")
                continue

            if kind == OP_FOR_END:
                begin_idx = for_pairs.get(last_ip)
                if begin_idx is not None:
                    add_edge(block_idx, begin_idx + 1, "loop")
                add_edge(block_idx, last_ip + 1, "exit")
                continue

            add_edge(block_idx, last_ip + 1, "next")

        edges.sort(key=lambda item: (item[0], item[1], item[2]))
        return blocks, edges

    def _cfg_label_positions(self, nodes: Sequence[Op]) -> Dict[str, int]:
        positions: Dict[str, int] = {}
        for idx, node in enumerate(nodes):
            if node._opcode == OP_LABEL:
                positions[str(node.data)] = idx
        return positions

    # ---- edge style lookup ----
    _CFG_EDGE_STYLES: Dict[str, Dict[str, str]] = {
        "nonzero": {"color": "#2e7d32", "fontcolor": "#2e7d32", "label": "T", "penwidth": "2"},
        "zero":    {"color": "#c62828", "fontcolor": "#c62828", "label": "F", "style": "dashed", "penwidth": "2"},
        "jmp":     {"color": "#1565c0", "fontcolor": "#1565c0", "label": "jmp", "penwidth": "1.5"},
        "next":    {"color": "#616161", "fontcolor": "#616161", "label": ""},
        "enter":   {"color": "#2e7d32", "fontcolor": "#2e7d32", "label": "enter", "penwidth": "2"},
        "empty":   {"color": "#c62828", "fontcolor": "#c62828", "label": "empty", "style": "dashed", "penwidth": "1.5"},
        "loop":    {"color": "#6a1b9a", "fontcolor": "#6a1b9a", "label": "loop", "style": "bold", "penwidth": "2"},
        "exit":    {"color": "#ef6c00", "fontcolor": "#ef6c00", "label": "exit", "penwidth": "1.5"},
    }

    def _cfg_edge_attrs(self, label: str) -> str:
        style = self._CFG_EDGE_STYLES.get(label, {"label": label, "color": "black"})
        parts = [f'{k}="{v}"' for k, v in style.items()]
        return ", ".join(parts)

    def render_last_cfg_dot(self) -> str:
        lines: List[str] = [
            "digraph l2_cfg {",
            '    rankdir=TB;',
            '    newrank=true;',
            '    compound=true;',
            '    fontname="Helvetica";',
            '    node [shape=plaintext, fontname="Courier New", fontsize=10];',
            '    edge [fontname="Helvetica", fontsize=9];',
        ]

        if not self._last_cfg_definitions:
            lines.append('    empty [shape=box, label="(no definitions)"];')
            lines.append("}")
            return "\n".join(lines)

        for defn in self._last_cfg_definitions:
            cluster_id = self._dot_id(f"cluster_{defn.name}")
            prefix = self._dot_id(defn.name)

            blocks, edges = self._definition_cfg_blocks_and_edges(defn)

            # Determine which blocks are loop-back targets
            back_targets: Set[int] = set()
            for src, dst, elabel in edges:
                if elabel == "loop" or (elabel == "jmp" and dst <= src):
                    back_targets.add(dst)

            # Determine exit blocks (no outgoing edges)
            has_successor: Set[int] = {src for src, _, _ in edges}

            lines.append(f"    subgraph {cluster_id} {{")
            lines.append(f'        label=<<B>{self._dot_html_escape(defn.name)}</B>>;')
            lines.append('        labeljust=l;')
            lines.append('        style=dashed; color="#9e9e9e";')
            lines.append(f'        fontname="Helvetica"; fontsize=12;')

            if not blocks:
                node_id = self._dot_id(f"{defn.name}_empty")
                lines.append(f'        {node_id} [shape=box, label="(empty)"];')
                lines.append("    }")
                continue

            for block_idx, (start, end) in enumerate(blocks):
                node_id = f"{prefix}_b{block_idx}"
                is_entry = block_idx == 0
                is_exit = block_idx not in has_successor
                is_loop_head = block_idx in back_targets

                # Pick header colour
                if is_entry:
                    hdr_bg = "#c8e6c9"  # green
                    hdr_fg = "#1b5e20"
                elif is_exit:
                    hdr_bg = "#ffcdd2"  # red
                    hdr_fg = "#b71c1c"
                elif is_loop_head:
                    hdr_bg = "#bbdefb"  # blue
                    hdr_fg = "#0d47a1"
                else:
                    hdr_bg = "#e0e0e0"  # grey
                    hdr_fg = "#212121"

                # Block annotation
                tag = ""
                if is_entry:
                    tag = " (entry)"
                if is_exit:
                    tag += " (exit)"
                if is_loop_head:
                    tag += " (loop)"

                # Source location from first non-label instruction
                loc_str = ""
                for ip in range(start, end):
                    n = defn.body[ip]
                    if n._opcode != OP_LABEL:
                        loc_str = self._cfg_loc_str(n)
                        break
                if not loc_str and defn.body[start].loc:
                    loc_str = self._cfg_loc_str(defn.body[start])

                # Build HTML table label
                hdr_text = f"BB{block_idx}{tag}"
                if loc_str:
                    hdr_text += f"  [{loc_str}]"

                rows: List[str] = []
                rows.append(f'<TR><TD BGCOLOR="{hdr_bg}" ALIGN="LEFT"><FONT COLOR="{hdr_fg}"><B>{self._dot_html_escape(hdr_text)}</B></FONT></TD></TR>')

                for ip in range(start, end):
                    n = defn.body[ip]
                    op_text = self._format_cfg_op(n)
                    esc = self._dot_html_escape(f"  {ip:3d}  {op_text}")
                    kind = n._opcode
                    if kind in (OP_BRANCH_ZERO, OP_JUMP, OP_FOR_BEGIN, OP_FOR_END):
                        # Highlight control-flow ops
                        rows.append(f'<TR><TD ALIGN="LEFT" BGCOLOR="#fff9c4"><FONT COLOR="#f57f17" FACE="Courier New">{esc}</FONT></TD></TR>')
                    elif kind == OP_LABEL:
                        rows.append(f'<TR><TD ALIGN="LEFT" BGCOLOR="#f5f5f5"><FONT COLOR="#9e9e9e" FACE="Courier New">{esc}</FONT></TD></TR>')
                    else:
                        rows.append(f'<TR><TD ALIGN="LEFT"><FONT FACE="Courier New">{esc}</FONT></TD></TR>')

                table = f'<<TABLE BORDER="1" CELLBORDER="0" CELLSPACING="0" CELLPADDING="4">{"".join(rows)}</TABLE>>'
                lines.append(f"        {node_id} [label={table}];")

            for src, dst, edge_label in edges:
                src_id = f"{prefix}_b{src}"
                dst_id = f"{prefix}_b{dst}"
                attrs = self._cfg_edge_attrs(edge_label)
                lines.append(f"        {src_id} -> {dst_id} [{attrs}];")

            lines.append("    }")

        # Legend
        lines.append("    subgraph cluster_legend {")
        lines.append('        label=<<B>Legend</B>>;')
        lines.append('        fontname="Helvetica"; fontsize=10;')
        lines.append('        style=solid; color="#bdbdbd";')
        lines.append('        node [shape=box, fontname="Helvetica", fontsize=9, width=1.3, height=0.3];')
        lines.append('        edge [style=invis];')
        lines.append('        leg_entry [label="entry", style=filled, fillcolor="#c8e6c9", fontcolor="#1b5e20"];')
        lines.append('        leg_exit  [label="exit",  style=filled, fillcolor="#ffcdd2", fontcolor="#b71c1c"];')
        lines.append('        leg_loop  [label="loop header", style=filled, fillcolor="#bbdefb", fontcolor="#0d47a1"];')
        lines.append('        leg_norm  [label="basic block", style=filled, fillcolor="#e0e0e0", fontcolor="#212121"];')
        lines.append('        leg_entry -> leg_exit -> leg_loop -> leg_norm;')
        lines.append("    }")

        lines.append("}")
        return "\n".join(lines)

    @staticmethod
    def _log_op_diff(name: str, pass_name: str, before: list, after: list) -> None:
        """Print a contextual diff of (op, data) tuples for v4 optimization detail."""
        import difflib
        def _fmt(op_data: tuple) -> str:
            op, data = op_data
            if op == "literal":
                return f"  {data!r}" if isinstance(data, str) else f"  {data}"
            if op == "word":
                return f"  {data}"
            return f"  [{op}] {data!r}" if data is not None else f"  [{op}]"
        before_lines = [_fmt(t) for t in before]
        after_lines = [_fmt(t) for t in after]
        diff = list(difflib.unified_diff(before_lines, after_lines, n=1, lineterm=""))
        if len(diff) <= 2:
            return
        for line in diff[2:]:  # skip --- / +++ header
            if line.startswith("@@"):
                print(f"[v4]   {pass_name} '{name}' {line}")
            elif line.startswith("-"):
                print(f"[v4]     - {line[1:]}")
            elif line.startswith("+"):
                print(f"[v4]     + {line[1:]}")
            else:
                print(f"[v4]      {line[1:]}")

    def _peephole_optimize_definition(self, definition: Definition) -> None:
        nodes = definition.body
        if not nodes:
            return
        all_rules = _PEEPHOLE_ALL_RULES
        first_words = _PEEPHOLE_FIRST_WORDS
        _OP_W = OP_WORD

        # Outer loop: keeps re-running all passes until nothing changes.
        any_changed = True
        while any_changed:
            any_changed = False

            # ---------- Pass 1: word-only pattern rewriting ----------
            changed = True
            while changed:
                changed = False
                optimized: List[Op] = []
                _opt_append = optimized.append
                idx = 0
                nlen = len(nodes)
                while idx < nlen:
                    node = nodes[idx]
                    if node._opcode != _OP_W:
                        _opt_append(node)
                        idx += 1
                        continue
                    word_name = node.data
                    if word_name not in first_words:
                        _opt_append(node)
                        idx += 1
                        continue

                    # Try longest match first (max pattern len = 3)
                    matched = False
                    # Try window=3
                    if idx + 2 < nlen:
                        b = nodes[idx + 1]
                        c = nodes[idx + 2]
                        if b._opcode == _OP_W and c._opcode == _OP_W:
                            repl = all_rules.get((word_name, b.data, c.data))
                            if repl is not None:
                                base_loc = node.loc
                                for r in repl:
                                    if r[0] == 'l' and r[:8] == "literal_":
                                        _opt_append(_make_literal_op(int(r[8:]), loc=base_loc))
                                    else:
                                        _opt_append(_make_word_op(r, base_loc))
                                idx += 3
                                changed = True
                                matched = True
                    # Try window=2
                    if not matched and idx + 1 < nlen:
                        b = nodes[idx + 1]
                        if b._opcode == _OP_W:
                            repl = all_rules.get((word_name, b.data))
                            if repl is not None:
                                base_loc = node.loc
                                for r in repl:
                                    if r[0] == 'l' and r[:8] == "literal_":
                                        _opt_append(_make_literal_op(int(r[8:]), loc=base_loc))
                                    else:
                                        _opt_append(_make_word_op(r, base_loc))
                                idx += 2
                                changed = True
                                matched = True
                    if not matched:
                        _opt_append(node)
                        idx += 1
                if changed:
                    any_changed = True
                nodes = optimized

            # ---------- Pass 2: literal + word algebraic identities ----------
            _CANCEL_PAIRS = _PEEPHOLE_CANCEL_PAIRS
            _SHIFT_OPS = _PEEPHOLE_SHIFT_OPS
            changed = True
            while changed:
                changed = False
                optimized = []
                _opt_a = optimized.append
                nlen = len(nodes)
                idx = 0
                while idx < nlen:
                    node = nodes[idx]
                    n_oc = node._opcode

                    # -- Redundant unary pairs (word word) --
                    if n_oc == OP_WORD and idx + 1 < nlen:
                        b = nodes[idx + 1]
                        if b._opcode == OP_WORD:
                            wa, wb = node.data, b.data
                            if (wa, wb) in _CANCEL_PAIRS:
                                idx += 2
                                changed = True
                                continue
                            if wa == "abs" and wb == "abs":
                                _opt_a(node)
                                idx += 2
                                changed = True
                                continue

                    # -- scalar literal patterns (excludes string literals which push 2 values) --
                    if n_oc == OP_LITERAL and type(node.data) is not str and idx + 1 < nlen:
                        b = nodes[idx + 1]
                        b_oc = b._opcode

                        # literal + dup -> literal literal
                        if b_oc == OP_WORD and b.data == "dup":
                            _opt_a(node)
                            _opt_a(_make_literal_op(node.data, node.loc))
                            idx += 2
                            changed = True
                            continue

                        # literal + drop -> (nothing)
                        if b_oc == OP_WORD and b.data == "drop":
                            idx += 2
                            changed = True
                            continue

                        # literal literal + 2drop / swap
                        if b_oc == OP_LITERAL and type(b.data) is not str and idx + 2 < nlen:
                            c = nodes[idx + 2]
                            if c._opcode == OP_WORD:
                                cd = c.data
                                if cd == "2drop":
                                    idx += 3
                                    changed = True
                                    continue
                                if cd == "swap":
                                    _opt_a(_make_literal_op(b.data, b.loc))
                                    _opt_a(_make_literal_op(node.data, node.loc))
                                    idx += 3
                                    changed = True
                                    continue

                        # Binary op identities: literal K + word
                        if type(node.data) is int and b_oc == OP_WORD:
                            k = node.data
                            w = b.data
                            base_loc = node.loc or b.loc

                            if (w == "+" and k == 0) or (w == "-" and k == 0) or (w == "*" and k == 1) or (w == "/" and k == 1):
                                idx += 2; changed = True; continue

                            if w == "*":
                                if k == 0:
                                    _opt_a(_make_word_op("drop", base_loc))
                                    _opt_a(_make_literal_op(0, base_loc))
                                    idx += 2; changed = True; continue
                                if k == -1:
                                    _opt_a(_make_word_op("neg", base_loc))
                                    idx += 2; changed = True; continue
                                if k > 1 and (k & (k - 1)) == 0:
                                    _opt_a(_make_literal_op(k.bit_length() - 1, base_loc))
                                    _opt_a(_make_word_op("shl", base_loc))
                                    idx += 2; changed = True; continue

                            if w == "band":
                                if k == 0:
                                    _opt_a(_make_word_op("drop", base_loc))
                                    _opt_a(_make_literal_op(0, base_loc))
                                    idx += 2; changed = True; continue
                                if k == -1:
                                    idx += 2; changed = True; continue

                            if w == "bor":
                                if k == -1:
                                    _opt_a(_make_word_op("drop", base_loc))
                                    _opt_a(_make_literal_op(-1, base_loc))
                                    idx += 2; changed = True; continue
                                if k == 0:
                                    idx += 2; changed = True; continue

                            if w == "bxor" and k == 0:
                                idx += 2; changed = True; continue

                            if w == "%" and k == 1:
                                _opt_a(_make_word_op("drop", base_loc))
                                _opt_a(_make_literal_op(0, base_loc))
                                idx += 2; changed = True; continue

                            if w == "==" and k == 0:
                                _opt_a(_make_word_op("not", base_loc))
                                idx += 2; changed = True; continue

                            if w in _SHIFT_OPS and k == 0:
                                idx += 2; changed = True; continue

                            if w == "+":
                                if k == 1:
                                    _opt_a(_make_word_op("inc", base_loc))
                                    idx += 2; changed = True; continue
                                if k == -1:
                                    _opt_a(_make_word_op("dec", base_loc))
                                    idx += 2; changed = True; continue
                            if w == "-":
                                if k == 1:
                                    _opt_a(_make_word_op("dec", base_loc))
                                    idx += 2; changed = True; continue
                                if k == -1:
                                    _opt_a(_make_word_op("inc", base_loc))
                                    idx += 2; changed = True; continue

                    _opt_a(node)
                    idx += 1
                if changed:
                    any_changed = True
                nodes = optimized

            # ---------- Pass 3: dead-code after unconditional jump/end ----------
            new_nodes: List[Op] = []
            dead = False
            for node in nodes:
                kind = node._opcode
                if dead:
                    # A label ends the dead region.
                    if kind == OP_LABEL:
                        dead = False
                        new_nodes.append(node)
                    else:
                        any_changed = True
                    continue
                new_nodes.append(node)
                if kind in _PEEPHOLE_TERMINATORS:
                    dead = True
            if len(new_nodes) != len(nodes):
                any_changed = True
            nodes = new_nodes

        definition.body = nodes

    def _fold_constants_in_definition(self, definition: Definition) -> None:
        if not definition.body:
            return
        optimized: List[Op] = []
        for node in definition.body:
            optimized.append(node)
            self._attempt_constant_fold_tail(optimized)
        definition.body = optimized

    def _attempt_constant_fold_tail(self, nodes: List[Op]) -> None:
        _LIT = OP_LITERAL
        _W = OP_WORD
        while nodes:
            last = nodes[-1]
            if last._opcode != _W:
                return
            fold_entry = _FOLDABLE_WORDS.get(last.data)
            if fold_entry is None:
                return
            arity, func = fold_entry
            nlen = len(nodes)
            if nlen < arity + 1:
                return
            # Fast path for binary ops (arity 2, the most common case)
            if arity == 2:
                a = nodes[-3]
                b = nodes[-2]
                if a._opcode != _LIT or type(a.data) is not int:
                    return
                if b._opcode != _LIT or type(b.data) is not int:
                    return
                try:
                    result = func(a.data, b.data)
                except Exception:
                    return
                new_loc = a.loc or last.loc
                del nodes[-3:]
                nodes.append(_make_literal_op(result, new_loc))
                continue
            # Fast path for unary ops (arity 1)
            if arity == 1:
                a = nodes[-2]
                if a._opcode != _LIT or type(a.data) is not int:
                    return
                try:
                    result = func(a.data)
                except Exception:
                    return
                new_loc = a.loc or last.loc
                del nodes[-2:]
                nodes.append(_make_literal_op(result, new_loc))
                continue
            # General case
            operands = nodes[-(arity + 1):-1]
            if any(op._opcode != _LIT or type(op.data) is not int for op in operands):
                return
            values = [op.data for op in operands]
            try:
                result = func(*values)
            except Exception:
                return
            new_loc = operands[0].loc or last.loc
            nodes[-(arity + 1):] = [_make_literal_op(result, new_loc)]

    def _for_pairs(self, nodes: Sequence[Op]) -> Dict[int, int]:
        stack: List[int] = []
        pairs: Dict[int, int] = {}
        for idx, node in enumerate(nodes):
            if node._opcode == OP_FOR_BEGIN:
                stack.append(idx)
            elif node._opcode == OP_FOR_END:
                if not stack:
                    raise CompileError("'end' without matching 'for'")
                begin_idx = stack.pop()
                pairs[begin_idx] = idx
                pairs[idx] = begin_idx
        if stack:
            raise CompileError("'for' without matching 'end'")
        return pairs

    def _collect_internal_labels(self, nodes: Sequence[Op]) -> Set[str]:
        labels: Set[str] = set()
        for node in nodes:
            kind = node._opcode
            data = node.data
            if kind == OP_LABEL:
                labels.add(str(data))
            elif kind == OP_FOR_BEGIN or kind == OP_FOR_END:
                labels.add(str(data["loop"]))
                labels.add(str(data["end"]))
            elif kind == OP_LIST_BEGIN or kind == OP_LIST_END:
                labels.add(str(data))
        return labels

    def _clone_nodes_with_label_remap(
        self,
        nodes: Sequence[Op],
        internal_labels: Set[str],
        suffix: str,
    ) -> List[Op]:
        label_map: Dict[str, str] = {}

        def remap(label: str) -> str:
            if label not in internal_labels:
                return label
            if label not in label_map:
                label_map[label] = f"{label}__unr{suffix}"
            return label_map[label]

        cloned: List[Op] = []
        for node in nodes:
            kind = node._opcode
            data = node.data
            if kind == OP_LABEL:
                cloned.append(_make_op("label", remap(str(data)), loc=node.loc))
                continue
            if kind == OP_JUMP or kind == OP_BRANCH_ZERO:
                target = str(data)
                mapped = remap(target) if target in internal_labels else target
                cloned.append(_make_op(node.op, mapped, node.loc))
                continue
            if kind == OP_FOR_BEGIN or kind == OP_FOR_END:
                cloned.append(
                    Op(
                        op=node.op,
                        data={
                            "loop": remap(str(data["loop"])),
                            "end": remap(str(data["end"])),
                        },
                        loc=node.loc,
                    )
                )
                continue
            if kind == OP_LIST_BEGIN or kind == OP_LIST_END:
                cloned.append(_make_op(node.op, remap(str(data)), loc=node.loc))
                continue
            cloned.append(_make_op(node.op, data, node.loc))
        return cloned

    def _unroll_constant_for_loops(self, definition: Definition) -> None:
        threshold = self.loop_unroll_threshold
        if threshold <= 0:
            return
        nodes = definition.body
        pairs = self._for_pairs(nodes)
        if not pairs:
            return

        rebuilt: List[Op] = []
        idx = 0
        while idx < len(nodes):
            node = nodes[idx]
            if node._opcode == OP_FOR_BEGIN and idx > 0:
                prev = nodes[idx - 1]
                if prev._opcode == OP_LITERAL and isinstance(prev.data, int):
                    count = int(prev.data)
                    end_idx = pairs.get(idx)
                    if end_idx is None:
                        raise CompileError("internal loop bookkeeping error")
                    if count <= 0:
                        if rebuilt and rebuilt[-1] is prev:
                            rebuilt.pop()
                        idx = end_idx + 1
                        continue
                    if count <= threshold:
                        if rebuilt and rebuilt[-1] is prev:
                            rebuilt.pop()
                        body = nodes[idx + 1:end_idx]
                        internal_labels = self._collect_internal_labels(body)
                        for copy_idx in range(count):
                            suffix = f"{self._unroll_counter}_{copy_idx}"
                            rebuilt.extend(
                                self._clone_nodes_with_label_remap(
                                    body,
                                    internal_labels,
                                    suffix,
                                )
                            )
                        self._unroll_counter += 1
                        idx = end_idx + 1
                        continue
            rebuilt.append(node)
            idx += 1

        definition.body = rebuilt

    def _fold_static_list_literals_definition(self, definition: Definition) -> None:
        nodes = definition.body
        rebuilt: List[Op] = []
        idx = 0
        while idx < len(nodes):
            node = nodes[idx]
            if node.op != "list_begin":
                rebuilt.append(node)
                idx += 1
                continue

            depth = 1
            j = idx + 1
            static_values: List[int] = []
            is_static = True

            while j < len(nodes):
                cur = nodes[j]
                if cur._opcode == OP_LIST_BEGIN:
                    depth += 1
                    is_static = False
                    j += 1
                    continue
                if cur._opcode == OP_LIST_END:
                    depth -= 1
                    if depth == 0:
                        break
                    j += 1
                    continue

                if depth == 1:
                    if cur._opcode == OP_LITERAL and isinstance(cur.data, int):
                        static_values.append(int(cur.data))
                    else:
                        is_static = False
                j += 1

            if depth != 0:
                rebuilt.append(node)
                idx += 1
                continue

            if is_static:
                rebuilt.append(_make_op("list_literal", static_values, node.loc))
                idx = j + 1
                continue

            rebuilt.append(node)
            idx += 1

        definition.body = rebuilt

    # Known stack effects: (inputs_consumed, outputs_produced)
    _BUILTIN_STACK_EFFECTS: Dict[str, Tuple[int, int]] = {
        "dup":   (1, 2),
        "drop":  (1, 0),
        "swap":  (2, 2),
        "over":  (2, 3),
        "rot":   (3, 3),
        "nip":   (2, 1),
        "tuck":  (2, 3),
        "+":     (2, 1),
        "-":     (2, 1),
        "*":     (2, 1),
        "/":     (2, 1),
        "mod":   (2, 1),
        "=":     (2, 1),
        "!=":    (2, 1),
        "<":     (2, 1),
        ">":     (2, 1),
        "<=":    (2, 1),
        ">=":    (2, 1),
        "and":   (2, 1),
        "or":    (2, 1),
        "xor":   (2, 1),
        "not":   (1, 1),
        "shl":   (2, 1),
        "shr":   (2, 1),
        "neg":   (1, 1),
        "@":     (1, 1),
        "!":     (2, 0),
        "@8":    (1, 1),
        "!8":    (2, 0),
        "@16":   (1, 1),
        "!16":   (2, 0),
        "@32":   (1, 1),
        "!32":   (2, 0),
    }

    def _check_extern_types(self, definitions: Sequence[Union["Definition", "AsmDefinition"]]) -> None:
        """Basic type checking: verify stack depth at extern call sites and builtin underflows."""
        _v = self.verbosity
        _effects = self._BUILTIN_STACK_EFFECTS
        _OP_LIT = OP_LITERAL
        _OP_W = OP_WORD
        _OP_WP = OP_WORD_PTR
        _OP_LIST_LIT = OP_LIST_LITERAL
        _check_extern = self.enable_extern_type_check
        _check_stack = self.enable_stack_check
        extern_issues: List[str] = []
        stack_issues: List[str] = []
        for defn in definitions:
            if not isinstance(defn, Definition):
                continue
            # depth tracks values on the data stack relative to entry.
            # 'main' starts with an empty stack.  For other words we can
            # only check underflows when a stack-effect comment provides
            # the input count (e.g. ``# a b -- c`` -> 2 inputs).
            si = defn.stack_inputs
            if si is not None:
                known_entry_depth = si
            elif defn.name == 'main':
                known_entry_depth = 0
            else:
                known_entry_depth = -1  # unknown — disable underflow checks
            depth: Optional[int] = known_entry_depth if known_entry_depth >= 0 else 0
            for node in defn.body:
                opc = node._opcode
                if depth is None:
                    # After control flow we can't track depth reliably
                    if opc == _OP_W and _check_extern:
                        word = self.dictionary.lookup(str(node.data))
                        if word and word.is_extern and word.extern_signature:
                            # Can't verify — depth unknown after branch
                            if _v >= 3:
                                print(f"[v3] type-check: '{defn.name}' -> extern '{word.name}' skipped (unknown depth)")
                    continue
                if opc == _OP_LIT:
                    # String literals push 2 values (addr + len), others push 1.
                    depth += 2 if isinstance(node.data, str) else 1
                elif opc == _OP_WP:
                    depth += 1
                elif opc == _OP_LIST_LIT:
                    depth += 1
                elif opc in (OP_BRANCH_ZERO, OP_JUMP, OP_LABEL, OP_FOR_BEGIN, OP_FOR_END):
                    # Control flow — stop tracking precisely
                    if opc == OP_BRANCH_ZERO:
                        depth -= 1
                    depth = None
                elif opc == _OP_W:
                    name = str(node.data)
                    word = self.dictionary.lookup(name)
                    if word is None:
                        depth = None
                        continue
                    if word.is_extern and word.extern_signature:
                        inputs = word.extern_inputs
                        outputs = word.extern_outputs
                        if _check_extern and known_entry_depth >= 0 and depth < inputs:
                            extern_issues.append(
                                f"in '{defn.name}': extern '{name}' expects {inputs} "
                                f"argument{'s' if inputs != 1 else ''}, but only {depth} "
                                f"value{'s' if depth != 1 else ''} on the stack"
                            )
                        depth = depth - inputs + outputs
                    elif name in _effects:
                        consumed, produced = _effects[name]
                        if known_entry_depth >= 0 and depth < consumed:
                            if _check_stack:
                                stack_issues.append(
                                    f"in '{defn.name}': '{name}' needs {consumed} "
                                    f"value{'s' if consumed != 1 else ''}, but only {depth} "
                                    f"on the stack"
                                )
                            depth = None
                        else:
                            depth = depth - consumed + produced
                    elif word.is_extern:
                        # Extern without signature — apply inputs/outputs
                        depth = depth - word.extern_inputs + word.extern_outputs
                    else:
                        # Unknown word — lose depth tracking
                        depth = None

        for issue in extern_issues:
            print(f"[extern-type-check] warning: {issue}")
        for issue in stack_issues:
            print(f"[stack-check] warning: {issue}")
        if _v >= 1 and not extern_issues and not stack_issues:
            print(f"[v1] type-check: no issues found")

    def _reachable_runtime_defs(self, runtime_defs: Sequence[Union[Definition, AsmDefinition]], extra_roots: Optional[Sequence[str]] = None) -> Set[str]:
        edges: Dict[str, Set[str]] = {}
        _OP_W = OP_WORD
        _OP_WP = OP_WORD_PTR
        for definition in runtime_defs:
            refs: Set[str] = set()
            if isinstance(definition, Definition):
                for node in definition.body:
                    oc = node._opcode
                    if oc == _OP_W or oc == _OP_WP:
                        refs.add(str(node.data))
            elif isinstance(definition, AsmDefinition):
                # Collect obvious textual `call` targets from asm bodies so
                # asm-defined entry points can create edges into the word
                # graph.  The extractor below will tolerate common call forms
                # such as `call foo` and `call [rel foo]`.
                asm_calls = self._extract_called_symbols_from_asm(definition.body)
                for sym in asm_calls:
                    refs.add(sym)
            edges[definition.name] = refs

        # Map sanitized labels back to their original definition names so
        # calls to emitted/sanitized labels (e.g. `w_foo`) can be resolved
        # to the corresponding word names present in `edges`.
        sanitized_map: Dict[str, str] = {sanitize_label(n): n for n in edges}

        reachable: Set[str] = set()
        stack: List[str] = ["main"]
        if extra_roots:
            for r in extra_roots:
                if r and r not in stack:
                    stack.append(r)
        while stack:
            name = stack.pop()
            if name in reachable:
                continue
            reachable.add(name)
            for dep in edges.get(name, ()):
                # Direct name hit
                if dep in edges and dep not in reachable:
                    stack.append(dep)
                    continue
                # Possibly a sanitized label; resolve back to original name
                resolved = sanitized_map.get(dep)
                if resolved and resolved not in reachable:
                    stack.append(resolved)
        return reachable

    def _extract_called_symbols_from_asm(self, asm_body: str) -> Set[str]:
        """Return set of symbol names called from a raw asm body.

        This looks for typical `call <symbol>` forms and also
        `call [rel <symbol>]` and `call qword [rel <symbol>]`.
        """
        calls: Set[str] = set()
        for m in _RE_ASM_CALL_EXTRACT.finditer(asm_body):
            sym = m.group(1) or m.group(2)
            if sym:
                calls.add(sym)
        return calls

    def _emit_externs(self, text: List[str]) -> None:
        externs = sorted([w.name for w in self.dictionary.words.values() if getattr(w, "is_extern", False)])
        for name in externs:
            text.append(f"extern {name}")

    def emit(self, module: Module, debug: bool = False, entry_mode: str = "program") -> Emission:
        if entry_mode not in {"program", "library"}:
            raise CompileError(f"unknown entry mode '{entry_mode}'")
        is_program = entry_mode == "program"
        emission = Emission()
        self._export_all_defs = not is_program
        self._last_cfg_definitions = []
        try:
            self._emit_externs(emission.text)
            # Determine whether user provided a top-level `:asm _start` in
            # the module forms so the prelude can avoid emitting the
            # default startup stub.
            # Detect whether the user supplied a `_start` either as a top-level
            # AsmDefinition form or as a registered dictionary word (imports
            # or CT execution may register it). This influences prelude
            # generation so the default stub is suppressed when present.
            user_has_start = any(
                isinstance(f, AsmDefinition) and f.name == "_start" for f in module.forms
            ) or (
                (self.dictionary.lookup("_start") is not None)
                and isinstance(self.dictionary.lookup("_start").definition, AsmDefinition)
            ) or (
                (module.prelude is not None) and any(l.strip().startswith("_start:") for l in module.prelude)
            )
            # Defer runtime prelude generation until after top-level forms are
            # parsed into `definitions` so we can accurately detect a user
            # provided `_start` AsmDefinition and suppress the default stub.
            # Note: module.prelude was already inspected above when
            # computing `user_has_start`, so avoid referencing
            # `prelude_lines` before it's constructed.
            # Prelude will be generated after definitions are known.
            # If user provided a raw assembly `_start` via `:asm _start {...}`
            # inject it verbatim into the text section so it becomes the
            # program entrypoint. Emit the raw body (no automatic `ret`).
            # Do not inject `_start` body here; rely on definitions emission
            # and the earlier `user_has_start` check to suppress the default
            # startup stub. This avoids emitting `_start` twice.
            self._string_literals = {}
            self._float_literals = {}
            self._data_section = emission.data
            self._cstruct_layouts = dict(module.cstruct_layouts)

            valid_defs = (Definition, AsmDefinition)
            raw_defs = [form for form in module.forms if isinstance(form, valid_defs)]
            definitions = self._dedup_definitions(raw_defs)

            stray_forms = [form for form in module.forms if not isinstance(form, valid_defs)]
            if stray_forms:
                raise CompileError("top-level literals or word references are not supported yet")

            _v = self.verbosity
            if _v >= 1:
                import time as _time_mod
                _emit_t0 = _time_mod.perf_counter()
                n_def = sum(1 for d in definitions if isinstance(d, Definition))
                n_asm = sum(1 for d in definitions if isinstance(d, AsmDefinition))
                print(f"[v1] definitions: {n_def} high-level, {n_asm} asm")
                opts = []
                if self.enable_loop_unroll: opts.append(f"loop-unroll(threshold={self.loop_unroll_threshold})")
                if self.enable_peephole_optimization: opts.append("peephole")
                if self.enable_constant_folding: opts.append("constant-folding")
                if self.enable_static_list_folding: opts.append("static-list-folding")
                if self.enable_auto_inline: opts.append("auto-inline")
                if self.enable_extern_type_check: opts.append("extern-type-check")
                if self.enable_stack_check: opts.append("stack-check")
                print(f"[v1] optimizations: {', '.join(opts) if opts else 'none'}")

            if _v >= 2:
                # v2: log per-definition summary before optimization
                for defn in definitions:
                    if isinstance(defn, Definition):
                        print(f"[v2] def '{defn.name}': {len(defn.body)} ops, inline={getattr(defn, 'inline', False)}, compile_only={getattr(defn, 'compile_only', False)}")
                    else:
                        print(f"[v2] asm '{defn.name}'")

            # --- Early DCE: compute reachable set before optimization passes
            # so we skip optimizing definitions that will be eliminated. ---
            if is_program:
                _early_rt = [d for d in definitions if not getattr(d, "compile_only", False)]
                _early_reachable = self._reachable_runtime_defs(_early_rt)
                # Also include inline defs that are referenced by reachable defs
                # (they need optimization for correct inlining).
            else:
                _early_reachable = None  # library mode: optimize everything

            if self.enable_loop_unroll:
                if _v >= 1: _t0 = _time_mod.perf_counter()
                for defn in definitions:
                    if isinstance(defn, Definition):
                        if _early_reachable is not None and defn.name not in _early_reachable:
                            continue
                        self._unroll_constant_for_loops(defn)
                if _v >= 1:
                    print(f"[v1] loop unrolling: {(_time_mod.perf_counter() - _t0)*1000:.2f}ms")
            if self.enable_peephole_optimization:
                if _v >= 1: _t0 = _time_mod.perf_counter()
                if _v >= 4:
                    for defn in definitions:
                        if isinstance(defn, Definition):
                            if _early_reachable is not None and defn.name not in _early_reachable:
                                continue
                            before_ops = [(n.op, n.data) for n in defn.body]
                            self._peephole_optimize_definition(defn)
                            after_ops = [(n.op, n.data) for n in defn.body]
                            if before_ops != after_ops:
                                print(f"[v2] peephole '{defn.name}': {len(before_ops)} -> {len(after_ops)} ops ({len(before_ops) - len(after_ops)} removed)")
                                self._log_op_diff(defn.name, "peephole", before_ops, after_ops)
                elif _v >= 2:
                    for defn in definitions:
                        if isinstance(defn, Definition):
                            if _early_reachable is not None and defn.name not in _early_reachable:
                                continue
                            _before = len(defn.body)
                            self._peephole_optimize_definition(defn)
                            _after = len(defn.body)
                            if _before != _after:
                                print(f"[v2] peephole '{defn.name}': {_before} -> {_after} ops ({_before - _after} removed)")
                else:
                    for defn in definitions:
                        if isinstance(defn, Definition):
                            if _early_reachable is not None and defn.name not in _early_reachable:
                                continue
                            self._peephole_optimize_definition(defn)
                if _v >= 1:
                    print(f"[v1] peephole optimization: {(_time_mod.perf_counter() - _t0)*1000:.2f}ms")
            if self.enable_constant_folding:
                if _v >= 1: _t0 = _time_mod.perf_counter()
                if _v >= 4:
                    for defn in definitions:
                        if isinstance(defn, Definition):
                            if _early_reachable is not None and defn.name not in _early_reachable:
                                continue
                            before_ops = [(n.op, n.data) for n in defn.body]
                            self._fold_constants_in_definition(defn)
                            after_ops = [(n.op, n.data) for n in defn.body]
                            if before_ops != after_ops:
                                print(f"[v2] constant-fold '{defn.name}': {len(before_ops)} -> {len(after_ops)} ops ({len(before_ops) - len(after_ops)} folded)")
                                self._log_op_diff(defn.name, "constant-fold", before_ops, after_ops)
                elif _v >= 2:
                    for defn in definitions:
                        if isinstance(defn, Definition):
                            if _early_reachable is not None and defn.name not in _early_reachable:
                                continue
                            _before = len(defn.body)
                            self._fold_constants_in_definition(defn)
                            _after = len(defn.body)
                            if _before != _after:
                                print(f"[v2] constant-fold '{defn.name}': {_before} -> {_after} ops ({_before - _after} folded)")
                else:
                    for defn in definitions:
                        if isinstance(defn, Definition):
                            if _early_reachable is not None and defn.name not in _early_reachable:
                                continue
                            self._fold_constants_in_definition(defn)
                if _v >= 1:
                    print(f"[v1] constant folding: {(_time_mod.perf_counter() - _t0)*1000:.2f}ms")
            if self.enable_static_list_folding:
                if _v >= 1: _t0 = _time_mod.perf_counter()
                for defn in definitions:
                    if isinstance(defn, Definition):
                        if _early_reachable is not None and defn.name not in _early_reachable:
                            continue
                        self._fold_static_list_literals_definition(defn)
                if _v >= 1:
                    print(f"[v1] static list folding: {(_time_mod.perf_counter() - _t0)*1000:.2f}ms")

            runtime_defs = [defn for defn in definitions if not getattr(defn, "compile_only", False)]
            if is_program:
                if not any(defn.name == "main" for defn in runtime_defs):
                    raise CompileError("missing 'main' definition")
                # Determine if any user-provided `_start` asm calls into
                # defined words and use those call targets as additional
                # reachability roots. This avoids unconditionally emitting
                # every `:asm` body while still preserving functions that
                # are invoked from a custom `_start` stub.
                # Build a quick lookup of runtime definition names -> defn
                name_to_def: Dict[str, Union[Definition, AsmDefinition]] = {d.name: d for d in runtime_defs}
                # Look for an asm `_start` among parsed definitions (not just runtime_defs)
                asm_start = next((d for d in definitions if isinstance(d, AsmDefinition) and d.name == "_start"), None)
                extra_roots: List[str] = []
                if asm_start is not None:
                    called = self._extract_called_symbols_from_asm(asm_start.body)
                    # Resolve called symbols to definition names using both
                    # raw and sanitized forms.
                    sanitized_map = {sanitize_label(n): n for n in name_to_def}
                    for sym in called:
                        if sym in name_to_def:
                            extra_roots.append(sym)
                        else:
                            resolved = sanitized_map.get(sym)
                            if resolved:
                                extra_roots.append(resolved)

                # Ensure a user-provided raw `_start` asm definition is
                # always emitted (it should override the default stub).
                if asm_start is not None and asm_start not in runtime_defs:
                    runtime_defs.append(asm_start)

                reachable = self._reachable_runtime_defs(runtime_defs, extra_roots=extra_roots)
                if len(reachable) != len(runtime_defs):
                    if _v >= 2:
                        eliminated = [defn.name for defn in runtime_defs if defn.name not in reachable]
                    _n_before_dce = len(runtime_defs)
                    runtime_defs = [defn for defn in runtime_defs if defn.name in reachable]
                    if _v >= 1:
                        print(f"[v1] DCE: {_n_before_dce} -> {len(runtime_defs)} definitions ({_n_before_dce - len(runtime_defs)} eliminated)")
                    if _v >= 2 and eliminated:
                        print(f"[v2] DCE eliminated: {', '.join(eliminated)}")
                # Ensure `_start` is preserved even if not reachable from
                # `main` or the discovered roots; user-provided `_start`
                # must override the default stub.
                if asm_start is not None and asm_start not in runtime_defs:
                    runtime_defs.append(asm_start)
            elif self._export_all_defs:
                exported = sorted({sanitize_label(defn.name) for defn in runtime_defs})
                for label in exported:
                    emission.text.append(f"global {label}")

            # Inline-only definitions are expanded at call sites; skip emitting standalone labels.
            runtime_defs = [defn for defn in runtime_defs if not getattr(defn, "inline", False)]

            if self._need_cfg:
                self._last_cfg_definitions = [
                    self._copy_definition_for_cfg(defn)
                    for defn in runtime_defs
                    if isinstance(defn, Definition)
                ]

            if self.enable_extern_type_check or self.enable_stack_check:
                if _v >= 1: _t0 = _time_mod.perf_counter()
                self._check_extern_types(runtime_defs)
                if _v >= 1:
                    print(f"[v1] type/stack checking: {(_time_mod.perf_counter() - _t0)*1000:.2f}ms")

            if _v >= 1:
                print(f"[v1] emitting {len(runtime_defs)} runtime definitions")

            if _v >= 1: _t0 = _time_mod.perf_counter()
            for definition in runtime_defs:
                if _v >= 3:
                    body_len = len(definition.body) if isinstance(definition, Definition) else 0
                    kind = "asm" if isinstance(definition, AsmDefinition) else "def"
                    # v3: dump full body opcodes
                    print(f"[v3] emit {kind} '{definition.name}' (body ops: {body_len})")
                    if isinstance(definition, Definition) and definition.body:
                        for i, node in enumerate(definition.body):
                            print(f"[v3]   [{i}] {node.op}({node.data!r})")
                self._emit_definition(definition, emission.text, debug=debug)
            if _v >= 1:
                print(f"[v1] code emission: {(_time_mod.perf_counter() - _t0)*1000:.2f}ms")

            # --- now generate and emit the runtime prelude ---
            # Determine whether a user-provided `_start` exists among the
            # parsed definitions or in a compile-time-injected prelude. If
            # present, suppress the default startup stub emitted by the
            # runtime prelude.
            user_has_start = any(isinstance(d, AsmDefinition) and d.name == "_start" for d in definitions)
            if module.prelude is not None and not user_has_start:
                if any(line.strip().startswith("_start:") for line in module.prelude):
                    user_has_start = True
            base_prelude = self._runtime_prelude(entry_mode, has_user_start=user_has_start)
            # Use the generated base prelude. Avoid directly prepending
            # `module.prelude` which can contain raw, unsanitized assembly
            # fragments (often sourced from cached stdlib assembly) that
            # duplicate or conflict with the sanitized definitions the
            # emitter produces. Prepending `module.prelude` has caused
            # duplicate `_start` and symbol conflicts; prefer the
            # canonical `base_prelude` produced by the emitter.
            prelude_lines = base_prelude
            if user_has_start and prelude_lines is not None:
                # Avoid re-declaring the default startup symbol when the
                # user provided their own `_start`. Do not remove the
                # user's `_start` body. Only
                # filter out any stray `global _start` markers.
                prelude_lines = [l for l in prelude_lines if l.strip() != "global _start"]
            # Tag any `_start:` occurrences in the prelude with a
            # provenance comment so generated ASM files make it easy
            # to see where each `_start` originated. This is
            # non-destructive (comments only) and helps debug duplicates.
            if prelude_lines is not None:
                tagged = []
                for l in prelude_lines:
                    if l.strip().startswith("_start:"):
                        tagged.append("; __ORIGIN__ prelude")
                        tagged.append(l)
                    else:
                        tagged.append(l)
                prelude_lines = tagged
            # Prepend prelude lines to any already-emitted text (definitions).
            emission.text = (prelude_lines if prelude_lines is not None else []) + list(emission.text)
            try:
                self._emitted_start = user_has_start
            except Exception:
                self._emitted_start = False
            # If no `_start` has been emitted (either detected in
            # definitions/module.prelude or already present in the
            # composed `emission.text`), append the default startup
            # stub now (after definitions) so the emitter does not
            # produce duplicate `_start` labels.
            if is_program and not (user_has_start or getattr(self, "_emitted_start", False)):
                _start_lines = [
                    "; __ORIGIN__ default_stub",
                    "global _start",
                    "_start:",
                    "    ; Linux x86-64 startup: argc/argv from stack",
                    "    mov rdi, [rsp]",         # argc
                    "    lea rsi, [rsp+8]",      # argv
                    "    mov [rel sys_argc], rdi",
                    "    mov [rel sys_argv], rsi",
                    "    ; initialize data/return stack pointers",
                    "    lea r12, [rel dstack_top]",
                    "    mov r15, r12",
                    "    lea r13, [rel rstack_top]",
                    f"    call {sanitize_label('main')}",
                    "    mov rax, 0",
                    "    cmp r12, r15",
                    "    je .no_exit_value",
                    "    mov rax, [r12]",
                    "    add r12, 8",
                    ".no_exit_value:",
                ]
                _start_lines.extend([
                    "    mov rdi, rax",
                    "    mov rax, 60",
                    "    syscall",
                ])
                emission.text.extend(_start_lines)

            self._emit_variables(module.variables)

            if self._data_section is not None:
                if not self._data_section:
                    self._data_section.append("data_start:")
                if not self._data_section or self._data_section[-1] != "data_end:":
                    self._data_section.append("data_end:")
            bss_lines = module.bss if module.bss is not None else self._bss_layout()
            emission.bss.extend(bss_lines)
            if _v >= 1:
                _emit_dt = (_time_mod.perf_counter() - _emit_t0) * 1000
                print(f"[v1] total emit: {_emit_dt:.2f}ms")
            return emission
        finally:
            self._data_section = None
            self._export_all_defs = False

    def _dedup_definitions(self, definitions: Sequence[Union[Definition, AsmDefinition]]) -> List[Union[Definition, AsmDefinition]]:
        seen: Set[str] = set()
        ordered: List[Union[Definition, AsmDefinition]] = []
        for defn in reversed(definitions):
            if defn.name in seen:
                continue
            seen.add(defn.name)
            ordered.append(defn)
        ordered.reverse()
        return ordered

    def _emit_variables(self, variables: Dict[str, str]) -> None:
        if not variables:
            return
        self._ensure_data_start()
        existing = set()
        if self._data_section is not None:
            for line in self._data_section:
                if ":" in line:
                    label = line.split(":", 1)[0]
                    existing.add(label.strip())
        for label in variables.values():
            if label in existing:
                continue
            self._data_section.append(f"{label}: dq 0")

    def _ensure_data_start(self) -> None:
        if self._data_section is None:
            raise CompileError("data section is not initialized")
        if not self._data_section:
            self._data_section.append("data_start:")

    def _intern_string_literal(self, value: str) -> Tuple[str, int]:
        if self._data_section is None:
            raise CompileError("string literal emission requested without data section")
        self._ensure_data_start()
        if value in self._string_literals:
            return self._string_literals[value]
        label = f"str_{len(self._string_literals)}"
        encoded = value.encode("utf-8")
        bytes_with_nul = list(encoded) + [0]
        byte_list = ", ".join(str(b) for b in bytes_with_nul)
        self._data_section.append(f"{label}: db {byte_list}")
        self._data_section.append(f"{label}_len equ {len(encoded)}")
        self._string_literals[value] = (label, len(encoded))
        return self._string_literals[value]

    def _intern_float_literal(self, value: float) -> str:
        if self._data_section is None:
            raise CompileError("float literal emission requested without data section")
        self._ensure_data_start()
        if value in self._float_literals:
            return self._float_literals[value]
        label = f"flt_{len(self._float_literals)}"
        # Use hex representation of double precision float
        import struct
        hex_val = _get_struct().pack('>d', value).hex()
        # NASM expects hex starting with 0x
        self._data_section.append(f"{label}: dq 0x{hex_val}")
        self._float_literals[value] = label
        return label

    def _emit_definition(
        self,
        definition: Union[Definition, AsmDefinition],
        text: List[str],
        *,
        debug: bool = False,
    ) -> None:
        # If a `_start` label has already been emitted in the prelude,
        # skip emitting a second `_start` definition which would cause
        # assembler redefinition errors. The prelude-provided `_start`
        # (if present) is taken to be authoritative.
        if definition.name == "_start" and getattr(self, "_emitted_start", False):
            return
        # If this is a raw assembly definition, tag its origin so the
        # generated ASM clearly shows the source of the label (helpful
        # when diagnosing duplicate `_start` occurrences).
        if isinstance(definition, AsmDefinition):
            text.append(f"; __ORIGIN__ AsmDefinition {definition.name}")
        label = sanitize_label(definition.name)

        # Record start index so we can write a per-definition snapshot
        start_index = len(text)
        # If this definition is the program entry `_start`, ensure it's
        # exported as a global symbol so the linker sets the process
        # entry point correctly. Some earlier sanitizer passes may
        # remove `global _start` from prelude fragments; make sure user
        # provided `_start` remains globally visible.
        if label == "_start":
            text.append("global _start")
        text.append(f"{label}:")
        builder = FunctionEmitter(text, debug_enabled=debug)
        self._emit_stack.append(definition.name)
        try:
            if isinstance(definition, Definition):
                for node in definition.body:
                    self._emit_node(node, builder)
            elif isinstance(definition, AsmDefinition):
                self._emit_asm_body(definition, builder)
            else:  # pragma: no cover - defensive
                raise CompileError("unknown definition type")
            builder.emit("    ret")
        finally:
            self._emit_stack.pop()

    def _emit_inline_definition(self, word: Word, builder: FunctionEmitter) -> None:
        definition = word.definition
        if not isinstance(definition, Definition):
            raise CompileError(f"inline word '{word.name}' requires a high-level definition")

        self._emit_stack.append(f"{word.name} (inline)")

        suffix = self._inline_counter
        self._inline_counter += 1

        label_map: Dict[str, str] = {}

        def remap(label: str) -> str:
            if label not in label_map:
                label_map[label] = f"{label}__inl{suffix}"
            return label_map[label]

        for node in definition.body:
            kind = node._opcode
            data = node.data
            if kind == OP_LABEL:
                mapped = remap(str(data))
                self._emit_node(_make_op("label", mapped), builder)
                continue
            if kind == OP_JUMP:
                mapped = remap(str(data))
                self._emit_node(_make_op("jump", mapped), builder)
                continue
            if kind == OP_BRANCH_ZERO:
                mapped = remap(str(data))
                self._emit_node(_make_op("branch_zero", mapped), builder)
                continue
            if kind == OP_FOR_BEGIN:
                mapped = {
                    "loop": remap(data["loop"]),
                    "end": remap(data["end"]),
                }
                self._emit_node(_make_op("for_begin", mapped), builder)
                continue
            if kind == OP_FOR_END:
                mapped = {
                    "loop": remap(data["loop"]),
                    "end": remap(data["end"]),
                }
                self._emit_node(_make_op("for_end", mapped), builder)
                continue
            if kind == OP_LIST_BEGIN or kind == OP_LIST_END:
                mapped = remap(str(data))
                self._emit_node(_make_op(node.op, mapped), builder)
                continue
            self._emit_node(node, builder)

        self._emit_stack.pop()

    def _emit_asm_body(self, definition: AsmDefinition, builder: FunctionEmitter) -> None:
        body = definition.body.strip("\n")
        if not body:
            return
        _call_sub = _RE_ASM_CALL.sub
        _global_sub = _RE_ASM_GLOBAL.sub
        _extern_sub = _RE_ASM_EXTERN.sub
        def repl_sym(m: re.Match) -> str:
            name = m.group(1)
            return m.group(0).replace(name, sanitize_label(name))
        for line in body.splitlines():
            if not line.strip():
                continue
            if "call " in line or "global " in line or "extern " in line:
                line = _call_sub(repl_sym, line)
                line = _global_sub(repl_sym, line)
                line = _extern_sub(repl_sym, line)
            builder.emit(line)

    def _emit_asm_body_inline(self, definition: AsmDefinition, builder: FunctionEmitter) -> None:
        """Emit an asm body inline, stripping ret instructions."""
        # Cache sanitized lines on the definition to avoid re-parsing.
        cached = definition._inline_lines
        if cached is None:
            _call_sub = _RE_ASM_CALL.sub
            _global_sub = _RE_ASM_GLOBAL.sub
            _extern_sub = _RE_ASM_EXTERN.sub
            def repl_sym(m: re.Match) -> str:
                name = m.group(1)
                return m.group(0).replace(name, sanitize_label(name))
            cached = []
            body = definition.body.strip("\n")
            for line in body.splitlines():
                stripped = line.strip()
                if not stripped or stripped == "ret":
                    continue
                if "call " in line or "global " in line or "extern " in line:
                    line = _call_sub(repl_sym, line)
                    line = _global_sub(repl_sym, line)
                    line = _extern_sub(repl_sym, line)
                cached.append(line)
            definition._inline_lines = cached
        text = builder.text
        text.extend(cached)

    def _emit_node(self, node: Op, builder: FunctionEmitter) -> None:
        kind = node._opcode
        data = node.data
        builder.set_location(node.loc)

        if kind == OP_WORD:
            self._emit_wordref(data, builder)
            return

        if kind == OP_LITERAL:
            if isinstance(data, int):
                builder.push_literal(data)
                return
            if isinstance(data, float):
                label = self._intern_float_literal(data)
                builder.push_float(label)
                return
            if isinstance(data, str):
                label, length = self._intern_string_literal(data)
                builder.push_label(label)
                builder.push_literal(length)
                return
            raise CompileError(f"unsupported literal type {type(data)!r} while emitting '{self._emit_stack[-1]}'" if self._emit_stack else f"unsupported literal type {type(data)!r}")

        if kind == OP_WORD_PTR:
            self._emit_wordptr(data, builder)
            return

        if kind == OP_BRANCH_ZERO:
            self._emit_branch_zero(data, builder)
            return

        if kind == OP_JUMP:
            builder.emit(f"    jmp {data}")
            return

        if kind == OP_LABEL:
            builder.emit(f"{data}:")
            return

        if kind == OP_FOR_BEGIN:
            self._emit_for_begin(data, builder)
            return

        if kind == OP_FOR_END:
            self._emit_for_next(data, builder)
            return

        if kind == OP_LIST_BEGIN:
            builder.comment("list begin")
            builder.emit("    mov rax, [rel list_capture_sp]")
            builder.emit("    lea rdx, [rel list_capture_stack]")
            builder.emit("    mov [rdx + rax*8], r12")
            builder.emit("    inc rax")
            builder.emit("    mov [rel list_capture_sp], rax")
            return

        if kind == OP_LIST_LITERAL:
            values = list(data or [])
            count = len(values)
            bytes_needed = (count + 1) * 8
            builder.comment("list literal")
            builder.emit("    xor rdi, rdi")
            builder.emit(f"    mov rsi, {bytes_needed}")
            builder.emit("    mov rdx, 3")
            builder.emit("    mov r10, 34")
            builder.emit("    mov r8, -1")
            builder.emit("    xor r9, r9")
            builder.emit("    mov rax, 9")
            builder.emit("    syscall")
            builder.emit(f"    mov qword [rax], {count}")
            for idx_item, value in enumerate(values):
                builder.emit(f"    mov qword [rax + {8 + idx_item * 8}], {int(value)}")
            builder.emit("    sub r12, 8")
            builder.emit("    mov [r12], rax")
            return

        if kind == OP_LIST_END:
            base = str(data)
            loop_label = f"{base}_copy_loop"
            done_label = f"{base}_copy_done"

            builder.comment("list end")
            # pop capture start pointer
            builder.emit("    mov rax, [rel list_capture_sp]")
            builder.emit("    dec rax")
            builder.emit("    mov [rel list_capture_sp], rax")
            builder.emit("    lea r11, [rel list_capture_stack]")
            builder.emit("    mov rbx, [r11 + rax*8]")
            # count = (start_r12 - r12) / 8
            builder.emit("    mov rcx, rbx")
            builder.emit("    sub rcx, r12")
            builder.emit("    shr rcx, 3")
            builder.emit("    mov [rel list_capture_tmp], rcx")

            # bytes = (count + 1) * 8
            builder.emit("    mov rsi, rcx")
            builder.emit("    inc rsi")
            builder.emit("    shl rsi, 3")

            # mmap(NULL, bytes, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0)
            builder.emit("    xor rdi, rdi")
            builder.emit("    mov rdx, 3")
            builder.emit("    mov r10, 34")
            builder.emit("    mov r8, -1")
            builder.emit("    xor r9, r9")
            builder.emit("    mov rax, 9")
            builder.emit("    syscall")

            # store length
            builder.emit("    mov rdx, [rel list_capture_tmp]")
            builder.emit("    mov [rax], rdx")

            # copy elements, preserving original push order
            builder.emit("    xor rcx, rcx")
            builder.emit(f"{loop_label}:")
            builder.emit("    cmp rcx, rdx")
            builder.emit(f"    je {done_label}")
            builder.emit("    mov r8, rdx")
            builder.emit("    dec r8")
            builder.emit("    sub r8, rcx")
            builder.emit("    shl r8, 3")
            builder.emit("    mov r9, [r12 + r8]")
            builder.emit("    mov [rax + 8 + rcx*8], r9")
            builder.emit("    inc rcx")
            builder.emit(f"    jmp {loop_label}")
            builder.emit(f"{done_label}:")

            # drop captured values and push list pointer
            builder.emit("    mov r12, rbx")
            builder.emit("    sub r12, 8")
            builder.emit("    mov [r12], rax")
            return

        raise CompileError(f"unsupported op {node!r} while emitting '{self._emit_stack[-1]}'" if self._emit_stack else f"unsupported op {node!r}")

    def _emit_mmap_alloc(self, builder: FunctionEmitter, size: int, target_reg: str = "rax") -> None:
        alloc_size = max(1, int(size))
        builder.emit("    xor rdi, rdi")
        builder.emit(f"    mov rsi, {alloc_size}")
        builder.emit("    mov rdx, 3")
        builder.emit("    mov r10, 34")
        builder.emit("    mov r8, -1")
        builder.emit("    xor r9, r9")
        builder.emit("    mov rax, 9")
        builder.emit("    syscall")
        if target_reg != "rax":
            builder.emit(f"    mov {target_reg}, rax")

    def _analyze_extern_c_type(self, type_name: str) -> Dict[str, Any]:
        size, align, cls, layout = _c_type_size_align_class(type_name, self._cstruct_layouts)
        info: Dict[str, Any] = {
            "name": _canonical_c_type_name(type_name),
            "size": size,
            "align": align,
            "class": cls,
            "kind": "struct" if layout is not None else "scalar",
            "layout": layout,
            "pass_mode": "scalar",
            "eightbytes": [],
        }
        if layout is not None:
            eb = _classify_struct_eightbytes(layout, self._cstruct_layouts)
            info["eightbytes"] = eb or []
            info["pass_mode"] = "register" if eb is not None else "memory"
        return info

    def _emit_copy_bytes_from_ptr(
        self,
        builder: FunctionEmitter,
        *,
        src_ptr_reg: str,
        dst_expr: str,
        size: int,
    ) -> None:
        copied = 0
        while copied + 8 <= size:
            builder.emit(f"    mov r11, [{src_ptr_reg} + {copied}]")
            builder.emit(f"    mov qword [{dst_expr} + {copied}], r11")
            copied += 8
        while copied < size:
            builder.emit(f"    mov r11b, byte [{src_ptr_reg} + {copied}]")
            builder.emit(f"    mov byte [{dst_expr} + {copied}], r11b")
            copied += 1

    @staticmethod
    def _pop_preceding_literal(builder: FunctionEmitter) -> Optional[int]:
        """If the last emitted instructions are a literal push, remove them and return the value."""
        text = builder.text
        n = len(text)
        if n < 3:
            return None
        # push_literal emits:  "; push N" / "sub r12, 8" / "mov qword [r12], N"
        mov_line = text[n - 1].strip()
        sub_line = text[n - 2].strip()
        cmt_line = text[n - 3].strip()
        if not (sub_line == "sub r12, 8" and mov_line.startswith("mov qword [r12],") and cmt_line.startswith("; push")):
            return None
        val_str = mov_line.split(",", 1)[1].strip()
        try:
            value = int(val_str)
        except ValueError:
            return None
        del text[n - 3:n]
        return value

    def _emit_extern_wordref(self, name: str, word: Word, builder: FunctionEmitter) -> None:
        inputs = getattr(word, "extern_inputs", 0)
        outputs = getattr(word, "extern_outputs", 0)
        signature = getattr(word, "extern_signature", None)

        if signature is None and inputs <= 0 and outputs <= 0:
            builder.emit(f"    call {name}")
            return

        arg_types = list(signature[0]) if signature else ["long"] * inputs
        ret_type = signature[1] if signature else ("long" if outputs > 0 else "void")

        # For variadic externs, consume the preceding literal as the count of
        # extra variadic arguments.  These are NOT passed to the C function as
        # a count parameter – they simply tell the compiler how many additional
        # stack values to pop and place into registers / the C stack.
        if getattr(word, "extern_variadic", False):
            va_count = self._pop_preceding_literal(builder)
            if va_count is None:
                suffix = f" while emitting '{self._emit_stack[-1]}'" if self._emit_stack else ""
                raise CompileError(
                    f"variadic extern '{name}' requires a literal arg count on TOS{suffix}"
                )
            for _ in range(va_count):
                arg_types.append("long")
            inputs += va_count

        if len(arg_types) != inputs and signature is not None:
            suffix = f" while emitting '{self._emit_stack[-1]}'" if self._emit_stack else ""
            raise CompileError(f"extern '{name}' mismatch: {inputs} inputs vs {len(arg_types)} types{suffix}")

        arg_infos = [self._analyze_extern_c_type(t) for t in arg_types]
        ret_info = self._analyze_extern_c_type(ret_type)

        regs = ["rdi", "rsi", "rdx", "rcx", "r8", "r9"]
        xmm_regs = [f"xmm{i}" for i in range(8)]

        ret_uses_sret = ret_info["kind"] == "struct" and ret_info["pass_mode"] == "memory"
        int_idx = 1 if ret_uses_sret else 0
        xmm_idx = 0

        arg_locs: List[Dict[str, Any]] = []
        stack_cursor = 0

        for info in arg_infos:
            if info["kind"] == "struct":
                if info["pass_mode"] == "register":
                    classes: List[str] = list(info["eightbytes"])
                    need_int = sum(1 for c in classes if c == "INTEGER")
                    need_xmm = sum(1 for c in classes if c == "SSE")
                    if int_idx + need_int <= len(regs) and xmm_idx + need_xmm <= len(xmm_regs):
                        chunks: List[Tuple[str, str, int]] = []
                        int_off = int_idx
                        xmm_off = xmm_idx
                        for chunk_idx, cls in enumerate(classes):
                            if cls == "SSE":
                                chunks.append((cls, xmm_regs[xmm_off], chunk_idx * 8))
                                xmm_off += 1
                            else:
                                chunks.append(("INTEGER", regs[int_off], chunk_idx * 8))
                                int_off += 1
                        int_idx = int_off
                        xmm_idx = xmm_off
                        arg_locs.append({"mode": "struct_reg", "chunks": chunks, "info": info})
                    else:
                        stack_size = _round_up(int(info["size"]), 8)
                        stack_off = stack_cursor
                        stack_cursor += stack_size
                        arg_locs.append({"mode": "struct_stack", "stack_off": stack_off, "info": info})
                else:
                    stack_size = _round_up(int(info["size"]), 8)
                    stack_off = stack_cursor
                    stack_cursor += stack_size
                    arg_locs.append({"mode": "struct_stack", "stack_off": stack_off, "info": info})
                continue

            if info["class"] == "SSE":
                if xmm_idx < len(xmm_regs):
                    arg_locs.append({"mode": "scalar_reg", "reg": xmm_regs[xmm_idx], "class": "SSE"})
                    xmm_idx += 1
                else:
                    stack_off = stack_cursor
                    stack_cursor += 8
                    arg_locs.append({"mode": "scalar_stack", "stack_off": stack_off, "class": "SSE"})
            else:
                if int_idx < len(regs):
                    arg_locs.append({"mode": "scalar_reg", "reg": regs[int_idx], "class": "INTEGER"})
                    int_idx += 1
                else:
                    stack_off = stack_cursor
                    stack_cursor += 8
                    arg_locs.append({"mode": "scalar_stack", "stack_off": stack_off, "class": "INTEGER"})

        # Preserve and realign RSP for C ABI calls regardless current call depth.
        stack_bytes = max(15, int(stack_cursor) + 15)
        builder.emit("    mov r14, rsp")
        builder.emit(f"    sub rsp, {stack_bytes}")
        builder.emit("    and rsp, -16")

        if ret_info["kind"] == "struct":
            self._emit_mmap_alloc(builder, int(ret_info["size"]), target_reg="r15")
            if ret_uses_sret:
                builder.emit("    mov rdi, r15")

        total_args = len(arg_locs)
        for idx, loc in enumerate(reversed(arg_locs)):
            addr = f"[r12 + {idx * 8}]" if idx > 0 else "[r12]"
            mode = str(loc["mode"])

            if mode == "scalar_reg":
                reg = str(loc["reg"])
                cls = str(loc["class"])
                if cls == "SSE":
                    builder.emit(f"    mov rax, {addr}")
                    builder.emit(f"    movq {reg}, rax")
                else:
                    builder.emit(f"    mov {reg}, {addr}")
                continue

            if mode == "scalar_stack":
                stack_off = int(loc["stack_off"])
                builder.emit(f"    mov rax, {addr}")
                builder.emit(f"    mov qword [rsp + {stack_off}], rax")
                continue

            if mode == "struct_reg":
                chunks: List[Tuple[str, str, int]] = list(loc["chunks"])
                builder.emit(f"    mov rax, {addr}")
                for cls, target, off in chunks:
                    if cls == "SSE":
                        builder.emit(f"    movq {target}, [rax + {off}]")
                    else:
                        builder.emit(f"    mov {target}, [rax + {off}]")
                continue

            if mode == "struct_stack":
                stack_off = int(loc["stack_off"])
                size = int(loc["info"]["size"])
                builder.emit(f"    mov rax, {addr}")
                self._emit_copy_bytes_from_ptr(builder, src_ptr_reg="rax", dst_expr=f"rsp + {stack_off}", size=size)
                continue

            raise CompileError(f"internal extern lowering error for '{name}': unknown arg mode {mode!r}")

        if total_args:
            builder.emit(f"    add r12, {total_args * 8}")

        builder.emit(f"    mov al, {xmm_idx}")
        builder.emit(f"    call {name}")

        builder.emit("    mov rsp, r14")

        if ret_info["kind"] == "struct":
            if not ret_uses_sret:
                ret_classes: List[str] = list(ret_info["eightbytes"])
                int_ret_regs = ["rax", "rdx"]
                xmm_ret_regs = ["xmm0", "xmm1"]
                int_ret_idx = 0
                xmm_ret_idx = 0
                for chunk_idx, cls in enumerate(ret_classes):
                    off = chunk_idx * 8
                    if cls == "SSE":
                        src = xmm_ret_regs[xmm_ret_idx]
                        xmm_ret_idx += 1
                        builder.emit(f"    movq [r15 + {off}], {src}")
                    else:
                        src = int_ret_regs[int_ret_idx]
                        int_ret_idx += 1
                        builder.emit(f"    mov [r15 + {off}], {src}")
            builder.emit("    sub r12, 8")
            builder.emit("    mov [r12], r15")
            return

        if _ctype_uses_sse(ret_type):
            builder.emit("    sub r12, 8")
            builder.emit("    movq rax, xmm0")
            builder.emit("    mov [r12], rax")
        elif outputs == 1:
            builder.push_from("rax")
        elif outputs > 1:
            raise CompileError("extern only supports 0 or 1 scalar output")

    def _emit_wordref(self, name: str, builder: FunctionEmitter) -> None:
        word = self.dictionary.words.get(name)
        if word is None:
            suffix = f" while emitting '{self._emit_stack[-1]}'" if self._emit_stack else ""
            raise CompileError(f"unknown word '{name}'{suffix}")
        if word.compile_only:
            suffix = f" while emitting '{self._emit_stack[-1]}'" if self._emit_stack else ""
            raise CompileError(f"word '{name}' is compile-time only and cannot be used at runtime{suffix}")
        if getattr(word, "inline", False):
            if isinstance(word.definition, Definition):
                if word.name in self._inline_stack:
                    suffix = f" while emitting '{self._emit_stack[-1]}'" if self._emit_stack else ""
                    raise CompileError(f"recursive inline expansion for '{word.name}'{suffix}")
                self._inline_stack.append(word.name)
                self._emit_inline_definition(word, builder)
                self._inline_stack.pop()
                return
            if isinstance(word.definition, AsmDefinition):
                self._emit_asm_body_inline(word.definition, builder)
                return
        if word.intrinsic:
            word.intrinsic(builder)
            return
        # Auto-inline small asm bodies even without explicit `inline` keyword.
        if self.enable_auto_inline and isinstance(word.definition, AsmDefinition) and self._asm_auto_inline_ok(word.definition):
            self._emit_asm_body_inline(word.definition, builder)
            return
        if getattr(word, "is_extern", False):
            self._emit_extern_wordref(name, word, builder)
        else:
            builder.emit(f"    call {sanitize_label(name)}")

    @staticmethod
    def _asm_auto_inline_ok(defn: AsmDefinition) -> bool:
        """Return True if *defn* is small enough to auto-inline at call sites."""
        cached = defn._inline_lines
        if cached is not None:
            return len(cached) <= _ASM_AUTO_INLINE_THRESHOLD
        count = 0
        for line in defn.body.split('\n'):
            s = line.strip()
            if not s or s == 'ret':
                continue
            if s.endswith(':'):
                return False  # labels would duplicate on multiple inlines
            if 'rsp' in s:
                return False  # references call-frame; must stay a real call
            count += 1
            if count > _ASM_AUTO_INLINE_THRESHOLD:
                return False
        return True

    def _emit_wordptr(self, name: str, builder: FunctionEmitter) -> None:
        word = self.dictionary.lookup(name)
        if word is None:
            suffix = f" while emitting '{self._emit_stack[-1]}'" if self._emit_stack else ""
            raise CompileError(f"unknown word '{name}'{suffix}")
        if getattr(word, "is_extern", False):
            builder.push_label(name)
            return
        builder.push_label(sanitize_label(name))

    def _emit_branch_zero(self, target: str, builder: FunctionEmitter) -> None:
        builder.pop_to("rax")
        builder.emit("    test rax, rax")
        builder.emit(f"    jz {target}")

    def _emit_for_begin(self, data: Dict[str, str], builder: FunctionEmitter) -> None:
        loop_label = data["loop"]
        end_label = data["end"]
        builder.pop_to("rax")
        builder.emit("    cmp rax, 0")
        builder.emit(f"    jle {end_label}")
        builder.emit("    sub r13, 8")
        builder.emit("    mov [r13], rax")
        builder.emit(f"{loop_label}:")

    def _emit_for_next(self, data: Dict[str, str], builder: FunctionEmitter) -> None:
        loop_label = data["loop"]
        end_label = data["end"]
        builder.emit("    mov rax, [r13]")
        builder.emit("    dec rax")
        builder.emit("    mov [r13], rax")
        builder.emit(f"    jg {loop_label}")
        builder.emit("    add r13, 8")
        builder.emit(f"{end_label}:")

    def _runtime_prelude(self, entry_mode: str, has_user_start: bool = False) -> List[str]:
        lines: List[str] = [
            "%define DSTK_BYTES 65536",
            "%define RSTK_BYTES 65536",
            "%define PRINT_BUF_BYTES 128",
        ]
        is_program = entry_mode == "program"
        lines.extend([
            "global sys_argc",
            "global sys_argv",
            "section .data",
            "sys_argc: dq 0",
            "sys_argv: dq 0",
            "section .text",
        ])
        # Do not emit the default `_start` stub here; it will be appended
        # after definitions have been emitted if no user `_start` was
        # provided. This avoids duplicate or partial `_start` blocks.

        return lines

    def _bss_layout(self) -> List[str]:
        return [
            "global dstack",
            "global dstack_top",
            "global rstack",
            "global rstack_top",
            "align 16",
            "dstack: resb DSTK_BYTES",
            "dstack_top:",
            "align 16",
            "rstack: resb RSTK_BYTES",
            "rstack_top:",
            "align 16",
            "print_buf: resb PRINT_BUF_BYTES",
            "print_buf_end:",
            "align 16",
            "persistent: resb 64",
            "align 16",
            "list_capture_sp: resq 1",
            "list_capture_tmp: resq 1",
            "list_capture_stack: resq 1024",
        ]

    def write_asm(self, emission: Emission, path: Path) -> None:
        path.write_text(emission.snapshot())

# ---------------------------------------------------------------------------
# Built-in macros and intrinsics
# ---------------------------------------------------------------------------


def macro_immediate(ctx: MacroContext) -> Optional[List[Op]]:
    parser = ctx.parser
    word = parser.most_recent_definition()
    if word is None:
        raise ParseError("'immediate' must follow a definition")
    word.immediate = True
    if word.definition is not None:
        word.definition.immediate = True
    return None


def macro_compile_only(ctx: MacroContext) -> Optional[List[Op]]:
    parser = ctx.parser
    word = parser.most_recent_definition()
    if word is None:
        raise ParseError("'compile-only' must follow a definition")
    word.compile_only = True
    if word.definition is not None:
        word.definition.compile_only = True
    return None


def macro_inline(ctx: MacroContext) -> Optional[List[Op]]:
    parser = ctx.parser
    next_tok = parser.peek_token()
    if next_tok is None or next_tok.lexeme not in ("word", ":asm"):
        raise ParseError("'inline' must be followed by 'word' or ':asm'")
    if parser._pending_inline_definition:
        raise ParseError("duplicate 'inline' before definition")
    parser._pending_inline_definition = True
    return None


def _require_definition_context(parser: "Parser", word_name: str) -> Definition:
    if not parser.context_stack or not isinstance(parser.context_stack[-1], Definition):
        raise ParseError(f"'{word_name}' can only appear inside a definition")
    return parser.context_stack[-1]


def macro_label(ctx: MacroContext) -> Optional[List[Op]]:
    parser = ctx.parser
    if parser._eof():
        raise ParseError("label name missing after 'label'")
    tok = parser.next_token()
    name = tok.lexeme
    if not _is_identifier(name):
        raise ParseError(f"invalid label name '{name}'")
    definition = _require_definition_context(parser, "label")
    if any(node._opcode == OP_LABEL and node.data == name for node in definition.body):
        raise ParseError(f"duplicate label '{name}' in definition '{definition.name}'")
    parser.emit_node(_make_op("label", name))
    return None


def macro_goto(ctx: MacroContext) -> Optional[List[Op]]:
    parser = ctx.parser
    if parser._eof():
        raise ParseError("label name missing after 'goto'")
    tok = parser.next_token()
    name = tok.lexeme
    if not _is_identifier(name):
        raise ParseError(f"invalid label name '{name}'")
    _require_definition_context(parser, "goto")
    parser.emit_node(_make_op("jump", name))
    return None


def macro_compile_time(ctx: MacroContext) -> Optional[List[Op]]:
    """Run the next word at compile time and still emit it for runtime."""
    parser = ctx.parser
    if parser._eof():
        raise ParseError("word name missing after 'compile-time'")
    tok = parser.next_token()
    name = tok.lexeme
    word = parser.dictionary.lookup(name)
    if word is None:
        raise ParseError(f"unknown word '{name}' for compile-time")
    if word.compile_only:
        raise ParseError(f"word '{name}' is compile-time only")
    parser.compile_time_vm.invoke(word)
    parser.compile_time_vm._ct_executed.add(name)
    if isinstance(parser.context_stack[-1], Definition):
        parser.emit_node(_make_op("word", name))
    return None


def macro_with(ctx: MacroContext) -> Optional[List[Op]]:
    parser = ctx.parser

    names: List[str] = []
    template: Optional[Token] = None
    seen: set[str] = set()
    while True:
        if parser._eof():
            raise ParseError("missing 'in' after 'with'")
        tok = parser.next_token()
        template = template or tok
        if tok.lexeme == "in":
            break
        if not _is_identifier(tok.lexeme):
            raise ParseError("invalid variable name in 'with'")
        if tok.lexeme in seen:
            raise ParseError("duplicate variable name in 'with'")
        seen.add(tok.lexeme)
        names.append(tok.lexeme)
    if not names:
        raise ParseError("'with' requires at least one variable name")

    body: List[Token] = []
    else_line: Optional[int] = None
    depth = 0
    while True:
        if parser._eof():
            raise ParseError("unterminated 'with' block (missing 'end')")
        tok = parser.next_token()
        if else_line is not None and tok.line != else_line:
            else_line = None
        if tok.lexeme == "end":
            if depth == 0:
                break
            depth -= 1
            body.append(tok)
            continue
        if tok.lexeme == "if":
            # Support shorthand elif form `else <cond> if` inside with-blocks.
            # This inline `if` shares the same closing `end` as the preceding
            # branch and therefore must not increment nesting depth.
            if else_line != tok.line:
                depth += 1
        elif tok.lexeme == "else":
            else_line = tok.line
        elif tok.lexeme in parser.block_openers:
            depth += 1
        body.append(tok)

    helper_for: Dict[str, str] = {}
    for name in names:
        _, helper = parser.allocate_variable(name)
        helper_for[name] = helper

    emitted_tokens: List[Token] = []

    def _emit_lex(lex: str, src_tok: Optional[Token] = None) -> None:
        base = src_tok or template or Token(lexeme="", line=0, column=0, start=0, end=0)
        emitted_tokens.append(
            Token(
                lexeme=lex,
                line=base.line,
                column=base.column,
                start=base.start,
                end=base.end,
            )
        )

    # Initialize variables by storing current stack values into their buffers
    for name in reversed(names):
        helper = helper_for[name]
        _emit_lex(helper, template)
        _emit_lex("swap", template)
        _emit_lex("!", template)

    i = 0
    while i < len(body):
        tok = body[i]
        name = tok.lexeme
        helper = helper_for.get(name)
        if helper is not None:
            next_tok = body[i + 1] if i + 1 < len(body) else None
            if next_tok is not None and next_tok.lexeme == "!":
                _emit_lex(helper, tok)
                _emit_lex("swap", tok)
                _emit_lex("!", tok)
                i += 2
                continue
            if next_tok is not None and next_tok.lexeme == "@":
                _emit_lex(helper, tok)
                i += 1
                continue
            _emit_lex(helper, tok)
            _emit_lex("@", tok)
            i += 1
            continue
        _emit_lex(tok.lexeme, tok)
        i += 1

    ctx.inject_token_objects(emitted_tokens)
    return None


def macro_begin_text_macro(ctx: MacroContext) -> Optional[List[Op]]:
    parser = ctx.parser
    if parser._eof():
        raise ParseError("macro name missing after 'macro'")
    name_token = parser.next_token()
    param_count = 0
    peek = parser.peek_token()
    if peek is not None:
        try:
            param_count = int(peek.lexeme, 0)
            parser.next_token()
        except ValueError:
            param_count = 0
    parser._start_macro_recording(name_token.lexeme, param_count)
    return None


def _struct_emit_definition(tokens: List[Token], template: Token, name: str, body: Sequence[str]) -> None:
    def make_token(lexeme: str) -> Token:
        return Token(
            lexeme=lexeme,
            line=template.line,
            column=template.column,
            start=template.start,
            end=template.end,
        )

    tokens.append(make_token("word"))
    tokens.append(make_token(name))
    for lexeme in body:
        tokens.append(make_token(lexeme))
    tokens.append(make_token("end"))


class SplitLexer:
    def __init__(self, parser: Parser, separators: str) -> None:
        self.parser = parser
        self.separators = set(separators)
        self.buffer: List[Token] = []

    def _fill(self) -> None:
        while not self.buffer:
            if self.parser._eof():
                raise ParseError("unexpected EOF inside custom lexer")
            token = self.parser.next_token()
            parts = _split_token_by_chars(token, self.separators)
            if not parts:
                continue
            self.buffer.extend(parts)

    def peek(self) -> Token:
        self._fill()
        return self.buffer[0]

    def pop(self) -> Token:
        token = self.peek()
        self.buffer.pop(0)
        return token

    def expect(self, lexeme: str) -> Token:
        token = self.pop()
        if token.lexeme != lexeme:
            raise ParseError(f"expected '{lexeme}' but found '{token.lexeme}'")
        return token

    def collect_brace_block(self) -> List[Token]:
        depth = 1
        collected: List[Token] = []
        while depth > 0:
            token = self.pop()
            if token.lexeme == "{":
                depth += 1
                collected.append(token)
                continue
            if token.lexeme == "}":
                depth -= 1
                if depth == 0:
                    break
                collected.append(token)
                continue
            collected.append(token)
        return collected

    def push_back(self) -> None:
        if not self.buffer:
            return
        self.parser.tokens[self.parser.pos:self.parser.pos] = self.buffer
        self.buffer = []


def _split_token_by_chars(token: Token, separators: Set[str]) -> List[Token]:
    lex = token.lexeme
    if not lex:
        return []
    parts: List[Token] = []
    idx = 0
    while idx < len(lex):
        char = lex[idx]
        if char in separators:
            parts.append(Token(
                lexeme=char,
                line=token.line,
                column=token.column + idx,
                start=token.start + idx,
                end=token.start + idx + 1,
            ))
            idx += 1
            continue
        start_idx = idx
        while idx < len(lex) and lex[idx] not in separators:
            idx += 1
        segment = lex[start_idx:idx]
        if segment:
            parts.append(Token(
                lexeme=segment,
                line=token.line,
                column=token.column + start_idx,
                start=token.start + start_idx,
                end=token.start + idx,
            ))
    return parts


def _ensure_list(value: Any) -> List[Any]:
    if not isinstance(value, list):
        raise ParseError("expected list value")
    return value


def _ensure_dict(value: Any) -> Dict[Any, Any]:
    if not isinstance(value, dict):
        raise ParseError("expected map value")
    return value


def _ensure_lexer(value: Any) -> SplitLexer:
    if not isinstance(value, SplitLexer):
        raise ParseError("expected lexer value")
    return value


def _coerce_str(value: Any) -> str:
    if isinstance(value, str):
        return value
    if isinstance(value, bool):
        return "1" if value else "0"
    if isinstance(value, int):
        return str(value)
    raise ParseError("expected string-compatible value")


def _default_template(template: Optional[Token]) -> Token:
    if template is None:
        return Token(lexeme="", line=0, column=0, start=0, end=0)
    if not isinstance(template, Token):
        raise ParseError("expected token for template")
    return template


def _ct_nil(vm: CompileTimeVM) -> None:
    vm.push(None)


def _ct_nil_p(vm: CompileTimeVM) -> None:
    vm.push(1 if vm.pop() is None else 0)


def _ct_list_new(vm: CompileTimeVM) -> None:
    vm.push([])


def _ct_list_clone(vm: CompileTimeVM) -> None:
    lst = _ensure_list(vm.pop())
    vm.push(list(lst))


def _ct_list_append(vm: CompileTimeVM) -> None:
    value = vm.pop()
    lst = _ensure_list(vm.pop())
    lst.append(value)
    vm.push(lst)


def _ct_list_pop(vm: CompileTimeVM) -> None:
    lst = _ensure_list(vm.pop())
    if not lst:
        raise ParseError("cannot pop from empty list")
    value = lst.pop()
    vm.push(lst)
    vm.push(value)


def _ct_list_pop_front(vm: CompileTimeVM) -> None:
    lst = _ensure_list(vm.pop())
    if not lst:
        raise ParseError("cannot pop from empty list")
    value = lst.pop(0)
    vm.push(lst)
    vm.push(value)


def _ct_list_peek_front(vm: CompileTimeVM) -> None:
    lst = _ensure_list(vm.pop())
    if not lst:
        raise ParseError("cannot peek from empty list")
    vm.push(lst)
    vm.push(lst[0])


def _ct_list_push_front(vm: CompileTimeVM) -> None:
    value = vm.pop()
    lst = _ensure_list(vm.pop())
    lst.insert(0, value)
    vm.push(lst)


def _ct_prelude_clear(vm: CompileTimeVM) -> None:
    vm.parser.custom_prelude = []


def _ct_prelude_append(vm: CompileTimeVM) -> None:
    line = vm.pop_str()
    if vm.parser.custom_prelude is None:
        vm.parser.custom_prelude = []
    vm.parser.custom_prelude.append(line)


def _ct_prelude_set(vm: CompileTimeVM) -> None:
    lines = _ensure_list(vm.pop())
    if not all(isinstance(item, str) for item in lines):
        raise ParseError("prelude-set expects list of strings")
    vm.parser.custom_prelude = list(lines)


def _ct_bss_clear(vm: CompileTimeVM) -> None:
    vm.parser.custom_bss = []


def _ct_bss_append(vm: CompileTimeVM) -> None:
    line = vm.pop_str()
    if vm.parser.custom_bss is None:
        vm.parser.custom_bss = []
    vm.parser.custom_bss.append(line)


def _ct_bss_set(vm: CompileTimeVM) -> None:
    lines = _ensure_list(vm.pop())
    if not all(isinstance(item, str) for item in lines):
        raise ParseError("bss-set expects list of strings")
    vm.parser.custom_bss = list(lines)


def _ct_list_reverse(vm: CompileTimeVM) -> None:
    lst = _ensure_list(vm.pop())
    lst.reverse()
    vm.push(lst)


def _ct_list_length(vm: CompileTimeVM) -> None:
    lst = vm.pop_list()
    vm.push(len(lst))


def _ct_list_empty(vm: CompileTimeVM) -> None:
    lst = _ensure_list(vm.pop())
    vm.push(1 if not lst else 0)


def _ct_loop_index(vm: CompileTimeVM) -> None:
    if not vm.loop_stack:
        raise ParseError("'i' used outside of a for loop")
    frame = vm.loop_stack[-1]
    idx = frame["initial"] - frame["remaining"]
    vm.push(idx)


def _ct_control_frame_new(vm: CompileTimeVM) -> None:
    type_name = vm.pop_str()
    vm.push({"type": type_name})


def _ct_control_get(vm: CompileTimeVM) -> None:
    key = vm.pop_str()
    frame = vm.pop()
    if not isinstance(frame, dict):
        raise ParseError("ct-control-get expects a control frame")
    vm.push(frame.get(key))


def _ct_control_set(vm: CompileTimeVM) -> None:
    value = vm.pop()
    key = vm.pop_str()
    frame = vm.pop()
    if not isinstance(frame, dict):
        raise ParseError("ct-control-set expects a control frame")
    frame[key] = value
    vm.push(frame)


def _ct_control_push(vm: CompileTimeVM) -> None:
    frame = vm.pop()
    if not isinstance(frame, dict):
        raise ParseError("ct-control-push expects a control frame")
    vm.parser._push_control(dict(frame))


def _ct_control_pop(vm: CompileTimeVM) -> None:
    if not vm.parser.control_stack:
        raise ParseError("control stack underflow")
    vm.push(dict(vm.parser.control_stack.pop()))


def _ct_control_peek(vm: CompileTimeVM) -> None:
    if not vm.parser.control_stack:
        vm.push(None)
        return
    vm.push(dict(vm.parser.control_stack[-1]))


def _ct_control_depth(vm: CompileTimeVM) -> None:
    vm.push(len(vm.parser.control_stack))


def _ct_new_label(vm: CompileTimeVM) -> None:
    prefix = vm.pop_str()
    vm.push(vm.parser._new_label(prefix))


def _ct_emit_op(vm: CompileTimeVM) -> None:
    data = vm.pop()
    op_name = vm.pop_str()
    vm.parser.emit_node(_make_op(op_name, data))


def _ct_control_add_close_op(vm: CompileTimeVM) -> None:
    data = vm.pop()
    op_name = vm.pop_str()
    frame = vm.pop()
    if not isinstance(frame, dict):
        raise ParseError("ct-control-add-close-op expects a control frame")
    close_ops = frame.get("close_ops")
    if close_ops is None:
        close_ops = []
    elif not isinstance(close_ops, list):
        raise ParseError("control frame field 'close_ops' must be a list")
    close_ops.append({"op": op_name, "data": data})
    frame["close_ops"] = close_ops
    vm.push(frame)


def _ct_last_token_line(vm: CompileTimeVM) -> None:
    tok = vm.parser._last_token
    vm.push(0 if tok is None else tok.line)


def _ct_register_block_opener(vm: CompileTimeVM) -> None:
    name = vm.pop_str()
    vm.parser.block_openers.add(name)


def _ct_unregister_block_opener(vm: CompileTimeVM) -> None:
    name = vm.pop_str()
    vm.parser.block_openers.discard(name)


def _ct_register_control_override(vm: CompileTimeVM) -> None:
    name = vm.pop_str()
    vm.parser.control_overrides.add(name)


def _ct_unregister_control_override(vm: CompileTimeVM) -> None:
    name = vm.pop_str()
    vm.parser.control_overrides.discard(name)


def _ct_list_get(vm: CompileTimeVM) -> None:
    index = vm.pop_int()
    lst = _ensure_list(vm.pop())
    try:
        vm.push(lst[index])
    except IndexError as exc:
        raise ParseError("list index out of range") from exc


def _ct_list_set(vm: CompileTimeVM) -> None:
    value = vm.pop()
    index = vm.pop_int()
    lst = _ensure_list(vm.pop())
    try:
        lst[index] = value
    except IndexError as exc:
        raise ParseError("list index out of range") from exc
    vm.push(lst)


def _ct_list_clear(vm: CompileTimeVM) -> None:
    lst = _ensure_list(vm.pop())
    lst.clear()
    vm.push(lst)


def _ct_list_extend(vm: CompileTimeVM) -> None:
    source = _ensure_list(vm.pop())
    target = _ensure_list(vm.pop())
    target.extend(source)
    vm.push(target)


def _ct_list_last(vm: CompileTimeVM) -> None:
    lst = _ensure_list(vm.pop())
    if not lst:
        raise ParseError("list is empty")
    vm.push(lst[-1])


def _ct_map_new(vm: CompileTimeVM) -> None:
    vm.push({})


def _ct_map_set(vm: CompileTimeVM) -> None:
    value = vm.pop()
    key = vm.pop()
    map_obj = _ensure_dict(vm.pop())
    map_obj[key] = value
    vm.push(map_obj)


def _ct_map_get(vm: CompileTimeVM) -> None:
    key = vm.pop()
    map_obj = _ensure_dict(vm.pop())
    vm.push(map_obj)
    if key in map_obj:
        vm.push(map_obj[key])
        vm.push(1)
    else:
        vm.push(None)
        vm.push(0)


def _ct_map_has(vm: CompileTimeVM) -> None:
    key = vm.pop()
    map_obj = _ensure_dict(vm.pop())
    vm.push(map_obj)
    vm.push(1 if key in map_obj else 0)


def _ct_string_eq(vm: CompileTimeVM) -> None:
    try:
        right = vm.pop_str()
        left = vm.pop_str()
    except ParseError as exc:
        raise ParseError(f"string= expects strings; stack={vm.stack!r}") from exc
    vm.push(1 if left == right else 0)


def _ct_string_length(vm: CompileTimeVM) -> None:
    value = vm.pop_str()
    vm.push(len(value))


def _ct_string_append(vm: CompileTimeVM) -> None:
    right = vm.pop_str()
    left = vm.pop_str()
    vm.push(left + right)


def _ct_string_to_number(vm: CompileTimeVM) -> None:
    text = vm.pop_str()
    try:
        value = int(text, 0)
        vm.push(value)
        vm.push(1)
    except ValueError:
        vm.push(0)
        vm.push(0)


def _ct_set_token_hook(vm: CompileTimeVM) -> None:
    hook_name = vm.pop_str()
    vm.parser.token_hook = hook_name


def _ct_clear_token_hook(vm: CompileTimeVM) -> None:
    vm.parser.token_hook = None


def _ct_use_l2_compile_time(vm: CompileTimeVM) -> None:
    if vm.stack:
        name = vm.pop_str()
        word = vm.dictionary.lookup(name)
    else:
        word = vm.parser.most_recent_definition()
        if word is None:
            raise ParseError("use-l2-ct with empty stack and no recent definition")
        name = word.name
    if word is None:
        raise ParseError(f"unknown word '{name}' for use-l2-ct")
    word.compile_time_intrinsic = None
    word.compile_time_override = True


def _ct_add_token(vm: CompileTimeVM) -> None:
    tok = vm.pop_str()
    vm.parser.reader.add_tokens([tok])


def _ct_add_token_chars(vm: CompileTimeVM) -> None:
    chars = vm.pop_str()
    vm.parser.reader.add_token_chars(chars)


def _ct_shunt(vm: CompileTimeVM) -> None:
    """Convert an infix token list (strings) to postfix using +,-,*,/,%."""
    ops: List[str] = []
    output: List[str] = []
    prec = {"+": 1, "-": 1, "*": 2, "/": 2, "%": 2}
    tokens = _ensure_list(vm.pop())
    for tok in tokens:
        if not isinstance(tok, str):
            raise ParseError("shunt expects list of strings")
        if tok == "(":
            ops.append(tok)
            continue
        if tok == ")":
            while ops and ops[-1] != "(":
                output.append(ops.pop())
            if not ops:
                raise ParseError("mismatched parentheses in expression")
            ops.pop()
            continue
        if tok in prec:
            while ops and ops[-1] in prec and prec[ops[-1]] >= prec[tok]:
                output.append(ops.pop())
            ops.append(tok)
            continue
        output.append(tok)
    while ops:
        top = ops.pop()
        if top == "(":
            raise ParseError("mismatched parentheses in expression")
        output.append(top)
    vm.push(output)


def _ct_int_to_string(vm: CompileTimeVM) -> None:
    value = vm.pop_int()
    vm.push(str(value))


def _ct_identifier_p(vm: CompileTimeVM) -> None:
    value = vm._resolve_handle(vm.pop())
    if isinstance(value, Token):
        value = value.lexeme
    if not isinstance(value, str):
        vm.push(0)
        return
    vm.push(1 if _is_identifier(value) else 0)


def _ct_token_lexeme(vm: CompileTimeVM) -> None:
    value = vm._resolve_handle(vm.pop())
    if isinstance(value, Token):
        vm.push(value.lexeme)
        return
    if isinstance(value, str):
        vm.push(value)
        return
    raise ParseError("expected token or string on compile-time stack")


def _ct_token_from_lexeme(vm: CompileTimeVM) -> None:
    template_value = vm.pop()
    lexeme = vm.pop_str()
    template = _default_template(template_value)
    vm.push(Token(
        lexeme=lexeme,
        line=template.line,
        column=template.column,
        start=template.start,
        end=template.end,
    ))


def _ct_next_token(vm: CompileTimeVM) -> None:
    token = vm.parser.next_token()
    vm.push(token)


def _ct_peek_token(vm: CompileTimeVM) -> None:
    vm.push(vm.parser.peek_token())


def _ct_inject_tokens(vm: CompileTimeVM) -> None:
    tokens = _ensure_list(vm.pop())
    if not all(isinstance(item, Token) for item in tokens):
        raise ParseError("inject-tokens expects a list of tokens")
    vm.parser.inject_token_objects(tokens)


def _ct_emit_definition(vm: CompileTimeVM) -> None:
    body = _ensure_list(vm.pop())
    name_value = vm.pop()
    if isinstance(name_value, Token):
        template = name_value
        name = name_value.lexeme
    elif isinstance(name_value, str):
        template = _default_template(vm.pop())
        name = name_value
    else:
        raise ParseError("emit-definition expects token or string for name")
    lexemes = [
        item.lexeme if isinstance(item, Token) else _coerce_str(item)
        for item in body
    ]
    generated: List[Token] = []
    _struct_emit_definition(generated, template, name, lexemes)
    vm.parser.inject_token_objects(generated)


def _ct_parse_error(vm: CompileTimeVM) -> None:
    message = vm.pop_str()
    raise ParseError(message)


def _ct_static_assert(vm: CompileTimeVM) -> None:
    condition = vm._resolve_handle(vm.pop())
    if isinstance(condition, bool):
        ok = condition
    elif isinstance(condition, int):
        ok = condition != 0
    else:
        raise ParseError(
            f"static_assert expects integer/boolean condition, got {type(condition).__name__}"
        )
    if not ok:
        loc = vm.current_location
        if loc is not None:
            raise ParseError(f"static assertion failed at {loc.path}:{loc.line}:{loc.column}")
        raise ParseError("static assertion failed")


def _ct_lexer_new(vm: CompileTimeVM) -> None:
    separators = vm.pop_str()
    vm.push(SplitLexer(vm.parser, separators))


def _ct_lexer_pop(vm: CompileTimeVM) -> None:
    lexer = _ensure_lexer(vm.pop())
    token = lexer.pop()
    vm.push(lexer)
    vm.push(token)


def _ct_lexer_peek(vm: CompileTimeVM) -> None:
    lexer = _ensure_lexer(vm.pop())
    vm.push(lexer)
    vm.push(lexer.peek())


def _ct_lexer_expect(vm: CompileTimeVM) -> None:
    lexeme = vm.pop_str()
    lexer = _ensure_lexer(vm.pop())
    token = lexer.expect(lexeme)
    vm.push(lexer)
    vm.push(token)


def _ct_lexer_collect_brace(vm: CompileTimeVM) -> None:
    lexer = _ensure_lexer(vm.pop())
    vm.push(lexer)
    vm.push(lexer.collect_brace_block())


def _ct_lexer_push_back(vm: CompileTimeVM) -> None:
    lexer = _ensure_lexer(vm.pop())
    lexer.push_back()
    vm.push(lexer)


def _ct_eval(vm: CompileTimeVM) -> None:
    """Pop a string from TOS and execute it in the compile-time VM."""
    if vm.runtime_mode:
        length = vm.pop_int()
        addr = vm.pop_int()
        source = ctypes.string_at(addr, length).decode("utf-8")
    else:
        source = vm.pop_str()
    tokens = list(vm.parser.reader.tokenize(source))
    # Parse as if inside a definition body to get Op nodes
    parser = vm.parser
    # Save parser state
    old_tokens = parser.tokens
    old_pos = parser.pos
    old_iter = parser._token_iter
    old_exhausted = parser._token_iter_exhausted
    old_source = parser.source
    # Set up temporary token stream
    parser.tokens = list(tokens)
    parser.pos = 0
    parser._token_iter = iter([])
    parser._token_iter_exhausted = True
    parser.source = "<eval>"
    # Collect ops by capturing what _handle_token appends
    temp_defn = Definition(name="__eval__", body=[])
    parser.context_stack.append(temp_defn)
    try:
        while not parser._eof():
            token = parser._consume()
            parser._handle_token(token)
    finally:
        parser.context_stack.pop()
        # Restore parser state
        parser.tokens = old_tokens
        parser.pos = old_pos
        parser._token_iter = old_iter
        parser._token_iter_exhausted = old_exhausted
        parser.source = old_source
    # Execute collected ops in the VM
    if temp_defn.body:
        vm._execute_nodes(temp_defn.body)


# ---------------------------------------------------------------------------
# Runtime intrinsics that cannot run as native JIT  (for --ct-run-main)
# ---------------------------------------------------------------------------

def _rt_exit(vm: CompileTimeVM) -> None:
    code = vm.pop_int()
    raise _CTVMExit(code)


def _rt_jmp(vm: CompileTimeVM) -> None:
    target = vm.pop()
    resolved = vm._resolve_handle(target)
    if isinstance(resolved, Word):
        vm._call_word(resolved)
        raise _CTVMReturn()
    if isinstance(resolved, bool):
        raise _CTVMJump(int(resolved))
    if not isinstance(resolved, int):
        raise ParseError(
            f"jmp expects an address or word pointer, got {type(resolved).__name__}: {resolved!r}"
        )
    raise _CTVMJump(resolved)


def _rt_syscall(vm: CompileTimeVM) -> None:
    """Execute a real Linux syscall via a JIT stub, intercepting exit/exit_group."""
    # Lazily compile the syscall JIT stub
    stub = vm._jit_cache.get("__syscall_stub")
    if stub is None:
        stub = _compile_syscall_stub(vm)
        vm._jit_cache["__syscall_stub"] = stub

    # out[0] = final r12, out[1] = final r13, out[2] = flag (0=normal, 1=exit, code in out[3])
    out = vm._jit_out4
    stub(vm.r12, vm.r13, vm._jit_out4_addr)
    vm.r12 = out[0]
    vm.r13 = out[1]
    if out[2] == 1:
        raise _CTVMExit(out[3])


def _compile_syscall_stub(vm: CompileTimeVM) -> Any:
    """JIT-compile a native syscall stub that intercepts exit/exit_group."""
    if Ks is None:
        raise ParseError("keystone-engine is required for JIT syscall execution")

    # The stub uses the same wrapper convention as _compile_jit:
    #   rdi = r12 (data stack ptr), rsi = r13 (return stack ptr), rdx = output ptr
    # Output struct: [r12, r13, exit_flag, exit_code]
    #
    # Stack protocol (matching _emit_syscall_intrinsic):
    #   TOS:   syscall number -> rax
    #   TOS-1: arg count -> rcx
    #   then args on stack as ... arg0 arg1 ... argN (argN is top)
    #

    lines = [
        "_stub_entry:",
        "    push rbx",
        "    push r12",
        "    push r13",
        "    push r14",
        "    push r15",
        "    sub rsp, 16",
        "    mov [rsp], rdx",        # save output-struct pointer
        "    mov r12, rdi",          # data stack
        "    mov r13, rsi",          # return stack
        # Pop syscall number
        "    mov rax, [r12]",
        "    add r12, 8",
        # Pop arg count
        "    mov rcx, [r12]",
        "    add r12, 8",
        # Clamp to [0,6]
        "    cmp rcx, 0",
        "    jge _count_nonneg",
        "    xor rcx, rcx",
        "_count_nonneg:",
        "    cmp rcx, 6",
        "    jle _count_clamped",
        "    mov rcx, 6",
        "_count_clamped:",
        # Save syscall num in r15
        "    mov r15, rax",
        # Check for exit (60) / exit_group (231)
        "    cmp r15, 60",
        "    je _do_exit",
        "    cmp r15, 231",
        "    je _do_exit",
        # Clear syscall arg registers
        "    xor rdi, rdi",
        "    xor rsi, rsi",
        "    xor rdx, rdx",
        "    xor r10, r10",
        "    xor r8, r8",
        "    xor r9, r9",
        # Pop args in the same order as _emit_syscall_intrinsic
        "    cmp rcx, 6",
        "    jl _skip_r9",
        "    mov r9, [r12]",
        "    add r12, 8",
        "_skip_r9:",
        "    cmp rcx, 5",
        "    jl _skip_r8",
        "    mov r8, [r12]",
        "    add r12, 8",
        "_skip_r8:",
        "    cmp rcx, 4",
        "    jl _skip_r10",
        "    mov r10, [r12]",
        "    add r12, 8",
        "_skip_r10:",
        "    cmp rcx, 3",
        "    jl _skip_rdx",
        "    mov rdx, [r12]",
        "    add r12, 8",
        "_skip_rdx:",
        "    cmp rcx, 2",
        "    jl _skip_rsi",
        "    mov rsi, [r12]",
        "    add r12, 8",
        "_skip_rsi:",
        "    cmp rcx, 1",
        "    jl _skip_rdi",
        "    mov rdi, [r12]",
        "    add r12, 8",
        "_skip_rdi:",
        "    mov rax, r15",          # syscall number
        "    syscall",
        # Push result
        "    sub r12, 8",
        "    mov [r12], rax",
        # Normal return: flag=0
        "    mov rax, [rsp]",        # output-struct pointer
        "    mov qword [rax], r12",
        "    mov qword [rax+8], r13",
        "    mov qword [rax+16], 0", # exit_flag = 0
        "    mov qword [rax+24], 0", # exit_code = 0
        "    jmp _stub_epilogue",
        # Exit path: don't actually call syscall, just report it
        "_do_exit:",
        "    xor rbx, rbx",
        "    cmp rcx, 1",
        "    jl _exit_code_ready",
        "    mov rbx, [r12]",        # arg0 = exit code (for exit/exit_group)
        "    add r12, 8",
        "_exit_code_ready:",
        "    mov rax, [rsp]",        # output-struct pointer
        "    mov qword [rax], r12",
        "    mov qword [rax+8], r13",
        "    mov qword [rax+16], 1", # exit_flag = 1
        "    mov [rax+24], rbx",     # exit_code
        "_stub_epilogue:",
        "    add rsp, 16",
        "    pop r15",
        "    pop r14",
        "    pop r13",
        "    pop r12",
        "    pop rbx",
        "    ret",
    ]

    def _norm(l: str) -> str:
        l = l.split(";", 1)[0].rstrip()
        for sz in ("qword", "dword", "word", "byte"):
            l = l.replace(f"{sz} [", f"{sz} ptr [")
        return l
    normalized = [_norm(l) for l in lines if _norm(l).strip()]

    ks = Ks(KS_ARCH_X86, KS_MODE_64)
    try:
        encoding, _ = ks.asm("\n".join(normalized))
    except KsError as exc:
        debug_txt = "\n".join(normalized)
        raise ParseError(f"JIT syscall stub assembly failed: {exc}\n--- asm ---\n{debug_txt}\n--- end ---") from exc
    if encoding is None:
        raise ParseError("JIT syscall stub produced no code")

    code = bytes(encoding)
    page_size = max(len(code), 4096)
    _libc = ctypes.CDLL(None, use_errno=True)
    _libc.mmap.restype = ctypes.c_void_p
    _libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int,
                            ctypes.c_int, ctypes.c_int, ctypes.c_long]
    PROT_RWX = 0x1 | 0x2 | 0x4
    MAP_PRIVATE = 0x02
    MAP_ANONYMOUS = 0x20
    ptr = _libc.mmap(None, page_size, PROT_RWX, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)
    if ptr == ctypes.c_void_p(-1).value or ptr is None:
        raise RuntimeError("mmap failed for JIT syscall stub")
    ctypes.memmove(ptr, code, len(code))
    vm._jit_code_pages.append((ptr, page_size))
    # Same signature: (r12, r13, out_ptr) -> void
    if CompileTimeVM._JIT_FUNC_TYPE is None:
        CompileTimeVM._JIT_FUNC_TYPE = ctypes.CFUNCTYPE(None, ctypes.c_int64, ctypes.c_int64, ctypes.c_void_p)
    func = CompileTimeVM._JIT_FUNC_TYPE(ptr)
    return func


def _register_runtime_intrinsics(dictionary: Dictionary) -> None:
    """Register runtime intrinsics only for words that cannot run as native JIT.

    Most :asm words now run as native JIT-compiled machine code on real
    memory stacks.  Only a handful need Python-level interception:
      - exit    : must not actually call sys_exit (would kill the compiler)
      - jmp     : needs interpreter-level IP manipulation
      - syscall : the ``syscall`` word is compiler-generated (no asm body);
                  intercept to block sys_exit and handle safely
    Note: get_addr is handled inline in _execute_nodes before _call_word.
    """
    _RT_MAP: Dict[str, Callable[[CompileTimeVM], None]] = {
        "exit": _rt_exit,
        "jmp": _rt_jmp,
        "syscall": _rt_syscall,
    }
    for name, func in _RT_MAP.items():
        word = dictionary.lookup(name)
        if word is None:
            word = Word(name=name)
            dictionary.register(word)
        word.runtime_intrinsic = func


def _register_compile_time_primitives(dictionary: Dictionary) -> None:
    def register(name: str, func: Callable[[CompileTimeVM], None], *, compile_only: bool = False) -> None:
        word = dictionary.lookup(name)
        if word is None:
            word = Word(name=name)
            dictionary.register(word)
        word.compile_time_intrinsic = func
        if compile_only:
            word.compile_only = True

    register("nil", _ct_nil, compile_only=True)
    register("nil?", _ct_nil_p, compile_only=True)
    register("list-new", _ct_list_new, compile_only=True)
    register("list-clone", _ct_list_clone, compile_only=True)
    register("list-append", _ct_list_append, compile_only=True)
    register("list-pop", _ct_list_pop, compile_only=True)
    register("list-pop-front", _ct_list_pop_front, compile_only=True)
    register("list-peek-front", _ct_list_peek_front, compile_only=True)
    register("list-push-front", _ct_list_push_front, compile_only=True)
    register("list-reverse", _ct_list_reverse, compile_only=True)
    register("list-length", _ct_list_length, compile_only=True)
    register("list-empty?", _ct_list_empty, compile_only=True)
    register("list-get", _ct_list_get, compile_only=True)
    register("list-set", _ct_list_set, compile_only=True)
    register("list-clear", _ct_list_clear, compile_only=True)
    register("list-extend", _ct_list_extend, compile_only=True)
    register("list-last", _ct_list_last, compile_only=True)
    register("i", _ct_loop_index, compile_only=True)
    register("ct-control-frame-new", _ct_control_frame_new, compile_only=True)
    register("ct-control-get", _ct_control_get, compile_only=True)
    register("ct-control-set", _ct_control_set, compile_only=True)
    register("ct-control-push", _ct_control_push, compile_only=True)
    register("ct-control-pop", _ct_control_pop, compile_only=True)
    register("ct-control-peek", _ct_control_peek, compile_only=True)
    register("ct-control-depth", _ct_control_depth, compile_only=True)
    register("ct-control-add-close-op", _ct_control_add_close_op, compile_only=True)
    register("ct-new-label", _ct_new_label, compile_only=True)
    register("ct-emit-op", _ct_emit_op, compile_only=True)
    register("ct-last-token-line", _ct_last_token_line, compile_only=True)
    register("ct-register-block-opener", _ct_register_block_opener, compile_only=True)
    register("ct-unregister-block-opener", _ct_unregister_block_opener, compile_only=True)
    register("ct-register-control-override", _ct_register_control_override, compile_only=True)
    register("ct-unregister-control-override", _ct_unregister_control_override, compile_only=True)

    register("prelude-clear", _ct_prelude_clear, compile_only=True)
    register("prelude-append", _ct_prelude_append, compile_only=True)
    register("prelude-set", _ct_prelude_set, compile_only=True)
    register("bss-clear", _ct_bss_clear, compile_only=True)
    register("bss-append", _ct_bss_append, compile_only=True)
    register("bss-set", _ct_bss_set, compile_only=True)

    register("map-new", _ct_map_new, compile_only=True)
    register("map-set", _ct_map_set, compile_only=True)
    register("map-get", _ct_map_get, compile_only=True)
    register("map-has?", _ct_map_has, compile_only=True)

    register("string=", _ct_string_eq, compile_only=True)
    register("string-length", _ct_string_length, compile_only=True)
    register("string-append", _ct_string_append, compile_only=True)
    register("string>number", _ct_string_to_number, compile_only=True)
    register("int>string", _ct_int_to_string, compile_only=True)
    register("identifier?", _ct_identifier_p, compile_only=True)
    register("shunt", _ct_shunt, compile_only=True)

    register("token-lexeme", _ct_token_lexeme, compile_only=True)
    register("token-from-lexeme", _ct_token_from_lexeme, compile_only=True)
    register("next-token", _ct_next_token, compile_only=True)
    register("peek-token", _ct_peek_token, compile_only=True)
    register("inject-tokens", _ct_inject_tokens, compile_only=True)
    register("add-token", _ct_add_token, compile_only=True)
    register("add-token-chars", _ct_add_token_chars, compile_only=True)
    register("set-token-hook", _ct_set_token_hook, compile_only=True)
    register("clear-token-hook", _ct_clear_token_hook, compile_only=True)
    register("use-l2-ct", _ct_use_l2_compile_time, compile_only=True)
    word_use_l2 = dictionary.lookup("use-l2-ct")
    if word_use_l2:
        word_use_l2.immediate = True
    register("emit-definition", _ct_emit_definition, compile_only=True)
    register("parse-error", _ct_parse_error, compile_only=True)
    register("static_assert", _ct_static_assert, compile_only=True)

    register("lexer-new", _ct_lexer_new, compile_only=True)
    register("lexer-pop", _ct_lexer_pop, compile_only=True)
    register("lexer-peek", _ct_lexer_peek, compile_only=True)
    register("lexer-expect", _ct_lexer_expect, compile_only=True)
    register("lexer-collect-brace", _ct_lexer_collect_brace, compile_only=True)
    register("lexer-push-back", _ct_lexer_push_back, compile_only=True)
    register("eval", _ct_eval, compile_only=True)


PY_EXEC_GLOBALS: Dict[str, Any] = {
    "MacroContext": MacroContext,
    "Token": Token,
    "Op": Op,
    "StructField": StructField,
    "Definition": Definition,
    "Module": Module,
    "ParseError": ParseError,
    "emit_definition": _struct_emit_definition,
    "is_identifier": _is_identifier,
}


def _parse_cfield_type(parser: Parser, struct_name: str) -> str:
    if parser._eof():
        raise ParseError(f"field type missing in cstruct '{struct_name}'")
    tok = parser.next_token().lexeme

    if tok == "struct":
        if parser._eof():
            raise ParseError(f"struct field type missing name in cstruct '{struct_name}'")
        name_tok = parser.next_token().lexeme
        type_name = f"struct {name_tok}"
        if not parser._eof():
            peek = parser.peek_token()
            if peek is not None and set(peek.lexeme) == {"*"}:
                type_name += peek.lexeme
                parser.next_token()
        return _canonical_c_type_name(type_name)

    canonical = _canonical_c_type_name(tok)
    return _canonical_c_type_name(_C_FIELD_TYPE_ALIASES.get(canonical, canonical))


def macro_struct_begin(ctx: MacroContext) -> Optional[List[Op]]:
    parser = ctx.parser
    if parser._eof():
        raise ParseError("struct name missing after 'struct'")
    name_token = parser.next_token()
    struct_name = name_token.lexeme
    fields: List[StructField] = []
    current_offset = 0
    while True:
        if parser._eof():
            raise ParseError("unterminated struct definition (missing 'end')")
        token = parser.next_token()
        if token.lexeme == "end":
            break
        if token.lexeme != "field":
            raise ParseError(
                f"expected 'field' or 'end' in struct '{struct_name}' definition"
            )
        if parser._eof():
            raise ParseError("field name missing in struct definition")
        field_name_token = parser.next_token()
        if parser._eof():
            raise ParseError(f"field size missing for '{field_name_token.lexeme}'")
        size_token = parser.next_token()
        try:
            field_size = int(size_token.lexeme, 0)
        except ValueError as exc:
            raise ParseError(
                f"invalid field size '{size_token.lexeme}' in struct '{struct_name}'"
            ) from exc
        fields.append(StructField(field_name_token.lexeme, current_offset, field_size))
        current_offset += field_size

    generated: List[Token] = []
    _struct_emit_definition(generated, name_token, f"{struct_name}.size", [str(current_offset)])
    for field in fields:
        size_word = f"{struct_name}.{field.name}.size"
        offset_word = f"{struct_name}.{field.name}.offset"
        _struct_emit_definition(generated, name_token, size_word, [str(field.size)])
        _struct_emit_definition(generated, name_token, offset_word, [str(field.offset)])
        _struct_emit_definition(
            generated,
            name_token,
            f"{struct_name}.{field.name}@",
            [offset_word, "+", "@"],
        )
        _struct_emit_definition(
            generated,
            name_token,
            f"{struct_name}.{field.name}!",
            ["swap", offset_word, "+", "swap", "!"],
        )

    parser.tokens[parser.pos:parser.pos] = generated
    return None


def macro_cstruct_begin(ctx: MacroContext) -> Optional[List[Op]]:
    parser = ctx.parser
    if parser._eof():
        raise ParseError("cstruct name missing after 'cstruct'")
    name_token = parser.next_token()
    struct_name = name_token.lexeme
    fields: List[CStructField] = []
    current_offset = 0
    max_align = 1

    while True:
        if parser._eof():
            raise ParseError("unterminated cstruct definition (missing 'end')")
        token = parser.next_token()
        if token.lexeme == "end":
            break
        if token.lexeme != "cfield":
            raise ParseError(
                f"expected 'cfield' or 'end' in cstruct '{struct_name}' definition"
            )
        if parser._eof():
            raise ParseError("field name missing in cstruct definition")
        field_name_token = parser.next_token()
        type_name = _parse_cfield_type(parser, struct_name)
        field_size, field_align, _, _ = _c_type_size_align_class(type_name, parser.cstruct_layouts)
        if field_size <= 0:
            raise ParseError(
                f"invalid cfield type '{type_name}' for '{field_name_token.lexeme}' in cstruct '{struct_name}'"
            )

        current_offset = _round_up(current_offset, field_align)
        fields.append(
            CStructField(
                name=field_name_token.lexeme,
                type_name=type_name,
                offset=current_offset,
                size=field_size,
                align=field_align,
            )
        )
        current_offset += field_size
        if field_align > max_align:
            max_align = field_align

    total_size = _round_up(current_offset, max_align)
    parser.cstruct_layouts[struct_name] = CStructLayout(
        name=struct_name,
        size=total_size,
        align=max_align,
        fields=fields,
    )

    generated: List[Token] = []
    _struct_emit_definition(generated, name_token, f"{struct_name}.size", [str(total_size)])
    _struct_emit_definition(generated, name_token, f"{struct_name}.align", [str(max_align)])
    for field in fields:
        size_word = f"{struct_name}.{field.name}.size"
        offset_word = f"{struct_name}.{field.name}.offset"
        _struct_emit_definition(generated, name_token, size_word, [str(field.size)])
        _struct_emit_definition(generated, name_token, offset_word, [str(field.offset)])
        if field.size == 8:
            _struct_emit_definition(
                generated,
                name_token,
                f"{struct_name}.{field.name}@",
                [offset_word, "+", "@"],
            )
            _struct_emit_definition(
                generated,
                name_token,
                f"{struct_name}.{field.name}!",
                ["swap", offset_word, "+", "swap", "!"],
            )

    parser.tokens[parser.pos:parser.pos] = generated
    return None

def macro_here(ctx: MacroContext) -> Optional[List[Op]]:
    tok = ctx.parser._last_token
    if tok is None:
        return [_make_op("literal", "<source>:0:0")]
    loc = ctx.parser.location_for_token(tok)
    return [_make_op("literal", f"{loc.path.name}:{loc.line}:{loc.column}")]


def bootstrap_dictionary() -> Dictionary:
    dictionary = Dictionary()
    dictionary.register(Word(name="immediate", immediate=True, macro=macro_immediate))
    dictionary.register(Word(name="compile-only", immediate=True, macro=macro_compile_only))
    dictionary.register(Word(name="inline", immediate=True, macro=macro_inline))
    dictionary.register(Word(name="label", immediate=True, macro=macro_label))
    dictionary.register(Word(name="goto", immediate=True, macro=macro_goto))
    dictionary.register(Word(name="compile-time", immediate=True, macro=macro_compile_time))
    dictionary.register(Word(name="here", immediate=True, macro=macro_here))
    dictionary.register(Word(name="with", immediate=True, macro=macro_with))
    dictionary.register(Word(name="macro", immediate=True, macro=macro_begin_text_macro))
    dictionary.register(Word(name="struct", immediate=True, macro=macro_struct_begin))
    dictionary.register(Word(name="cstruct", immediate=True, macro=macro_cstruct_begin))
    _register_compile_time_primitives(dictionary)
    _register_runtime_intrinsics(dictionary)
    return dictionary


# ---------------------------------------------------------------------------
# Driver
# ---------------------------------------------------------------------------


class FileSpan:
    __slots__ = ('path', 'start_line', 'end_line', 'local_start_line')

    def __init__(self, path: Path, start_line: int, end_line: int, local_start_line: int) -> None:
        self.path = path
        self.start_line = start_line
        self.end_line = end_line
        self.local_start_line = local_start_line


# Uppercase macro prefixes to strip (API export macros like RLAPI, WINGDIAPI, etc.)
# Keep common uppercase type names.
_C_HEADER_KEEP_UPPER = frozenset({"FILE", "DIR", "EOF", "NULL", "BOOL"})


def _parse_c_header_externs(header_text: str) -> List[str]:
    """Extract function declarations from a C header and return L2 ``extern`` lines."""
    text = re.sub(r"/\*.*?\*/", " ", header_text, flags=re.DOTALL)
    text = re.sub(r"//[^\n]*", "", text)
    text = re.sub(r"^\s*#[^\n]*$", "", text, flags=re.MULTILINE)
    text = text.replace("\\\n", " ")
    # Collapse whitespace (including newlines) so multi-line declarations become single-line
    text = re.sub(r"\s+", " ", text)
    # Strip __attribute__((...)), __nonnull((...)), __THROW, __wur, and similar GCC extensions
    text = re.sub(r"__\w+\s*\(\([^)]*\)\)", "", text)
    text = re.sub(r"__\w+", "", text)
    # Remove __restrict
    text = text.replace("__restrict", "")

    # Match function declarations: <tokens> <name>(<params>);
    _RE = re.compile(
        r"([\w][\w\s*]+?)"           # return type tokens + function name
        r"\s*\(([^)]*?)\)"           # parameter list
        r"\s*;"
    )

    results: List[str] = []
    for m in _RE.finditer(text):
        prefix = m.group(1).strip()
        params_raw = m.group(2).strip()

        if "..." in params_raw:
            # Variadic function: strip the ... from parameter list, keep fixed args
            params_fixed = re.sub(r",?\s*\.\.\.", "", params_raw).strip()
            param_str = "void" if params_fixed in ("void", "") else params_fixed
            is_variadic = True
        else:
            is_variadic = False

        tokens = prefix.split()
        if len(tokens) < 2:
            continue

        # Last token is function name (may have leading * for pointer-returning functions)
        func_name = tokens[-1].lstrip("*")
        if not func_name or not re.match(r"^[A-Za-z_]\w*$", func_name):
            continue

        # Skip typedef (struct/enum/union return types are fine — the regex
        # already ensures this matched a function declaration with parentheses)
        if tokens[0] in ("typedef",):
            continue

        # Build return type: strip API macros and calling-convention qualifiers
        type_tokens = tokens[:-1]
        cleaned: List[str] = []
        for t in type_tokens:
            if t in ("extern", "static", "inline"):
                continue
            # Strip uppercase macro prefixes (3+ chars, all caps) unless known type
            if re.match(r"^[A-Z_][A-Z_0-9]{2,}$", t) and t not in _C_HEADER_KEEP_UPPER:
                continue
            cleaned.append(t)

        # Pointer stars attached to the function name belong to the return type
        leading_stars = len(tokens[-1]) - len(tokens[-1].lstrip("*"))
        ret_type = " ".join(cleaned)
        if leading_stars:
            ret_type += " " + "*" * leading_stars
        ret_type = ret_type.strip()
        if not ret_type:
            ret_type = "int"

        if not is_variadic:
            param_str = "void" if params_raw in ("void", "") else params_raw

        va_suffix = ", ..." if is_variadic else ""
        results.append(f"extern {ret_type} {func_name}({param_str}{va_suffix})")
    return results


# Map C types to L2 cstruct field types
_C_TO_L2_FIELD_TYPE: Dict[str, str] = {
    "char": "i8", "signed char": "i8", "unsigned char": "u8",
    "short": "i16", "unsigned short": "u16", "short int": "i16",
    "int": "i32", "unsigned int": "u32", "unsigned": "u32",
    "long": "i64", "unsigned long": "u64", "long int": "i64",
    "long long": "i64", "unsigned long long": "u64",
    "float": "f32", "double": "f64",
    "size_t": "u64", "ssize_t": "i64",
    "int8_t": "i8", "uint8_t": "u8",
    "int16_t": "i16", "uint16_t": "u16",
    "int32_t": "i32", "uint32_t": "u32",
    "int64_t": "i64", "uint64_t": "u64",
}


def _parse_c_header_structs(header_text: str) -> List[str]:
    """Extract struct definitions from C header text and return L2 ``cstruct`` lines."""
    text = re.sub(r"/\*.*?\*/", " ", header_text, flags=re.DOTALL)
    text = re.sub(r"//[^\n]*", "", text)
    text = re.sub(r"#[^\n]*", "", text)
    text = re.sub(r"\s+", " ", text)

    results: List[str] = []
    # Match: struct Name { fields }; or typedef struct Name { fields } Alias;
    # or typedef struct { fields } Name;
    _RE_STRUCT = re.compile(
        r"(?:typedef\s+)?struct\s*(\w*)\s*\{([^}]*)\}\s*(\w*)\s*;",
    )
    for m in _RE_STRUCT.finditer(text):
        struct_name = m.group(1).strip()
        body = m.group(2).strip()
        typedef_name = m.group(3).strip()
        # Prefer typedef name if present
        name = typedef_name if typedef_name else struct_name
        if not name or name.startswith("_"):
            continue
        fields = _extract_struct_fields(body)
        if not fields:
            continue
        # Generate L2 cstruct declaration
        field_parts = []
        for fname, ftype in fields:
            field_parts.append(f"cfield {fname} {ftype}")
        results.append(f"cstruct {name} {' '.join(field_parts)} end")
    return results


def _extract_struct_fields(body: str) -> List[Tuple[str, str]]:
    """Parse C struct field declarations into (name, l2_type) pairs."""
    fields: List[Tuple[str, str]] = []
    for decl in body.split(";"):
        decl = decl.strip()
        if not decl:
            continue
        # Skip bitfields
        if ":" in decl:
            continue
        # Skip nested struct/union definitions (but allow struct pointers)
        if ("struct " in decl or "union " in decl) and "*" not in decl:
            continue
        tokens = decl.split()
        if len(tokens) < 2:
            continue
        # Last token is field name (may have * prefix for pointers)
        field_name = tokens[-1].lstrip("*")
        if not field_name or not re.match(r"^[A-Za-z_]\w*$", field_name):
            continue
        # Check if pointer
        is_ptr = "*" in decl
        if is_ptr:
            fields.append((field_name, "ptr"))
            continue
        # Build type from all tokens except field name
        type_tokens = tokens[:-1]
        # Remove qualifiers
        type_tokens = [t for t in type_tokens if t not in ("const", "volatile", "static",
                                                            "register", "restrict", "_Atomic")]
        ctype = " ".join(type_tokens)
        l2_type = _C_TO_L2_FIELD_TYPE.get(ctype)
        if l2_type is None:
            # Unknown type, treat as pointer-sized
            fields.append((field_name, "ptr"))
        else:
            fields.append((field_name, l2_type))
    return fields


class Compiler:
    def __init__(
        self,
        include_paths: Optional[Sequence[Path]] = None,
        *,
        macro_expansion_limit: int = DEFAULT_MACRO_EXPANSION_LIMIT,
        defines: Optional[Sequence[str]] = None,
    ) -> None:
        self.reader = Reader()
        self.dictionary = bootstrap_dictionary()
        self._syscall_label_counter = 0
        self._register_syscall_words()
        self.parser = Parser(
            self.dictionary,
            self.reader,
            macro_expansion_limit=macro_expansion_limit,
        )
        self.assembler = Assembler(self.dictionary)
        if include_paths is None:
            include_paths = [Path("."), Path("./stdlib")]
        self.include_paths: List[Path] = [p.expanduser().resolve() for p in include_paths]
        self._loaded_files: Set[Path] = set()
        self.defines: Set[str] = set(defines or [])

    def compile_source(
        self,
        source: str,
        spans: Optional[List[FileSpan]] = None,
        *,
        debug: bool = False,
        entry_mode: str = "program",
    ) -> Emission:
        self.parser.file_spans = spans or []
        tokens = self.reader.tokenize(_blank_asm_bodies(source))
        module = self.parser.parse(tokens, source)
        return self.assembler.emit(module, debug=debug, entry_mode=entry_mode)

    def parse_file(self, path: Path) -> None:
        """Parse a source file to populate the dictionary without emitting assembly."""
        source, spans = self._load_with_imports(path.resolve())
        self.parser.file_spans = spans or []
        tokens = self.reader.tokenize(_blank_asm_bodies(source))
        self.parser.parse(tokens, source)

    def compile_file(self, path: Path, *, debug: bool = False, entry_mode: str = "program") -> Emission:
        source, spans = self._load_with_imports(path.resolve())
        return self.compile_source(source, spans=spans, debug=debug, entry_mode=entry_mode)

    def run_compile_time_word(self, name: str, *, libs: Optional[List[str]] = None) -> None:
        word = self.dictionary.lookup(name)
        if word is None:
            raise CompileTimeError(f"word '{name}' not defined; cannot run at compile time")
        # Skip if already executed via a ``compile-time <name>`` directive.
        if name in self.parser.compile_time_vm._ct_executed:
            return
        self.parser.compile_time_vm.invoke(word, runtime_mode=True, libs=libs)

    def run_compile_time_word_repl(self, name: str, *, libs: Optional[List[str]] = None) -> None:
        """Like run_compile_time_word but uses invoke_repl for persistent state."""
        word = self.dictionary.lookup(name)
        if word is None:
            raise CompileTimeError(f"word '{name}' not defined; cannot run at compile time")
        self.parser.compile_time_vm.invoke_repl(word, libs=libs)

    _import_resolve_cache: Dict[Tuple[Path, str], Path] = {}

    def _preprocess_c_header(self, header_path: Path, raw_text: str) -> str:
        """Try running the C preprocessor on a header file for accurate parsing.

        Falls back to raw_text if the preprocessor is not available."""
        import subprocess
        try:
            result = subprocess.run(
                ["cc", "-E", "-P", "-D__attribute__(x)=", "-D__extension__=",
                 "-D__restrict=", "-D__asm__(x)=", str(header_path)],
                capture_output=True, text=True, timeout=10,
            )
            if result.returncode == 0 and result.stdout.strip():
                return result.stdout
        except (FileNotFoundError, subprocess.TimeoutExpired, OSError):
            pass
        return raw_text

    def _resolve_import_target(self, importing_file: Path, target: str) -> Path:
        cache_key = (importing_file.parent, target)
        cached = self._import_resolve_cache.get(cache_key)
        if cached is not None:
            return cached
        raw = Path(target)
        tried: List[Path] = []

        if raw.is_absolute():
            candidate = raw
            tried.append(candidate)
            if candidate.exists():
                result = candidate.resolve()
                self._import_resolve_cache[cache_key] = result
                return result

        candidate = (importing_file.parent / raw).resolve()
        tried.append(candidate)
        if candidate.exists():
            self._import_resolve_cache[cache_key] = candidate
            return candidate

        for base in self.include_paths:
            candidate = (base / raw).resolve()
            tried.append(candidate)
            if candidate.exists():
                self._import_resolve_cache[cache_key] = candidate
                return candidate

        tried_str = "\n".join(f"  - {p}" for p in tried)
        raise ParseError(
            f"cannot import {target!r} from {importing_file}\n"
            f"tried:\n{tried_str}"
        )

    def _register_syscall_words(self) -> None:
        word = self.dictionary.lookup("syscall")
        if word is None:
            word = Word(name="syscall")
            self.dictionary.register(word)
        word.intrinsic = self._emit_syscall_intrinsic

    def _emit_syscall_intrinsic(self, builder: FunctionEmitter) -> None:
        def _try_pop_known_syscall_setup() -> Optional[Tuple[int, int]]:
            """Recognize and remove literal setup for known-argc syscalls.

            Supported forms right before `syscall`:
              1) <argc> <nr>
              2) <nr> <argc> ___linux_swap
            Returns (argc, nr) when recognized.
            """

            # Form 1: ... push argc ; push nr ; syscall
            nr = Assembler._pop_preceding_literal(builder)
            if nr is not None:
                argc = Assembler._pop_preceding_literal(builder)
                if argc is not None and 0 <= argc <= 6:
                    return argc, nr
                # rollback if second literal wasn't argc
                builder.push_literal(nr)

            # Form 2: ... push nr ; push argc ; ___linux_swap ; syscall
            text = builder.text
            swap_tail = [
                "mov rax, [r12]",
                "mov rbx, [r12 + 8]",
                "mov [r12], rbx",
                "mov [r12 + 8], rax",
            ]
            if len(text) >= 4 and [s.strip() for s in text[-4:]] == swap_tail:
                del text[-4:]
                argc2 = Assembler._pop_preceding_literal(builder)
                nr2 = Assembler._pop_preceding_literal(builder)
                if argc2 is not None and nr2 is not None and 0 <= argc2 <= 6:
                    return argc2, nr2
                # rollback conservatively if match fails
                if nr2 is not None:
                    builder.push_literal(nr2)
                if argc2 is not None:
                    builder.push_literal(argc2)
                text.extend(swap_tail)

            return None

        known = _try_pop_known_syscall_setup()
        if known is not None:
            argc, nr = known
            builder.push_literal(nr)
            builder.pop_to("rax")
            if argc >= 6:
                builder.pop_to("r9")
            if argc >= 5:
                builder.pop_to("r8")
            if argc >= 4:
                builder.pop_to("r10")
            if argc >= 3:
                builder.pop_to("rdx")
            if argc >= 2:
                builder.pop_to("rsi")
            if argc >= 1:
                builder.pop_to("rdi")
            builder.emit("    syscall")
            builder.push_from("rax")
            return

        label_id = self._syscall_label_counter
        self._syscall_label_counter += 1

        def lbl(suffix: str) -> str:
            return f"syscall_{label_id}_{suffix}"

        builder.pop_to("rax")  # syscall number
        builder.pop_to("rcx")  # arg count
        builder.emit("    ; clamp arg count to [0, 6]")
        builder.emit("    cmp rcx, 0")
        builder.emit(f"    jge {lbl('count_nonneg')}")
        builder.emit("    xor rcx, rcx")
        builder.emit(f"{lbl('count_nonneg')}:")
        builder.emit("    cmp rcx, 6")
        builder.emit(f"    jle {lbl('count_clamped')}")
        builder.emit("    mov rcx, 6")
        builder.emit(f"{lbl('count_clamped')}:")

        checks = [
            (6, "r9"),
            (5, "r8"),
            (4, "r10"),
            (3, "rdx"),
            (2, "rsi"),
            (1, "rdi"),
        ]
        for threshold, reg in checks:
            builder.emit(f"    cmp rcx, {threshold}")
            builder.emit(f"    jl {lbl(f'skip_{reg}')}")
            builder.pop_to(reg)
            builder.emit(f"{lbl(f'skip_{reg}')}:")

        builder.emit("    syscall")
        builder.push_from("rax")

    def _load_with_imports(self, path: Path, seen: Optional[Set[Path]] = None) -> Tuple[str, List[FileSpan]]:
        if seen is None:
            seen = set()
        out_lines: List[str] = []
        spans: List[FileSpan] = []
        self._append_file_with_imports(path.resolve(), out_lines, spans, seen)
        self._loaded_files = set(seen)
        return "\n".join(out_lines) + "\n", spans

    def _append_file_with_imports(
        self,
        path: Path,
        out_lines: List[str],
        spans: List[FileSpan],
        seen: Set[Path],
    ) -> None:
        # path is expected to be already resolved by callers
        if path in seen:
            return
        seen.add(path)

        try:
            contents = path.read_text()
        except FileNotFoundError as exc:
            raise ParseError(f"cannot import {path}: {exc}") from exc

        in_py_block = False
        brace_depth = 0
        string_char = None
        escape = False

        segment_start_global: Optional[int] = None
        segment_start_local: int = 1
        file_line_no = 1
        _out_append = out_lines.append
        _spans_append = spans.append
        _FileSpan = FileSpan

        # ifdef/ifndef/else/endif conditional compilation stack
        # Each entry is True (include lines) or False (skip lines)
        _ifdef_stack: List[bool] = []

        def _ifdef_active() -> bool:
            return all(_ifdef_stack)

        for line in contents.splitlines():
            stripped = line.strip()

            # --- Conditional compilation directives ---
            if stripped[:6] == "ifdef " or stripped == "ifdef":
                name = stripped[6:].strip() if len(stripped) > 6 else ""
                if not name:
                    raise ParseError(f"ifdef missing symbol name at {path}:{file_line_no}")
                _ifdef_stack.append(name in self.defines if _ifdef_active() else False)
                _out_append("")  # placeholder to keep line numbers aligned
                file_line_no += 1
                continue
            if stripped[:7] == "ifndef " or stripped == "ifndef":
                name = stripped[7:].strip() if len(stripped) > 7 else ""
                if not name:
                    raise ParseError(f"ifndef missing symbol name at {path}:{file_line_no}")
                _ifdef_stack.append(name not in self.defines if _ifdef_active() else False)
                _out_append("")
                file_line_no += 1
                continue
            if stripped == "elsedef":
                if not _ifdef_stack:
                    raise ParseError(f"elsedef without matching ifdef/ifndef at {path}:{file_line_no}")
                _ifdef_stack[-1] = not _ifdef_stack[-1]
                _out_append("")
                file_line_no += 1
                continue
            if stripped == "endif":
                if not _ifdef_stack:
                    raise ParseError(f"endif without matching ifdef/ifndef at {path}:{file_line_no}")
                _ifdef_stack.pop()
                _out_append("")
                file_line_no += 1
                continue

            # If inside a false ifdef branch, skip the line
            if not _ifdef_active():
                _out_append("")
                file_line_no += 1
                continue

            if not in_py_block and stripped[:3] == ":py" and "{" in stripped:
                in_py_block = True
                brace_depth = 0
                string_char = None
                escape = False
                # scan_line inline
                for ch in line:
                    if string_char:
                        if escape:
                            escape = False
                        elif ch == "\\":
                            escape = True
                        elif ch == string_char:
                            string_char = None
                    else:
                        if ch == "'" or ch == '"':
                            string_char = ch
                        elif ch == "{":
                            brace_depth += 1
                        elif ch == "}":
                            brace_depth -= 1
                # begin_segment_if_needed inline
                if segment_start_global is None:
                    segment_start_global = len(out_lines) + 1
                    segment_start_local = file_line_no
                _out_append(line)
                file_line_no += 1
                if brace_depth == 0:
                    in_py_block = False
                continue

            if in_py_block:
                # scan_line inline
                for ch in line:
                    if string_char:
                        if escape:
                            escape = False
                        elif ch == "\\":
                            escape = True
                        elif ch == string_char:
                            string_char = None
                    else:
                        if ch == "'" or ch == '"':
                            string_char = ch
                        elif ch == "{":
                            brace_depth += 1
                        elif ch == "}":
                            brace_depth -= 1
                # begin_segment_if_needed inline
                if segment_start_global is None:
                    segment_start_global = len(out_lines) + 1
                    segment_start_local = file_line_no
                _out_append(line)
                file_line_no += 1
                if brace_depth == 0:
                    in_py_block = False
                continue

            if stripped[:7] == "import ":
                target = stripped.split(None, 1)[1].strip()
                if not target:
                    raise ParseError(f"empty import target in {path}:{file_line_no}")

                # begin_segment_if_needed inline
                if segment_start_global is None:
                    segment_start_global = len(out_lines) + 1
                    segment_start_local = file_line_no
                _out_append("")
                file_line_no += 1
                # close_segment_if_open inline
                if segment_start_global is not None:
                    _spans_append(
                        _FileSpan(
                            path=path,
                            start_line=segment_start_global,
                            end_line=len(out_lines) + 1,
                            local_start_line=segment_start_local,
                        )
                    )
                    segment_start_global = None

                target_path = self._resolve_import_target(path, target)
                self._append_file_with_imports(target_path, out_lines, spans, seen)
                continue

            if stripped[:9] == 'cimport "' or stripped[:9] == "cimport \"":
                # cimport "header.h" — extract extern declarations from a C header
                m_cimport = re.match(r'cimport\s+"([^"]+)"', stripped)
                if not m_cimport:
                    raise ParseError(f"invalid cimport syntax at {path}:{file_line_no}")
                header_target = m_cimport.group(1)
                header_path = self._resolve_import_target(path, header_target)
                try:
                    header_text = header_path.read_text()
                except FileNotFoundError as exc:
                    raise ParseError(f"cimport cannot read {header_path}: {exc}") from exc

                # Try running the C preprocessor for more accurate parsing
                header_text = self._preprocess_c_header(header_path, header_text)

                extern_lines = _parse_c_header_externs(header_text)
                struct_lines = _parse_c_header_structs(header_text)

                # begin_segment_if_needed inline
                if segment_start_global is None:
                    segment_start_global = len(out_lines) + 1
                    segment_start_local = file_line_no
                # Replace the cimport line with the extracted extern + struct declarations
                for ext_line in extern_lines:
                    _out_append(ext_line)
                for st_line in struct_lines:
                    _out_append(st_line)
                _out_append("")  # blank line after externs
                file_line_no += 1
                continue

            # begin_segment_if_needed inline
            if segment_start_global is None:
                segment_start_global = len(out_lines) + 1
                segment_start_local = file_line_no
            _out_append(line)
            file_line_no += 1

        # close_segment_if_open inline
        if segment_start_global is not None:
            _spans_append(
                _FileSpan(
                    path=path,
                    start_line=segment_start_global,
                    end_line=len(out_lines) + 1,
                    local_start_line=segment_start_local,
                )
            )

        if _ifdef_stack:
            raise ParseError(f"unterminated ifdef/ifndef ({len(_ifdef_stack)} level(s) deep) in {path}")


class BuildCache:
    """Caches compilation artifacts keyed by source content and compiler flags."""

    def __init__(self, cache_dir: Path) -> None:
        self.cache_dir = cache_dir

    @staticmethod
    def _hash_bytes(data: bytes) -> str:
        import hashlib
        return hashlib.sha256(data).hexdigest()

    @staticmethod
    def _hash_str(s: str) -> str:
        import hashlib
        return hashlib.sha256(s.encode("utf-8")).hexdigest()

    def _manifest_path(self, source: Path) -> Path:
        key = self._hash_str(str(source.resolve()))
        return self.cache_dir / f"{key}.json"

    def flags_hash(
        self,
        debug: bool,
        folding: bool,
        static_list_folding: bool,
        peephole: bool,
        auto_inline: bool,
        entry_mode: str,
    ) -> str:
        # Include the compiler's own mtime so any change to main.py
        # (codegen improvements, bug fixes) invalidates cached results.
        try:
            compiler_mtime = os.path.getmtime(__file__)
        except OSError:
            compiler_mtime = 0
        return self._hash_str(
            f"debug={debug},folding={folding},static_list_folding={static_list_folding},"
            f"peephole={peephole},auto_inline={auto_inline},"
            f"entry_mode={entry_mode},compiler_mtime={compiler_mtime}"
        )

    def _file_info(self, path: Path) -> dict:
        st = path.stat()
        return {
            "mtime": st.st_mtime,
            "size": st.st_size,
            "hash": self._hash_bytes(path.read_bytes()),
        }

    def load_manifest(self, source: Path) -> Optional[dict]:
        mp = self._manifest_path(source)
        if not mp.exists():
            return None
        try:
            import json
            return json.loads(mp.read_text())
        except (ValueError, OSError):
            return None

    def check_fresh(self, manifest: dict, fhash: str) -> bool:
        """Return True if all source files are unchanged and flags match."""
        if manifest.get("flags_hash") != fhash:
            return False
        if manifest.get("has_ct_effects"):
            return False
        files = manifest.get("files", {})
        for path_str, info in files.items():
            p = Path(path_str)
            if not p.exists():
                return False
            try:
                st = p.stat()
            except OSError:
                return False
            if st.st_mtime == info.get("mtime") and st.st_size == info.get("size"):
                continue
            actual_hash = self._hash_bytes(p.read_bytes())
            if actual_hash != info.get("hash"):
                return False
        return True

    def get_cached_asm(self, manifest: dict) -> Optional[str]:
        asm_hash = manifest.get("asm_hash")
        if not asm_hash:
            return None
        asm_path = self.cache_dir / f"{asm_hash}.asm"
        if not asm_path.exists():
            return None
        return asm_path.read_text()

    def save(
        self,
        source: Path,
        loaded_files: Set[Path],
        fhash: str,
        asm_text: str,
        has_ct_effects: bool = False,
    ) -> None:
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        files: Dict[str, dict] = {}
        for p in sorted(loaded_files):
            try:
                files[str(p)] = self._file_info(p)
            except OSError:
                pass
        asm_hash = self._hash_str(asm_text)
        asm_path = self.cache_dir / f"{asm_hash}.asm"
        asm_path.write_text(asm_text)
        manifest = {
            "source": str(source.resolve()),
            "flags_hash": fhash,
            "files": files,
            "asm_hash": asm_hash,
            "has_ct_effects": has_ct_effects,
        }
        import json
        self._manifest_path(source).write_text(json.dumps(manifest))

    def clean(self) -> None:
        if self.cache_dir.exists():
            import shutil
            shutil.rmtree(self.cache_dir)


_nasm_path: str = ""
_linker_path: str = ""
_linker_is_lld: bool = False

def _find_nasm() -> str:
    global _nasm_path
    if _nasm_path:
        return _nasm_path
    import shutil
    p = shutil.which("nasm")
    if not p:
        raise RuntimeError("nasm not found")
    _nasm_path = p
    return p

def _find_linker() -> tuple:
    global _linker_path, _linker_is_lld
    if _linker_path:
        return _linker_path, _linker_is_lld
    import shutil
    lld = shutil.which("ld.lld")
    if lld:
        _linker_path = lld
        _linker_is_lld = True
        return lld, True
    ld = shutil.which("ld")
    if ld:
        _linker_path = ld
        _linker_is_lld = False
        return ld, False
    raise RuntimeError("No linker found")

def _run_cmd(args: list) -> None:
    """Run a command using posix_spawn for lower overhead than subprocess."""
    pid = os.posix_spawn(args[0], args, os.environ)
    _, status = os.waitpid(pid, 0)
    if os.WIFEXITED(status):
        code = os.WEXITSTATUS(status)
        if code != 0:
            import subprocess
            raise subprocess.CalledProcessError(code, args)
    elif os.WIFSIGNALED(status):
        import subprocess
        raise subprocess.CalledProcessError(-os.WTERMSIG(status), args)


def run_nasm(asm_path: Path, obj_path: Path, debug: bool = False, asm_text: str = "") -> None:
    nasm = _find_nasm()
    cmd = [nasm, "-f", "elf64"]
    if debug:
        cmd.extend(["-g", "-F", "dwarf"])
    cmd += ["-o", str(obj_path), str(asm_path)]
    _run_cmd(cmd)


def run_linker(obj_path: Path, exe_path: Path, debug: bool = False, libs=None, *, shared: bool = False):
    libs = libs or []

    linker, use_lld = _find_linker()

    cmd = [linker]

    if use_lld:
        cmd.extend(["-m", "elf_x86_64"])

    if shared:
        cmd.append("-shared")

    cmd.extend([
        "-o", str(exe_path),
        str(obj_path),
    ])

    if not shared and not libs:
        cmd.extend(["-nostdlib", "-static"])

    if libs:
        # Determine if any libs require dynamic linking (shared libraries).
        needs_dynamic = any(
            not (str(lib).endswith(".a") or str(lib).endswith(".o"))
            for lib in libs if lib
        )
        if not shared and needs_dynamic:
            cmd.extend([
                "-dynamic-linker", "/lib64/ld-linux-x86-64.so.2",
            ])
        # Add standard library search paths so ld.lld can find libc etc.
        for lib_dir in ["/usr/lib/x86_64-linux-gnu", "/usr/lib64", "/lib/x86_64-linux-gnu"]:
            if os.path.isdir(lib_dir):
                cmd.append(f"-L{lib_dir}")
        for lib in libs:
            if not lib:
                continue
            lib = str(lib)
            if lib.startswith(("-L", "-l", "-Wl,")):
                cmd.append(lib)
                continue
            if lib.startswith(":"):
                cmd.append(f"-l{lib}")
                continue
            if os.path.isabs(lib) or lib.startswith("./") or lib.startswith("../"):
                cmd.append(lib)
                continue
            if os.path.sep in lib or lib.endswith(".a"):
                cmd.append(lib)
                continue
            if ".so" in lib:
                cmd.append(f"-l:{lib}")
                continue
            cmd.append(f"-l{lib}")

    if debug:
        cmd.append("-g")

    _run_cmd(cmd)


def build_static_library(obj_path: Path, archive_path: Path) -> None:
    import subprocess
    parent = archive_path.parent
    if parent and not parent.exists():
        parent.mkdir(parents=True, exist_ok=True)
    subprocess.run(["ar", "rcs", str(archive_path), str(obj_path)], check=True)


def _load_sidecar_meta_libs(source: Path) -> List[str]:
    """Return additional linker libs from sibling <source>.meta.json."""
    meta_path = source.with_suffix(".meta.json")
    if not meta_path.exists():
        return []
    try:
        import json
        payload = json.loads(meta_path.read_text())
    except Exception as exc:
        print(f"[warn] failed to read {meta_path}: {exc}")
        return []
    libs = payload.get("libs")
    if not isinstance(libs, list):
        return []
    out: List[str] = []
    for item in libs:
        if isinstance(item, str) and item:
            out.append(item)
    return out


def _build_ct_sidecar_shared(source: Path, temp_dir: Path) -> Optional[Path]:
    """Build sibling <source>.c into a shared object for --ct-run-main externs."""
    c_path = source.with_suffix(".c")
    if not c_path.exists():
        return None
    temp_dir.mkdir(parents=True, exist_ok=True)
    so_path = temp_dir / f"{source.stem}.ctlib.so"
    cmd = ["cc", "-shared", "-fPIC", str(c_path), "-o", str(so_path)]
    import subprocess
    subprocess.run(cmd, check=True)
    return so_path


def run_repl(
    compiler: Compiler,
    temp_dir: Path,
    libs: Sequence[str],
    debug: bool = False,
    initial_source: Optional[Path] = None,
) -> int:
    """REPL backed by the compile-time VM for instant execution.

    State (data stack, memory, definitions) persists across evaluations.
    Use ``:reset`` to start fresh.
    """

    # -- Colors ---------------------------------------------------------------
    _C_RESET = "\033[0m"
    _C_BOLD = "\033[1m"
    _C_DIM = "\033[2m"
    _C_GREEN = "\033[32m"
    _C_CYAN = "\033[36m"
    _C_YELLOW = "\033[33m"
    _C_RED = "\033[31m"
    _C_MAGENTA = "\033[35m"

    use_color = sys.stdout.isatty()
    def _c(code: str, text: str) -> str:
        return f"{code}{text}{_C_RESET}" if use_color else text

    # -- Helpers --------------------------------------------------------------
    def _block_defines_main(block: str) -> bool:
        stripped_lines = [ln.strip() for ln in block.splitlines() if ln.strip() and not ln.strip().startswith("#")]
        for idx, stripped in enumerate(stripped_lines):
            for prefix in ("word", ":asm", ":py", "extern"):
                if stripped.startswith(f"{prefix} "):
                    rest = stripped[len(prefix):].lstrip()
                    if rest.startswith("main"):
                        return True
            if stripped == "word" and idx + 1 < len(stripped_lines):
                if stripped_lines[idx + 1].startswith("main"):
                    return True
        return False

    temp_dir.mkdir(parents=True, exist_ok=True)
    src_path = temp_dir / "repl.sl"
    editor_cmd = os.environ.get("EDITOR") or "vim"

    default_imports = ["import stdlib/stdlib.sl", "import stdlib/io.sl"]
    imports: List[str] = list(default_imports)
    user_defs_files: List[str] = []
    user_defs_repl: List[str] = []
    main_body: List[str] = []
    has_user_main = False

    include_paths = list(compiler.include_paths)

    if initial_source is not None:
        try:
            initial_text = initial_source.read_text()
            user_defs_files.append(initial_text)
            has_user_main = has_user_main or _block_defines_main(initial_text)
            if has_user_main:
                main_body.clear()
            print(_c(_C_DIM, f"[repl] loaded {initial_source}"))
        except Exception as exc:
            print(_c(_C_RED, f"[repl] failed to load {initial_source}: {exc}"))

    # -- Persistent VM execution ----------------------------------------------
    def _run_on_ct_vm(source: str, word_name: str = "main") -> bool:
        """Parse source and execute *word_name* via the compile-time VM.

        Uses ``invoke_repl`` so stacks/memory persist across calls.
        Returns True on success, False on error (already printed).
        """
        nonlocal compiler
        src_path.write_text(source)
        try:
            _suppress_redefine_warnings_set(True)
            compiler._loaded_files.clear()
            compiler.parse_file(src_path)
        except (ParseError, CompileError, CompileTimeError) as exc:
            print(_c(_C_RED, f"[error] {exc}"))
            return False
        except Exception as exc:
            print(_c(_C_RED, f"[error] parse failed: {exc}"))
            return False
        finally:
            _suppress_redefine_warnings_set(False)

        try:
            compiler.run_compile_time_word_repl(word_name, libs=list(libs))
        except (CompileTimeError, _CTVMExit) as exc:
            if isinstance(exc, _CTVMExit):
                code = exc.code
                if code != 0:
                    print(_c(_C_YELLOW, f"[warn] program exited with code {code}"))
            else:
                print(_c(_C_RED, f"[error] {exc}"))
                return False
        except Exception as exc:
            print(_c(_C_RED, f"[error] execution failed: {exc}"))
            return False
        return True

    # -- Stack display --------------------------------------------------------
    def _show_stack() -> None:
        vm = compiler.parser.compile_time_vm
        values = vm.repl_stack_values()
        if not values:
            print(_c(_C_DIM, "<empty stack>"))
        else:
            parts = []
            for v in values:
                if v < 0:
                    v = v + (1 << 64)  # show as unsigned
                    parts.append(f"{v} (0x{v:x})")
                elif v > 0xFFFF:
                    parts.append(f"{v} (0x{v:x})")
                else:
                    parts.append(str(v))
            depth_str = _c(_C_DIM, f"<{len(values)}>")
            print(f"{depth_str} {' '.join(parts)}")

    # -- Word listing ---------------------------------------------------------
    def _show_words(filter_str: str = "") -> None:
        all_words = sorted(compiler.dictionary.words.keys())
        if filter_str:
            all_words = [w for w in all_words if filter_str in w]
        if not all_words:
            print(_c(_C_DIM, "no matching words"))
            return
        # Print in columns
        max_len = max(len(w) for w in all_words) + 2
        cols = max(1, 80 // max_len)
        for i in range(0, len(all_words), cols):
            row = all_words[i:i + cols]
            print("  ".join(w.ljust(max_len) for w in row))
        print(_c(_C_DIM, f"({len(all_words)} words)"))

    # -- Word type/info -------------------------------------------------------
    def _show_type(word_name: str) -> None:
        word = compiler.dictionary.lookup(word_name)
        if word is None:
            print(_c(_C_RED, f"word '{word_name}' not found"))
            return

        # Header: name + kind
        defn = word.definition
        if word.is_extern:
            kind = "extern"
        elif word.macro_expansion is not None:
            kind = "macro"
        elif isinstance(defn, AsmDefinition):
            kind = "asm"
        elif isinstance(defn, Definition):
            kind = "word"
        elif word.compile_time_intrinsic is not None or word.runtime_intrinsic is not None:
            kind = "builtin"
        elif word.macro is not None:
            kind = "immediate/macro"
        else:
            kind = "unknown"
        print(f"  {_c(_C_BOLD, word_name)}  {_c(_C_CYAN, kind)}")

        # Tags
        tags: List[str] = []
        if word.immediate:
            tags.append("immediate")
        if word.compile_only:
            tags.append("compile-only")
        if word.inline:
            tags.append("inline")
        if word.compile_time_override:
            tags.append("ct-override")
        if word.priority != 0:
            tags.append(f"priority={word.priority}")
        if tags:
            print(f"  {_c(_C_DIM, '  tags: ')}{_c(_C_YELLOW, ' '.join(tags))}")

        # Extern signature
        if word.is_extern and word.extern_signature:
            arg_types, ret_type = word.extern_signature
            sig = f"{ret_type} {word_name}({', '.join(arg_types)})"
            print(f"  {_c(_C_DIM, '  sig:  ')}{_c(_C_GREEN, sig)}")
        elif word.is_extern:
            print(f"  {_c(_C_DIM, '  args: ')}{word.extern_inputs} in, {word.extern_outputs} out")

        # Stack effect from definition comment
        if isinstance(defn, Definition) and defn.stack_inputs is not None:
            print(f"  {_c(_C_DIM, '  args: ')}{defn.stack_inputs} inputs")

        # Macro expansion
        if word.macro_expansion is not None:
            params = word.macro_params
            expansion = " ".join(word.macro_expansion)
            if len(expansion) > 80:
                expansion = expansion[:77] + "..."
            param_str = f" (${params} params)" if params else ""
            print(f"  {_c(_C_DIM, '  expands:')}{param_str} {expansion}")

        # Asm body (trimmed)
        if isinstance(defn, AsmDefinition):
            body = defn.body.strip()
            lines = body.splitlines()
            if defn.effects:
                print(f"  {_c(_C_DIM, '  effects:')} {' '.join(sorted(defn.effects))}")
            if len(lines) <= 6:
                for ln in lines:
                    print(f"    {_c(_C_DIM, ln.rstrip())}")
            else:
                for ln in lines[:4]:
                    print(f"    {_c(_C_DIM, ln.rstrip())}")
                print(f"    {_c(_C_DIM, f'... ({len(lines)} lines total)')}")

        # Word body (decompiled ops)
        if isinstance(defn, Definition):
            ops = defn.body
            indent = 0
            max_show = 12
            shown = 0
            for op in ops:
                if shown >= max_show:
                    print(f"    {_c(_C_DIM, f'... ({len(ops)} ops total)')}")
                    break
                if op.op in ("branch_zero", "for_begin", "while_begin", "list_begin"):
                    pass
                if op.op in ("jump", "for_end"):
                    indent = max(0, indent - 1)

                if op.op == "literal":
                    if isinstance(op.data, str):
                        txt = f'"{op.data}"' if len(op.data) <= 40 else f'"{op.data[:37]}..."'
                        line_str = f"  {txt}"
                    elif isinstance(op.data, float):
                        line_str = f"  {op.data}"
                    else:
                        line_str = f"  {op.data}"
                elif op.op == "word":
                    line_str = f"  {op.data}"
                elif op.op == "branch_zero":
                    line_str = "  if"
                    indent += 1
                elif op.op == "jump":
                    line_str = "  else/end"
                elif op.op == "for_begin":
                    line_str = "  for"
                    indent += 1
                elif op.op == "for_end":
                    line_str = "  end-for"
                elif op.op == "label":
                    line_str = f"  label {op.data}"
                elif op.op == "goto":
                    line_str = f"  goto {op.data}"
                else:
                    line_str = f"  {op.op}" + (f" {op.data}" if op.data is not None else "")

                print(f"  {_c(_C_DIM, '  ' * indent)}{line_str}")
                shown += 1

    # -- readline setup -------------------------------------------------------
    history_path = temp_dir / "repl_history"
    try:
        import readline
        readline.parse_and_bind("tab: complete")
        try:
            readline.read_history_file(str(history_path))
        except (FileNotFoundError, OSError):
            pass

        def _completer(text: str, state: int) -> Optional[str]:
            commands = [":help", ":show", ":reset", ":load ", ":call ",
                        ":edit ", ":seteditor ", ":quit", ":q",
                        ":stack", ":words ", ":type ", ":clear"]
            if text.startswith(":"):
                matches = [c for c in commands if c.startswith(text)]
            else:
                all_words = sorted(compiler.dictionary.words.keys())
                matches = [w + " " for w in all_words if w.startswith(text)]
            return matches[state] if state < len(matches) else None

        readline.set_completer(_completer)
        readline.set_completer_delims(" \t\n")
        _has_readline = True
    except ImportError:
        _has_readline = False

    # -- Help -----------------------------------------------------------------
    def _print_help() -> None:
        print(_c(_C_BOLD, "[repl] commands:"))
        cmds = [
            (":help", "show this help"),
            (":stack", "display the data stack"),
            (":clear", "clear the data stack (keep definitions)"),
            (":words [filter]", "list defined words (optionally filtered)"),
            (":type <word>", "show word info / signature"),
            (":show", "display current session source"),
            (":reset", "clear everything — fresh VM and dictionary"),
            (":load <file>", "load a source file into the session"),
            (":call <word>", "execute a word via the compile-time VM"),
            (":edit [file]", "open session file or given file in editor"),
            (":seteditor [cmd]", "show/set editor command (default from $EDITOR)"),
            (":quit | :q", "exit the REPL"),
        ]
        for cmd, desc in cmds:
            print(f"  {_c(_C_GREEN, cmd.ljust(20))} {desc}")
        print(_c(_C_BOLD, "[repl] free-form input:"))
        print("  definitions (word/:asm/:py/extern/macro/struct) extend the session")
        print("  imports add to session imports")
        print("  other lines run immediately (values stay on the stack)")
        print("  multiline: end lines with \\ to continue")

    # -- Banner ---------------------------------------------------------------
    prompt = _c(_C_GREEN + _C_BOLD, "l2> ") if use_color else "l2> "
    cont_prompt = _c(_C_DIM, "... ") if use_color else "... "
    print(_c(_C_BOLD, "[repl] L2 interactive — type :help for commands, :quit to exit"))
    print(_c(_C_DIM, "[repl] state persists across evaluations; :reset to start fresh"))

    pending_block: List[str] = []

    while True:
        try:
            cur_prompt = cont_prompt if pending_block else prompt
            line = input(cur_prompt)
        except (EOFError, KeyboardInterrupt):
            print()
            break

        stripped = line.strip()
        if stripped in {":quit", ":q"}:
            break
        if stripped == ":help":
            _print_help()
            continue
        if stripped == ":stack":
            _show_stack()
            continue
        if stripped == ":clear":
            vm = compiler.parser.compile_time_vm
            if vm._repl_initialized:
                vm.r12 = vm._native_data_top
            else:
                vm.stack.clear()
            print(_c(_C_DIM, "stack cleared"))
            continue
        if stripped.startswith(":words"):
            filt = stripped.split(None, 1)[1].strip() if " " in stripped else ""
            _show_words(filt)
            continue
        if stripped.startswith(":type "):
            word_name = stripped.split(None, 1)[1].strip()
            if word_name:
                _show_type(word_name)
            else:
                print(_c(_C_RED, "[repl] usage: :type <word>"))
            continue
        if stripped == ":reset":
            imports = list(default_imports)
            user_defs_files.clear()
            user_defs_repl.clear()
            main_body.clear()
            has_user_main = False
            pending_block.clear()
            compiler = Compiler(
                include_paths=include_paths,
                macro_expansion_limit=compiler.parser.macro_expansion_limit,
            )
            print(_c(_C_DIM, "[repl] session reset — fresh VM and dictionary"))
            continue
        if stripped.startswith(":seteditor"):
            parts = stripped.split(None, 1)
            if len(parts) == 1 or not parts[1].strip():
                print(f"[repl] editor: {editor_cmd}")
            else:
                editor_cmd = parts[1].strip()
                print(f"[repl] editor set to: {editor_cmd}")
            continue
        if stripped.startswith(":edit"):
            arg = stripped.split(None, 1)[1].strip() if " " in stripped else ""
            target_path = Path(arg) if arg else src_path
            try:
                current_source = _repl_build_source(
                    imports,
                    user_defs_files,
                    user_defs_repl,
                    main_body,
                    has_user_main,
                    force_synthetic=bool(main_body),
                )
                src_path.write_text(current_source)
            except Exception as exc:
                print(_c(_C_RED, f"[repl] failed to sync source before edit: {exc}"))
            try:
                if not target_path.exists():
                    target_path.parent.mkdir(parents=True, exist_ok=True)
                    target_path.touch()
                import shlex
                cmd_parts = shlex.split(editor_cmd)
                import subprocess
                subprocess.run([*cmd_parts, str(target_path)])
                if target_path.resolve() == src_path.resolve():
                    try:
                        updated = target_path.read_text()
                        new_imports: List[str] = []
                        non_import_lines: List[str] = []
                        for ln in updated.splitlines():
                            stripped_ln = ln.strip()
                            if stripped_ln.startswith("import "):
                                new_imports.append(stripped_ln)
                            else:
                                non_import_lines.append(ln)
                        imports = new_imports if new_imports else list(default_imports)
                        new_body = "\n".join(non_import_lines).strip()
                        user_defs_files = [new_body] if new_body else []
                        user_defs_repl.clear()
                        main_body.clear()
                        has_user_main = _block_defines_main(new_body)
                        print(_c(_C_DIM, "[repl] reloaded session source from editor"))
                    except Exception as exc:
                        print(_c(_C_RED, f"[repl] failed to reload edited source: {exc}"))
            except Exception as exc:
                print(_c(_C_RED, f"[repl] failed to launch editor: {exc}"))
            continue
        if stripped == ":show":
            source = _repl_build_source(imports, user_defs_files, user_defs_repl, main_body, has_user_main, force_synthetic=True)
            print(source.rstrip())
            continue
        if stripped.startswith(":load "):
            path_text = stripped.split(None, 1)[1].strip()
            target_path = Path(path_text)
            if not target_path.exists():
                print(_c(_C_RED, f"[repl] file not found: {target_path}"))
                continue
            try:
                loaded_text = target_path.read_text()
                user_defs_files.append(loaded_text)
                if _block_defines_main(loaded_text):
                    has_user_main = True
                    main_body.clear()
                print(_c(_C_DIM, f"[repl] loaded {target_path}"))
            except Exception as exc:
                print(_c(_C_RED, f"[repl] failed to load {target_path}: {exc}"))
            continue
        if stripped.startswith(":call "):
            word_name = stripped.split(None, 1)[1].strip()
            if not word_name:
                print(_c(_C_RED, "[repl] usage: :call <word>"))
                continue
            if word_name == "main" and not has_user_main:
                print(_c(_C_RED, "[repl] cannot call main; no user-defined main present"))
                continue
            if word_name == "main" and has_user_main:
                source = _repl_build_source(imports, user_defs_files, user_defs_repl, [], True, force_synthetic=False)
            else:
                temp_defs = [*user_defs_repl, f"word __repl_call__\n    {word_name}\nend"]
                source = _repl_build_source(imports, user_defs_files, temp_defs, [], True, force_synthetic=False)
                _run_on_ct_vm(source, "__repl_call__")
                continue
            _run_on_ct_vm(source, word_name)
            continue
        if not stripped:
            continue

        # Multiline handling via trailing backslash
        if line.endswith("\\"):
            pending_block.append(line[:-1])
            continue

        if pending_block:
            pending_block.append(line)
            block = "\n".join(pending_block)
            pending_block.clear()
        else:
            block = line

        block_stripped = block.lstrip()
        first_tok = block_stripped.split(None, 1)[0] if block_stripped else ""
        is_definition = first_tok in {"word", ":asm", ":py", "extern", "macro", "struct"}
        is_import = first_tok == "import"

        if is_import:
            imports.append(block_stripped)
        elif is_definition:
            if _block_defines_main(block):
                user_defs_repl = [d for d in user_defs_repl if not _block_defines_main(d)]
                has_user_main = True
                main_body.clear()
            user_defs_repl.append(block)
        else:
            source = _repl_build_source(
                imports,
                user_defs_files,
                user_defs_repl,
                block.splitlines(),
                has_user_main,
                force_synthetic=True,
            )
            _run_on_ct_vm(source)
            continue

        # Validate definitions by parsing (no execution needed).
        source = _repl_build_source(imports, user_defs_files, user_defs_repl, main_body, has_user_main, force_synthetic=bool(main_body))
        try:
            src_path.write_text(source)
            _suppress_redefine_warnings_set(True)
            try:
                compiler._loaded_files.clear()
                compiler.parse_file(src_path)
            finally:
                _suppress_redefine_warnings_set(False)
        except (ParseError, CompileError, CompileTimeError) as exc:
            print(_c(_C_RED, f"[error] {exc}"))
            continue

    # Save readline history
    if _has_readline:
        try:
            readline.write_history_file(str(history_path))
        except OSError:
            pass

    return 0


def _repl_build_source(
    imports: Sequence[str],
    file_defs: Sequence[str],
    repl_defs: Sequence[str],
    main_body: Sequence[str],
    has_user_main: bool,
    force_synthetic: bool = False,
) -> str:
    lines: List[str] = []
    lines.extend(imports)
    lines.extend(file_defs)
    lines.extend(repl_defs)
    if (force_synthetic or not has_user_main) and main_body:
        lines.append("word main")
        for ln in main_body:
            if ln:
                lines.append(f"    {ln}")
            else:
                lines.append("")
        lines.append("end")
    return "\n".join(lines) + "\n"


class DocEntry:
    __slots__ = ('name', 'stack_effect', 'description', 'kind', 'path', 'line')

    def __init__(self, name: str, stack_effect: str, description: str, kind: str, path: Path, line: int) -> None:
        self.name = name
        self.stack_effect = stack_effect
        self.description = description
        self.kind = kind
        self.path = path
        self.line = line


_DOC_STACK_RE = re.compile(r"^\s*#\s*([^\s]+)\s*(.*)$")
_DOC_WORD_RE = re.compile(r"^\s*(?:inline\s+)?word\s+([^\s]+)\b")
_DOC_ASM_RE = re.compile(r"^\s*:asm\s+([^\s{]+)")
_DOC_PY_RE = re.compile(r"^\s*:py\s+([^\s{]+)")
_DOC_MACRO_RE = re.compile(r"^\s*macro\s+([^\s]+)(?:\s+(\d+))?")


def _extract_stack_comment(text: str) -> Optional[Tuple[str, str]]:
    match = _DOC_STACK_RE.match(text)
    if match is None:
        return None
    name = match.group(1).strip()
    tail = match.group(2).strip()
    if not name:
        return None
    if "->" not in tail:
        return None
    return name, tail


def _extract_definition_name(text: str, *, include_macros: bool = False) -> Optional[Tuple[str, str, int]]:
    for kind, regex in (("word", _DOC_WORD_RE), ("asm", _DOC_ASM_RE), ("py", _DOC_PY_RE)):
        match = regex.match(text)
        if match is not None:
            return kind, match.group(1), -1
    if include_macros:
        match = _DOC_MACRO_RE.match(text)
        if match is not None:
            arg_count = int(match.group(2)) if match.group(2) is not None else 0
            return "macro", match.group(1), arg_count
    return None


def _is_doc_symbol_name(name: str, *, include_private: bool = False) -> bool:
    if not name:
        return False
    if not include_private and name.startswith("__"):
        return False
    return True


def _collect_leading_doc_comments(lines: Sequence[str], def_index: int, name: str) -> Tuple[str, str]:
    comments: List[str] = []
    stack_effect = ""

    idx = def_index - 1
    while idx >= 0:
        raw = lines[idx]
        stripped = raw.strip()
        if not stripped:
            break
        if not stripped.startswith("#"):
            break

        parsed = _extract_stack_comment(raw)
        if parsed is not None:
            comment_name, effect = parsed
            if comment_name == name and not stack_effect:
                stack_effect = effect
            idx -= 1
            continue

        text = stripped[1:].strip()
        if text:
            comments.append(text)
        idx -= 1

    comments.reverse()
    return stack_effect, " ".join(comments)


def _scan_doc_file(
    path: Path,
    *,
    include_undocumented: bool = False,
    include_private: bool = False,
    include_macros: bool = False,
) -> List[DocEntry]:
    try:
        text = path.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        return []

    lines = text.splitlines()
    entries: List[DocEntry] = []
    defined_names: Set[str] = set()

    for idx, line in enumerate(lines):
        parsed = _extract_definition_name(line, include_macros=include_macros)
        if parsed is None:
            continue
        kind, name, macro_args = parsed
        if not _is_doc_symbol_name(name, include_private=include_private):
            continue
        defined_names.add(name)
        stack_effect, description = _collect_leading_doc_comments(lines, idx, name)
        # Auto-generate stack effect for macros from arg count
        if kind == "macro" and not stack_effect:
            if macro_args > 0:
                params = " ".join(f"${i}" for i in range(macro_args))
                stack_effect = f"macro({macro_args}): {params} -> expanded"
            else:
                stack_effect = "macro(0): -> expanded"
        if not include_undocumented and not stack_effect and not description:
            continue
        entries.append(
            DocEntry(
                name=name,
                stack_effect=stack_effect,
                description=description,
                kind=kind,
                path=path,
                line=idx + 1,
            )
        )

    return entries


def _iter_doc_files(roots: Sequence[Path], *, include_tests: bool = False) -> List[Path]:
    seen: Set[Path] = set()
    files: List[Path] = []
    skip_parts = {"build", ".git", ".venv", "raylib-5.5_linux_amd64"}
    if not include_tests:
        skip_parts.update({"tests", "extra_tests"})

    def _should_skip(candidate: Path) -> bool:
        parts = set(candidate.parts)
        return any(part in parts for part in skip_parts)

    for root in roots:
        resolved = root.expanduser().resolve()
        if not resolved.exists():
            continue
        if resolved.is_file() and resolved.suffix == ".sl":
            if _should_skip(resolved):
                continue
            if resolved not in seen:
                seen.add(resolved)
                files.append(resolved)
            continue
        if not resolved.is_dir():
            continue
        for path in resolved.rglob("*.sl"):
            if _should_skip(path):
                continue
            candidate = path.resolve()
            if candidate in seen:
                continue
            seen.add(candidate)
            files.append(candidate)
    files.sort()
    return files


def collect_docs(
    roots: Sequence[Path],
    *,
    include_undocumented: bool = False,
    include_private: bool = False,
    include_macros: bool = False,
    include_tests: bool = False,
) -> List[DocEntry]:
    entries: List[DocEntry] = []
    for doc_file in _iter_doc_files(roots, include_tests=include_tests):
        entries.extend(
            _scan_doc_file(
                doc_file,
                include_undocumented=include_undocumented,
                include_private=include_private,
                include_macros=include_macros,
            )
        )
    # Deduplicate by symbol name; keep first (roots/files are stable-sorted)
    dedup: Dict[str, DocEntry] = {}
    for entry in entries:
        dedup.setdefault(entry.name, entry)
    entries = list(dedup.values())
    entries.sort(key=lambda item: (item.name.lower(), str(item.path), item.line))
    return entries


def _filter_docs(entries: Sequence[DocEntry], query: str) -> List[DocEntry]:
    q = query.strip().lower()
    if not q:
        return list(entries)

    try:
        import shlex
        raw_terms = [term.lower() for term in shlex.split(q) if term]
    except Exception:
        raw_terms = [term.lower() for term in q.split() if term]
    terms = raw_terms
    if not terms:
        return list(entries)

    positive_terms: List[str] = []
    negative_terms: List[str] = []
    field_terms: Dict[str, List[str]] = {"name": [], "effect": [], "desc": [], "path": [], "kind": []}
    for term in terms:
        if term.startswith("-") and len(term) > 1:
            negative_terms.append(term[1:])
            continue
        if ":" in term:
            prefix, value = term.split(":", 1)
            if prefix in field_terms and value:
                field_terms[prefix].append(value)
                continue
        positive_terms.append(term)

    ranked: List[Tuple[int, DocEntry]] = []
    for entry in entries:
        name = entry.name.lower()
        effect = entry.stack_effect.lower()
        desc = entry.description.lower()
        path_text = entry.path.as_posix().lower()
        kind = entry.kind.lower()
        all_text = " ".join([name, effect, desc, path_text, kind])

        if any(term in all_text for term in negative_terms):
            continue

        if any(term not in name for term in field_terms["name"]):
            continue
        if any(term not in effect for term in field_terms["effect"]):
            continue
        if any(term not in desc for term in field_terms["desc"]):
            continue
        if any(term not in path_text for term in field_terms["path"]):
            continue
        if any(term not in kind for term in field_terms["kind"]):
            continue

        score = 0
        matches_all = True
        for term in positive_terms:
            term_score = 0
            if name == term:
                term_score = 400
            elif name.startswith(term):
                term_score = 220
            elif term in name:
                term_score = 140
            elif term in effect:
                term_score = 100
            elif term in desc:
                term_score = 70
            elif term in path_text:
                term_score = 40
            if term_score == 0:
                matches_all = False
                break
            score += term_score

        if not matches_all:
            continue
        if len(positive_terms) == 1 and positive_terms[0] in effect and positive_terms[0] not in name:
            score -= 5
        if field_terms["name"]:
            score += 60
        if field_terms["kind"]:
            score += 20
        ranked.append((score, entry))

    ranked.sort(key=lambda item: (-item[0], item[1].name.lower(), str(item[1].path), item[1].line))
    return [entry for _, entry in ranked]


def _run_docs_tui(
    entries: Sequence[DocEntry],
    initial_query: str = "",
    *,
    reload_fn: Optional[Callable[..., List[DocEntry]]] = None,
) -> int:
    if not entries:
        print("[info] no documentation entries found")
        return 0

    if not sys.stdin.isatty() or not sys.stdout.isatty():
        filtered = _filter_docs(entries, initial_query)
        print(f"[info] docs entries: {len(filtered)}/{len(entries)}")
        for entry in filtered[:200]:
            effect = entry.stack_effect if entry.stack_effect else "(no stack effect)"
            print(f"{entry.name:24} {effect}  [{entry.path}:{entry.line}]")
        if len(filtered) > 200:
            print(f"[info] ... {len(filtered) - 200} more entries")
        return 0

    import curses

    _MODE_BROWSE = 0
    _MODE_SEARCH = 1
    _MODE_DETAIL = 2
    _MODE_FILTER = 3
    _MODE_LANG_REF = 4
    _MODE_LANG_DETAIL = 5
    _MODE_LICENSE = 6
    _MODE_PHILOSOPHY = 7
    _MODE_CT_REF = 8
    _MODE_QA = 9
    _MODE_HOW = 10

    _TAB_LIBRARY = 0
    _TAB_LANG_REF = 1
    _TAB_CT_REF = 2
    _TAB_NAMES = ["Library Docs", "Language Reference", "Compile-Time Reference"]

    _FILTER_KINDS = ["all", "word", "asm", "py", "macro"]

    # ── Language Reference Entries ──────────────────────────────────
    _LANG_REF_ENTRIES: List[Dict[str, str]] = [
        {
            "name": "word ... end",
            "category": "Definitions",
            "syntax": "word <name> <body...> end",
            "summary": "Define a new word (function).",
            "detail": (
                "Defines a named word that can be called by other words. "
                "The body consists of stack operations, literals, and calls to other words. "
                "Redefinitions overwrite the previous entry with a warning.\n\n"
                "Example:\n"
                "  word square dup * end\n"
                "  word greet \"hello world\" puts end"
            ),
        },
        {
            "name": "inline word ... end",
            "category": "Definitions",
            "syntax": "inline word <name> <body...> end",
            "summary": "Define an inlined word (body is expanded at call sites).",
            "detail": (
                "Marks the definition for inline expansion. "
                "Every call site gets a copy of the body rather than a function call. "
                "Recursive inline calls are rejected at compile time.\n\n"
                "Example:\n"
                "  inline word inc 1 + end"
            ),
        },
        {
            "name": ":asm ... ;",
            "category": "Definitions",
            "syntax": ":asm <name> { <nasm body> } ;",
            "summary": "Define a word in raw NASM x86-64 assembly.",
            "detail": (
                "The body is copied verbatim into the output assembly. "
                "r12 = data stack pointer, r13 = return stack pointer. "
                "Values are 64-bit qwords. An implicit `ret` is appended.\n\n"
                "Example:\n"
                "  :asm double {\n"
                "      mov rax, [r12]\n"
                "      shl rax, 1\n"
                "      mov [r12], rax\n"
                "  } ;"
            ),
        },
        {
            "name": ":py ... ;",
            "category": "Definitions",
            "syntax": ":py <name> { <python body> } ;",
            "summary": "Define a compile-time Python macro or intrinsic.",
            "detail": (
                "The body executes once during parsing. It may define:\n"
                "  - macro(ctx: MacroContext): manipulate tokens, emit literals\n"
                "  - intrinsic(builder: FunctionEmitter): emit assembly directly\n\n"
                "Used by syntax extensions like libs/fn.sl to reshape the language."
            ),
        },
        {
            "name": "extern",
            "category": "Definitions",
            "syntax": "extern <name> <n_args> <n_rets>\nextern <ret_type> <name>(<arg_types>)",
            "summary": "Declare a foreign (C) function.",
            "detail": (
                "Two forms:\n"
                "  Raw:    extern foo 2 1     (2 args, 1 return)\n"
                "  C-like: extern double atan2(double y, double x)\n\n"
                "The emitter marshals arguments into System V registers "
                "(rdi, rsi, rdx, rcx, r8, r9 for ints; xmm0-xmm7 for floats), "
                "aligns rsp, and pushes the result from rax or xmm0."
            ),
        },
        {
            "name": "macro ... ;",
            "category": "Definitions",
            "syntax": "macro <name> <param_count> <tokens...> ;",
            "summary": "Define a text macro with positional substitution.",
            "detail": (
                "Records raw tokens until `;`. On expansion, `$0`, `$1`, ... "
                "are replaced by positional arguments. Macros cannot nest.\n\n"
                "Example:\n"
                "  macro max2 2 $0 $1 > if $0 else $1 end ;\n"
                "  5 3 max2   # leaves 5 on stack"
            ),
        },
        {
            "name": "struct ... end",
            "category": "Definitions",
            "syntax": "struct <Name>\n  field <field> <size>\n  ...\nend",
            "summary": "Define a packed struct with auto-generated accessors.",
            "detail": (
                "Emits helper words:\n"
                "  <Name>.size         — total byte size\n"
                "  <Name>.<field>.size   — field byte size\n"
                "  <Name>.<field>.offset — field byte offset\n"
                "  <Name>.<field>@     — read field from struct pointer\n"
                "  <Name>.<field>!     — write field to struct pointer\n\n"
                "Layout is tightly packed with no implicit padding.\n\n"
                "Example:\n"
                "  struct Point\n"
                "    field x 8\n"
                "    field y 8\n"
                "  end\n"
                "  # Now Point.x@, Point.x!, Point.y@, Point.y! exist"
            ),
        },
        {
            "name": "if ... end",
            "category": "Control Flow",
            "syntax": "<cond> if <body> end\n<cond> if <then> else <otherwise> end",
            "summary": "Conditional execution — pops a flag from the stack.",
            "detail": (
                "Pops the top of stack. If non-zero, executes the `then` branch; "
                "otherwise executes the `else` branch (if present).\n\n"
                "For else-if chains, place `if` on the same line as `else`:\n"
                "  <cond1> if\n"
                "    ... branch 1 ...\n"
                "  else <cond2> if\n"
                "    ... branch 2 ...\n"
                "  else\n"
                "    ... fallback ...\n"
                "  end\n\n"
                "Example:\n"
                "  dup 0 > if \"positive\" puts else \"non-positive\" puts end"
            ),
        },
        {
            "name": "while ... do ... end",
            "category": "Control Flow",
            "syntax": "while <condition> do <body> end",
            "summary": "Loop while condition is true.",
            "detail": (
                "The condition block runs before each iteration. It must leave "
                "a flag on the stack. If non-zero, the body executes and the loop "
                "repeats. If zero, execution continues after `end`.\n\n"
                "Example:\n"
                "  10\n"
                "  while dup 0 > do\n"
                "    dup puti cr\n"
                "    1 -\n"
                "  end\n"
                "  drop"
            ),
        },
        {
            "name": "for ... end",
            "category": "Control Flow",
            "syntax": "<count> for <body> end",
            "summary": "Counted loop — pops count, loops that many times.",
            "detail": (
                "Pops the loop count from the stack, stores it on the return stack, "
                "and decrements it each pass. Use `r@` (return "
                "stack peek) to read the current counter value.\n\n"
                "Example:\n"
                "  10 for\n"
                "    \"hello\" puts\n"
                "  end\n\n"
                "  # prints \"hello\" 10 times"
            ),
        },
        {
            "name": "begin ... again",
            "category": "Control Flow",
            "syntax": "begin <body> again",
            "summary": "Infinite loop (use `exit` or `goto` to break out).",
            "detail": (
                "Creates an unconditional loop. The body repeats forever.\n"
                "Available only at compile time.\n\n"
                "Example:\n"
                "  begin\n"
                "    read_stdin\n"
                "    dup 0 == if drop exit end\n"
                "    process\n"
                "  again"
            ),
        },
        {
            "name": "label / goto",
            "category": "Control Flow",
            "syntax": "label <name>\ngoto <name>",
            "summary": "Local jumps within a definition.",
            "detail": (
                "Defines a local label and jumps to it. "
                "Labels are scoped to the enclosing word definition.\n\n"
                "Example:\n"
                "  word example\n"
                "    label start\n"
                "    dup 0 == if drop exit end\n"
                "    1 - goto start\n"
                "  end"
            ),
        },
        {
            "name": "&name",
            "category": "Control Flow",
            "syntax": "&<word_name>",
            "summary": "Push pointer to a word's code label.",
            "detail": (
                "Pushes the callable address of the named word onto the stack. "
                "Combine with `jmp` for indirect/tail calls.\n\n"
                "Example:\n"
                "  &my_handler jmp   # tail-call my_handler"
            ),
        },
        {
            "name": "with ... in ... end",
            "category": "Control Flow",
            "syntax": "with <a> <b> in <body> end",
            "summary": "Local variable scope using hidden globals.",
            "detail": (
                "Pops the named values from the stack and stores them in hidden "
                "global cells (__with_a, etc.). Inside the body, reading `a` "
                "compiles to `@`, writing compiles to `!`. The cells persist "
                "across calls and are NOT re-entrant.\n\n"
                "Example:\n"
                "  10 20 with x y in\n"
                "    x y + puti cr   # prints 30\n"
                "  end"
            ),
        },
        {
            "name": "import",
            "category": "Modules",
            "syntax": "import <path>",
            "summary": "Textually include another .sl file.",
            "detail": (
                "Inserts the referenced file. Resolution order:\n"
                "  1. Absolute path\n"
                "  2. Relative to the importing file\n"
                "  3. Each include path (defaults: project root, ./stdlib)\n\n"
                "Each file is included at most once per compilation unit."
            ),
        },
        {
            "name": "[ ... ]",
            "category": "Data",
            "syntax": "[ <values...> ]",
            "summary": "List literal — captures stack segment into mmap'd buffer.",
            "detail": (
                "Captures the intervening stack values into a freshly allocated "
                "buffer. Format: [len, item0, item1, ...] as qwords. "
                "The buffer address is pushed. User must `munmap` when done.\n\n"
                "Example:\n"
                "  [ 1 2 3 4 5 ]   # pushes addr of [5, 1, 2, 3, 4, 5]"
            ),
        },
        {
            "name": "String literals",
            "category": "Data",
            "syntax": "\"<text>\"",
            "summary": "Push (addr len) pair for a string.",
            "detail": (
                "String literals push a (addr len) pair with length on top. "
                "Stored in .data with a trailing NULL for C compatibility. "
                "Escape sequences: \\\", \\\\, \\n, \\r, \\t, \\0.\n\n"
                "Example:\n"
                "  \"hello world\" puts   # prints: hello world"
            ),
        },
        {
            "name": "Number literals",
            "category": "Data",
            "syntax": "123  0xFF  0b1010  0o77",
            "summary": "Push a signed 64-bit integer.",
            "detail": (
                "Numbers are signed 64-bit integers. Supports:\n"
                "  Decimal:  123, -42\n"
                "  Hex:      0xFF, 0x1A\n"
                "  Binary:   0b1010, 0b11110000\n"
                "  Octal:    0o77, 0o755\n"
                "  Float:    3.14, 1e10 (stored as 64-bit IEEE double)"
            ),
        },
        {
            "name": "immediate",
            "category": "Modifiers",
            "syntax": "immediate",
            "summary": "Mark the last-defined word to execute at parse time.",
            "detail": (
                "Applied to the most recently defined word. Immediate words "
                "run during parsing rather than being compiled into the output. "
                "Used for syntax extensions and compile-time computation."
            ),
        },
        {
            "name": "compile-only",
            "category": "Modifiers",
            "syntax": "compile-only",
            "summary": "Mark the last-defined word as compile-only.",
            "detail": (
                "The word can only be used inside other definitions, not at "
                "the top level. Often combined with `immediate`."
            ),
        },
        {
            "name": "priority",
            "category": "Modifiers",
            "syntax": "priority <int>",
            "summary": "Set priority for the next definition (conflict resolution).",
            "detail": (
                "Controls redefinition conflicts. Higher priority wins; "
                "lower-priority definitions are silently ignored. Equal priority "
                "keeps the last definition with a warning."
            ),
        },
        {
            "name": "compile-time",
            "category": "Modifiers",
            "syntax": "compile-time <word>",
            "summary": "Execute a word at compile time but still emit it.",
            "detail": (
                "Runs the named word immediately during compilation, "
                "but its definition is also emitted for runtime use."
            ),
        },
        {
            "name": "syscall",
            "category": "System",
            "syntax": "<argN> ... <arg0> <count> <nr> syscall",
            "summary": "Invoke a Linux system call directly.",
            "detail": (
                "Expects (argN ... arg0 count nr) on the stack. Count is "
                "clamped to [0,6]. Arguments are loaded into rdi, rsi, rdx, r10, "
                "r8, r9. Executes `syscall` and pushes rax.\n\n"
                "Example:\n"
                "  # write(1, addr, len)\n"
                "  addr len 1   # fd=stdout\n"
                "  3 1 syscall  # 3 args, nr=1 (write)"
            ),
        },
        {
            "name": "exit",
            "category": "System",
            "syntax": "<code> exit",
            "summary": "Terminate the process with given exit code.",
            "detail": (
                "Pops the exit code and terminates via sys_exit_group(231). "
                "Convention: 0 = success, non-zero = failure.\n\n"
                "Example:\n"
                "  0 exit   # success"
            ),
        },
    ]

    _LANG_REF_CATEGORIES = []
    _cat_seen: set = set()
    for _lre in _LANG_REF_ENTRIES:
        if _lre["category"] not in _cat_seen:
            _cat_seen.add(_lre["category"])
            _LANG_REF_CATEGORIES.append(_lre["category"])

    _L2_LICENSE_TEXT = (
        "═══════════════════════════════════════════════════════════════\n"
        "          Apache License, Version 2.0\n"
        "          January 2004\n"
        "          http://www.apache.org/licenses/\n"
        "═══════════════════════════════════════════════════════════════\n"
        "\n"
        "  TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n"
        "\n"
        "  1. Definitions.\n"
        "\n"
        "  \"License\" shall mean the terms and conditions for use,\n"
        "  reproduction, and distribution as defined by Sections 1\n"
        "  through 9 of this document.\n"
        "\n"
        "  \"Licensor\" shall mean the copyright owner or entity\n"
        "  authorized by the copyright owner that is granting the\n"
        "  License.\n"
        "\n"
        "  \"Legal Entity\" shall mean the union of the acting entity\n"
        "  and all other entities that control, are controlled by,\n"
        "  or are under common control with that entity. For the\n"
        "  purposes of this definition, \"control\" means (i) the\n"
        "  power, direct or indirect, to cause the direction or\n"
        "  management of such entity, whether by contract or\n"
        "  otherwise, or (ii) ownership of fifty percent (50%) or\n"
        "  more of the outstanding shares, or (iii) beneficial\n"
        "  ownership of such entity.\n"
        "\n"
        "  \"You\" (or \"Your\") shall mean an individual or Legal\n"
        "  Entity exercising permissions granted by this License.\n"
        "\n"
        "  \"Source\" form shall mean the preferred form for making\n"
        "  modifications, including but not limited to software\n"
        "  source code, documentation source, and configuration\n"
        "  files.\n"
        "\n"
        "  \"Object\" form shall mean any form resulting from\n"
        "  mechanical transformation or translation of a Source\n"
        "  form, including but not limited to compiled object code,\n"
        "  generated documentation, and conversions to other media\n"
        "  types.\n"
        "\n"
        "  \"Work\" shall mean the work of authorship, whether in\n"
        "  Source or Object form, made available under the License,\n"
        "  as indicated by a copyright notice that is included in\n"
        "  or attached to the work.\n"
        "\n"
        "  \"Derivative Works\" shall mean any work, whether in\n"
        "  Source or Object form, that is based on (or derived\n"
        "  from) the Work and for which the editorial revisions,\n"
        "  annotations, elaborations, or other modifications\n"
        "  represent, as a whole, an original work of authorship.\n"
        "\n"
        "  \"Contribution\" shall mean any work of authorship,\n"
        "  including the original version of the Work and any\n"
        "  modifications or additions to that Work or Derivative\n"
        "  Works thereof, that is intentionally submitted to the\n"
        "  Licensor for inclusion in the Work by the copyright\n"
        "  owner or by an individual or Legal Entity authorized to\n"
        "  submit on behalf of the copyright owner.\n"
        "\n"
        "  \"Contributor\" shall mean Licensor and any individual or\n"
        "  Legal Entity on behalf of whom a Contribution has been\n"
        "  received by the Licensor and subsequently incorporated\n"
        "  within the Work.\n"
        "\n"
        "  2. Grant of Copyright License.\n"
        "\n"
        "  Subject to the terms and conditions of this License,\n"
        "  each Contributor hereby grants to You a perpetual,\n"
        "  worldwide, non-exclusive, no-charge, royalty-free,\n"
        "  irrevocable copyright license to reproduce, prepare\n"
        "  Derivative Works of, publicly display, publicly perform,\n"
        "  sublicense, and distribute the Work and such Derivative\n"
        "  Works in Source or Object form.\n"
        "\n"
        "  3. Grant of Patent License.\n"
        "\n"
        "  Subject to the terms and conditions of this License,\n"
        "  each Contributor hereby grants to You a perpetual,\n"
        "  worldwide, non-exclusive, no-charge, royalty-free,\n"
        "  irrevocable (except as stated in this section) patent\n"
        "  license to make, have made, use, offer to sell, sell,\n"
        "  import, and otherwise transfer the Work, where such\n"
        "  license applies only to those patent claims licensable\n"
        "  by such Contributor that are necessarily infringed by\n"
        "  their Contribution(s) alone or by combination of their\n"
        "  Contribution(s) with the Work to which such\n"
        "  Contribution(s) was submitted.\n"
        "\n"
        "  If You institute patent litigation against any entity\n"
        "  (including a cross-claim or counterclaim in a lawsuit)\n"
        "  alleging that the Work or a Contribution incorporated\n"
        "  within the Work constitutes direct or contributory\n"
        "  patent infringement, then any patent licenses granted\n"
        "  to You under this License for that Work shall terminate\n"
        "  as of the date such litigation is filed.\n"
        "\n"
        "  4. Redistribution.\n"
        "\n"
        "  You may reproduce and distribute copies of the Work or\n"
        "  Derivative Works thereof in any medium, with or without\n"
        "  modifications, and in Source or Object form, provided\n"
        "  that You meet the following conditions:\n"
        "\n"
        "  (a) You must give any other recipients of the Work or\n"
        "      Derivative Works a copy of this License; and\n"
        "\n"
        "  (b) You must cause any modified files to carry prominent\n"
        "      notices stating that You changed the files; and\n"
        "\n"
        "  (c) You must retain, in the Source form of any Derivative\n"
        "      Works that You distribute, all copyright, patent,\n"
        "      trademark, and attribution notices from the Source\n"
        "      form of the Work, excluding those notices that do\n"
        "      not pertain to any part of the Derivative Works; and\n"
        "\n"
        "  (d) If the Work includes a \"NOTICE\" text file as part\n"
        "      of its distribution, then any Derivative Works that\n"
        "      You distribute must include a readable copy of the\n"
        "      attribution notices contained within such NOTICE\n"
        "      file, excluding any notices that do not pertain to\n"
        "      any part of the Derivative Works, in at least one\n"
        "      of the following places: within a NOTICE text file\n"
        "      distributed as part of the Derivative Works; within\n"
        "      the Source form or documentation, if provided along\n"
        "      with the Derivative Works; or, within a display\n"
        "      generated by the Derivative Works, if and wherever\n"
        "      such third-party notices normally appear.\n"
        "\n"
        "  5. Submission of Contributions.\n"
        "\n"
        "  Unless You explicitly state otherwise, any Contribution\n"
        "  intentionally submitted for inclusion in the Work by You\n"
        "  to the Licensor shall be under the terms and conditions\n"
        "  of this License, without any additional terms or\n"
        "  conditions. Notwithstanding the above, nothing herein\n"
        "  shall supersede or modify the terms of any separate\n"
        "  license agreement you may have executed with Licensor\n"
        "  regarding such Contributions.\n"
        "\n"
        "  6. Trademarks.\n"
        "\n"
        "  This License does not grant permission to use the trade\n"
        "  names, trademarks, service marks, or product names of\n"
        "  the Licensor, except as required for reasonable and\n"
        "  customary use in describing the origin of the Work and\n"
        "  reproducing the content of the NOTICE file.\n"
        "\n"
        "  7. Disclaimer of Warranty.\n"
        "\n"
        "  Unless required by applicable law or agreed to in\n"
        "  writing, Licensor provides the Work (and each\n"
        "  Contributor provides its Contributions) on an \"AS IS\"\n"
        "  BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,\n"
        "  either express or implied, including, without limitation,\n"
        "  any warranties or conditions of TITLE, NON-INFRINGEMENT,\n"
        "  MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE.\n"
        "  You are solely responsible for determining the\n"
        "  appropriateness of using or redistributing the Work and\n"
        "  assume any risks associated with Your exercise of\n"
        "  permissions under this License.\n"
        "\n"
        "  8. Limitation of Liability.\n"
        "\n"
        "  In no event and under no legal theory, whether in tort\n"
        "  (including negligence), contract, or otherwise, unless\n"
        "  required by applicable law (such as deliberate and\n"
        "  grossly negligent acts) or agreed to in writing, shall\n"
        "  any Contributor be liable to You for damages, including\n"
        "  any direct, indirect, special, incidental, or\n"
        "  consequential damages of any character arising as a\n"
        "  result of this License or out of the use or inability\n"
        "  to use the Work (including but not limited to damages\n"
        "  for loss of goodwill, work stoppage, computer failure\n"
        "  or malfunction, or any and all other commercial damages\n"
        "  or losses), even if such Contributor has been advised\n"
        "  of the possibility of such damages.\n"
        "\n"
        "  9. Accepting Warranty or Additional Liability.\n"
        "\n"
        "  While redistributing the Work or Derivative Works\n"
        "  thereof, You may choose to offer, and charge a fee for,\n"
        "  acceptance of support, warranty, indemnity, or other\n"
        "  liability obligations and/or rights consistent with\n"
        "  this License. However, in accepting such obligations,\n"
        "  You may act only on Your own behalf and on Your sole\n"
        "  responsibility, not on behalf of any other Contributor,\n"
        "  and only if You agree to indemnify, defend, and hold\n"
        "  each Contributor harmless for any liability incurred\n"
        "  by, or claims asserted against, such Contributor by\n"
        "  reason of your accepting any such warranty or\n"
        "  additional liability.\n"
        "\n"
        "  END OF TERMS AND CONDITIONS\n"
        "\n"
        "═══════════════════════════════════════════════════════════════\n"
        "\n"
        "  Copyright 2024-2026 Igor Cielniak\n"
        "\n"
        "  Licensed under the Apache License, Version 2.0 (the\n"
        "  \"License\"); you may not use this file except in\n"
        "  compliance with the License. You may obtain a copy at\n"
        "\n"
        "    http://www.apache.org/licenses/LICENSE-2.0\n"
        "\n"
        "  Unless required by applicable law or agreed to in\n"
        "  writing, software distributed under the License is\n"
        "  distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES\n"
        "  OR CONDITIONS OF ANY KIND, either express or implied.\n"
        "  See the License for the specific language governing\n"
        "  permissions and limitations under the License.\n"
        "\n"
        "═══════════════════════════════════════════════════════════════\n"
    )

    _L2_PHILOSOPHY_TEXT = (
        "═══════════════════════════════════════════════════════════\n"
        "          T H E   P H I L O S O P H Y   O F   L 2\n"
        "═══════════════════════════════════════════════════════════\n"
        "\n"
        "  \"Give the programmer raw power and get out of the way.\"\n"
        "\n"
        "───────────────────────────────────────────────────────────\n"
        "\n"
        "  WHAT IS L2?\n"
        "\n"
        "  At its core, L2 is a programmable assembly templating\n"
        "  engine with a Forth-style stack interface. You write\n"
        "  small 'words' that compose into larger programs, and\n"
        "  each word compiles to a known, inspectable sequence of\n"
        "  x86-64 instructions. The language sits just above raw\n"
        "  assembly — close enough to see every byte, high enough\n"
        "  to be genuinely productive.\n"
        "\n"
        "  But L2 is more than a glorified macro assembler. Its\n"
        "  compile-time virtual machine lets you run arbitrary L2\n"
        "  code at compile time: generate words, compute lookup\n"
        "  tables, build structs, or emit entire subsystems before\n"
        "  a single byte of native code is produced. Text macros,\n"
        "  :py blocks, and token hooks extend the syntax in ways\n"
        "  that feel like language features — because they are.\n"
        "\n"
        "───────────────────────────────────────────────────────────\n"
        "\n"
        "  WHY DOES L2 EXIST?\n"
        "\n"
        "  L2 was built for fun — and that's a feature, not an\n"
        "  excuse. It exists because writing a compiler is deeply\n"
        "  satisfying, because Forth's ideas deserve to be pushed\n"
        "  further, and because sometimes you want to write a\n"
        "  program that does exactly what you told it to.\n"
        "\n"
        "  That said, 'fun' doesn't mean 'toy'. L2 produces real\n"
        "  native binaries, links against C libraries, and handles\n"
        "  practical tasks like file I/O, hashmap manipulation,\n"
        "  and async scheduling — all with a minimal runtime.\n"
        "\n"
        "───────────────────────────────────────────────────────────\n"
        "\n"
        "  CORE TENETS\n"
        "\n"
        "  1. SIMPLICITY OVER CONVENIENCE\n"
        "     No garbage collector, no hidden magic. The compiler\n"
        "     emits a minimal runtime you can read and modify.\n"
        "     You own every allocation and every free.\n"
        "\n"
        "  2. TRANSPARENCY\n"
        "     Every word compiles to a known, inspectable\n"
        "     sequence of x86-64 instructions. --emit-asm\n"
        "     shows exactly what runs on the metal.\n"
        "\n"
        "  3. COMPOSABILITY\n"
        "     Small words build big programs. The stack is the\n"
        "     universal interface — no types to reconcile, no\n"
        "     generics to instantiate. If it fits on the stack,\n"
        "     it composes.\n"
        "\n"
        "  4. META-PROGRAMMABILITY\n"
        "     The front-end is user-extensible: text macros, :py\n"
        "     blocks, immediate words, and token hooks reshape\n"
        "     syntax at compile time. The compile-time VM can\n"
        "     execute full L2 programs during compilation, making\n"
        "     the boundary between 'language' and 'metaprogram'\n"
        "     deliberately blurry.\n"
        "\n"
        "  5. UNSAFE BY DESIGN\n"
        "     Safety is the programmer's job, not the language's.\n"
        "     L2 trusts you with raw memory, inline assembly,\n"
        "     and direct syscalls. This is a feature, not a bug.\n"
        "\n"
        "  6. MINIMAL STANDARD LIBRARY\n"
        "     The stdlib provides building blocks — not policy.\n"
        "     It gives you alloc/free, puts/puti, arrays, and\n"
        "     file I/O. Everything else is your choice.\n"
        "\n"
        "  7. FUN FIRST\n"
        "     If using L2 feels like a chore, the design has\n"
        "     failed. The language should reward curiosity and\n"
        "     make you want to dig deeper into how things work.\n"
        "     At least its fun for me to write programs in. ;)"
        "\n"
        "───────────────────────────────────────────────────────────\n"
        "\n"
        "  L2 is for programmers who want to understand every\n"
        "  byte their program emits, and who believe that the\n"
        "  best abstraction is the one you built yourself.\n"
        "\n"
        "═══════════════════════════════════════════════════════════\n"
    )

    _L2_CT_REF_TEXT = (
        "═══════════════════════════════════════════════════════════════\n"
        "        C O M P I L E - T I M E   R E F E R E N C E\n"
        "═══════════════════════════════════════════════════════════════\n"
        "\n"
        "  L2 runs a compile-time virtual machine (the CT VM) during\n"
        "  parsing. Code marked `compile-time`, immediate words, and\n"
        "  :py blocks execute inside this VM. They can inspect and\n"
        "  transform the token stream, emit definitions, manipulate\n"
        "  lists and maps, and control the generated assembly output.\n"
        "\n"
        "  All words listed below are compile-only: they exist only\n"
        "  during compilation and produce no runtime code.\n"
        "\n"
        "  Stack notation:  [*, deeper, deeper | top] -> [*] || [*, result]\n"
        "    *   = rest of stack (unchanged)\n"
        "    |   = separates deeper elements from the top\n"
        "    ->  = before / after\n"
        "    ||  = separates alternative stack effects\n"
        "\n"
        "\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "  § 1  COMPILE-TIME HOOKS\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "\n"
        "  compile-time                             [immediate]\n"
        "    Marks a word definition so that its body\n"
        "    runs in the CT VM. The word's definition\n"
        "    is interpreted by the VM when the\n"
        "    word is referenced during compilation.\n"
        "\n"
        "      word double-ct dup + end\n"
        "      compile-time double-ct\n"
        "\n"
        "  immediate                                [immediate]\n"
        "    Mark the preceding word as immediate: it runs at parse\n"
        "    time whenever the compiler encounters it. Immediate words\n"
        "    receive a MacroContext and can consume tokens, emit ops,\n"
        "    or inject tokens into the stream.\n"
        "\n"
        "  compile-only                             [immediate]\n"
        "    Mark the preceding word as compile-only. It can only be\n"
        "    called during compilation, its asm is not emitted.\n"
        "\n"
        "  inline                                   [immediate]\n"
        "    Mark a word for inline expansion: its body\n"
        "    is expanded at each call site instead of emitting a call.\n"
        "\n"
        "  use-l2-ct                           [immediate, compile-only]\n"
        "    Replace the built-in CT intrinsic of a word with its L2\n"
        "    definition body. With a name on the stack, targets that\n"
        "    word; with an empty stack, targets the most recently\n"
        "    defined word.\n"
        "\n"
        "      word 3dup dup dup dup end  use-l2-ct\n"
        "\n"
        "  set-token-hook                           [compile-only]\n"
        "    [* | name] -> [*]\n"
        "    Register a word as the token hook. Every token the parser\n"
        "    encounters is pushed onto the CT stack, the hook word is\n"
        "    invoked, and the result (0 = not handled, 1 = handled)\n"
        "    tells the parser whether to skip normal processing.\n"
        "\n"
        "  clear-token-hook                         [compile-only]\n"
        "    [*] -> [*]\n"
        "    Remove the currently active token hook.\n"
        "\n"
        "\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "  § 2  LIST OPERATIONS\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "\n"
        "  Lists are dynamic arrays that live in the CT VM. They hold\n"
        "  integers, strings, tokens, other lists, maps, or nil.\n"
        "\n"
        "  list-new          [*] -> [* | list]\n"
        "    Create a new empty list.\n"
        "\n"
        "  list-clone         [* | list] -> [* | copy]\n"
        "    Shallow-copy a list.\n"
        "\n"
        "  list-append        [*, list | value] -> [* | list]\n"
        "    Append value to the end of list (mutates in place).\n"
        "\n"
        "  list-pop            [* | list] -> [*, list | value]\n"
        "    Remove and return the last element.\n"
        "\n"
        "  list-pop-front      [* | list] -> [*, list | value]\n"
        "    Remove and return the first element.\n"
        "\n"
        "  list-peek-front     [* | list] -> [*, list | value]\n"
        "    Return the first element without removing it.\n"
        "\n"
        "  list-push-front     [*, list | value] -> [* | list]\n"
        "    Insert value at the beginning of list.\n"
        "\n"
        "  list-reverse        [* | list] -> [* | list]\n"
        "    Reverse the list in place.\n"
        "\n"
        "  list-length         [* | list] -> [* | n]\n"
        "    Push the number of elements.\n"
        "\n"
        "  list-empty?         [* | list] -> [* | flag]\n"
        "    Push 1 if the list is empty, 0 otherwise.\n"
        "\n"
        "  list-get            [*, list | index] -> [* | value]\n"
        "    Get element at index (0-based). Errors on out-of-range.\n"
        "\n"
        "  list-set            [*, list, index | value] -> [* | list]\n"
        "    Set element at index. Errors on out-of-range.\n"
        "\n"
        "  list-clear          [* | list] -> [* | list]\n"
        "    Remove all elements from the list.\n"
        "\n"
        "  list-extend         [*, target | source] -> [* | target]\n"
        "    Append all elements of source to target.\n"
        "\n"
        "  list-last           [* | list] -> [* | value]\n"
        "    Push the last element without removing it.\n"
        "\n"
        "\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "  § 3  MAP OPERATIONS\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "\n"
        "  Maps are string-keyed dictionaries in the CT VM.\n"
        "\n"
        "  map-new             [*] -> [* | map]\n"
        "    Create a new empty map.\n"
        "\n"
        "  map-set             [*, map, key | value] -> [* | map]\n"
        "    Set key to value in the map (mutates in place).\n"
        "\n"
        "  map-get             [*, map | key] -> [*, map, value | flag]\n"
        "    Look up key. Pushes the map back, then the value\n"
        "    (or nil if absent), then 1 if found or 0 if not.\n"
        "\n"
        "  map-has?            [*, map | key] -> [*, map | flag]\n"
        "    Push 1 if the key exists in the map, 0 otherwise.\n"
        "\n"
        "\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "  § 4  NIL\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "\n"
        "  nil                 [*] -> [* | nil]\n"
        "    Push the nil sentinel value.\n"
        "\n"
        "  nil?                [* | value] -> [* | flag]\n"
        "    Push 1 if the value is nil, 0 otherwise.\n"
        "\n"
        "\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "  § 5  STRING OPERATIONS\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "\n"
        "  Strings in the CT VM are immutable sequences of characters.\n"
        "\n"
        "  string=             [*, a | b] -> [* | flag]\n"
        "    Push 1 if strings a and b are equal, 0 otherwise.\n"
        "\n"
        "  string-length       [* | str] -> [* | n]\n"
        "    Push the length of the string.\n"
        "\n"
        "  string-append       [*, left | right] -> [* | result]\n"
        "    Concatenate two strings.\n"
        "\n"
        "  string>number       [* | str] -> [*, value | flag]\n"
        "    Parse an integer from the string (supports 0x, 0b, 0o\n"
        "    prefixes). Pushes (value, 1) on success or (0, 0) on\n"
        "    failure.\n"
        "\n"
        "  int>string          [* | n] -> [* | str]\n"
        "    Convert an integer to its decimal string representation.\n"
        "\n"
        "  identifier?         [* | value] -> [* | flag]\n"
        "    Push 1 if the value is a valid L2 identifier string,\n"
        "    0 otherwise. Also accepts token objects.\n"
        "\n"
        "\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "  § 6  TOKEN STREAM MANIPULATION\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "\n"
        "  These words give compile-time code direct control over\n"
        "  the token stream the parser reads from.\n"
        "\n"
        "  next-token          [*] -> [* | token]\n"
        "    Consume and push the next token from the parser.\n"
        "\n"
        "  peek-token          [*] -> [* | token]\n"
        "    Push the next token without consuming it.\n"
        "\n"
        "  token-lexeme        [* | token] -> [* | str]\n"
        "    Extract the lexeme (text) from a token or string.\n"
        "\n"
        "  token-from-lexeme   [*, lexeme | template] -> [* | token]\n"
        "    Create a new token with the given lexeme, using source\n"
        "    location from the template token.\n"
        "\n"
        "  inject-tokens       [* | list-of-tokens] -> [*]\n"
        "    Insert a list of token objects at the current parser\n"
        "    position. The parser will read them before continuing\n"
        "    with the original stream.\n"
        "\n"
        "  add-token           [* | str] -> [*]\n"
        "    Register a single-character string as a token separator\n"
        "    recognized by the reader.\n"
        "\n"
        "  add-token-chars     [* | str] -> [*]\n"
        "    Register each character of the string as a token\n"
        "    separator character.\n"
        "\n"
        "  emit-definition     [*, name | body-list] -> [*]\n"
        "    Emit a word definition dynamically. `name` is a token or\n"
        "    string; `body-list` is a list of tokens/strings that form\n"
        "    the word body. Injects the equivalent of\n"
        "      word <name> <body...> end\n"
        "    into the parser's token stream.\n"
        "\n"
        "  ── Control-frame helpers (for custom control structures)\n"
        "\n"
        "  ct-control-frame-new [* | type] -> [* | frame]\n"
        "    Create a control frame map with a `type` field.\n"
        "\n"
        "  ct-control-get       [*, frame | key] -> [* | value]\n"
        "    Read key from a control frame map.\n"
        "\n"
        "  ct-control-set       [*, frame, key | value] -> [* | frame]\n"
        "    Write key/value into a control frame map.\n"
        "\n"
        "  ct-control-push      [* | frame] -> [*]\n"
        "    Push a frame onto the parser control stack.\n"
        "\n"
        "  ct-control-pop       [*] -> [* | frame]\n"
        "    Pop and return the top parser control frame.\n"
        "\n"
        "  ct-control-peek      [*] -> [* | frame] || [* | nil]\n"
        "    Return the top parser control frame without popping.\n"
        "\n"
        "  ct-control-depth     [*] -> [* | n]\n"
        "    Return parser control-stack depth.\n"
        "\n"
        "  ct-control-add-close-op [*, frame, op | data] -> [* | frame]\n"
        "    Append a close operation descriptor to frame.close_ops.\n"
        "\n"
        "  ct-new-label         [* | prefix] -> [* | label]\n"
        "    Allocate a fresh internal label with the given prefix.\n"
        "\n"
        "  ct-emit-op           [*, op | data] -> [*]\n"
        "    Emit an internal op node directly into the current body.\n"
        "\n"
        "  ct-last-token-line   [*] -> [* | line]\n"
        "    Return line number of the last parser token (or 0).\n"
        "\n"
        "  ct-register-block-opener [* | name] -> [*]\n"
        "    Mark a word name as a block opener for `with` nesting.\n"
        "\n"
        "  ct-unregister-block-opener [* | name] -> [*]\n"
        "    Remove a word name from block opener registration.\n"
        "\n"
        "  ct-register-control-override [* | name] -> [*]\n"
        "    Register a control word override so parser can delegate\n"
        "    built-in control handling to custom compile-time words.\n"
        "\n"
        "  ct-unregister-control-override [* | name] -> [*]\n"
        "    Remove a control word override registration.\n"
        "\n"
        "\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "  § 7  LEXER OBJECTS\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "\n"
        "  Lexer objects provide structured token parsing with custom\n"
        "  separator characters. They wrap the main parser and let\n"
        "  macros build mini-DSLs that tokenize differently.\n"
        "\n"
        "  lexer-new           [* | separators] -> [* | lexer]\n"
        "    Create a lexer object with the given separator characters\n"
        "    (e.g. \",;\" to split on commas and semicolons).\n"
        "\n"
        "  lexer-pop           [* | lexer] -> [*, lexer | token]\n"
        "    Consume and return the next token from the lexer.\n"
        "\n"
        "  lexer-peek          [* | lexer] -> [*, lexer | token]\n"
        "    Return the next token without consuming it.\n"
        "\n"
        "  lexer-expect        [*, lexer | str] -> [*, lexer | token]\n"
        "    Consume the next token and assert its lexeme matches str.\n"
        "    Raises a parse error on mismatch.\n"
        "\n"
        "  lexer-collect-brace [* | lexer] -> [*, lexer | list]\n"
        "    Collect all tokens between matching { } braces into a\n"
        "    list. The opening { must be the next token.\n"
        "\n"
        "  lexer-push-back     [* | lexer] -> [* | lexer]\n"
        "    Push the most recently consumed token back onto the\n"
        "    lexer's stream.\n"
        "\n"
        "\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "  § 8  ASSEMBLY OUTPUT CONTROL\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "\n"
        "  These words let compile-time code modify the generated\n"
        "  assembly: the prelude (code inside _start) and the\n"
        "  BSS section (uninitialized data).\n"
        "\n"
        "  prelude-clear       [*] -> [*]\n"
        "    Discard the entire custom prelude.\n"
        "\n"
        "  prelude-append      [* | line] -> [*]\n"
        "    Append a line of assembly to the custom prelude.\n"
        "\n"
        "  prelude-set         [* | list-of-strings] -> [*]\n"
        "    Replace the custom prelude with the given list of\n"
        "    assembly lines.\n"
        "\n"
        "  bss-clear           [*] -> [*]\n"
        "    Discard all custom BSS declarations.\n"
        "\n"
        "  bss-append          [* | line] -> [*]\n"
        "    Append a line to the custom BSS section.\n"
        "\n"
        "  bss-set             [* | list-of-strings] -> [*]\n"
        "    Replace the custom BSS with the given list of lines.\n"
        "\n"
        "\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "  § 9  EXPRESSION HELPER\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "\n"
        "  shunt               [* | token-list] -> [* | postfix-list]\n"
        "    Shunting-yard algorithm. Takes a list of infix token\n"
        "    strings (numbers, identifiers, +, -, *, /, %, parentheses)\n"
        "    and returns the equivalent postfix (RPN) token list.\n"
        "    Useful for building expression-based DSLs.\n"
        "\n"
        "      [\"3\" \"+\" \"4\" \"*\" \"2\"] shunt\n"
        "      # => [\"3\" \"4\" \"2\" \"*\" \"+\"]\n"
        "\n"
        "\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "  § 10  LOOP INDEX\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "\n"
        "  i                   [*] -> [* | index]\n"
        "    Push the current iteration index (0-based) of the\n"
        "    innermost compile-time for loop.\n"
        "\n"
        "\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "  § 11  ASSERTIONS & ERRORS\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "\n"
        "  static_assert       [* | condition] -> [*]\n"
        "    If condition is zero or false, abort compilation with a\n"
        "    static assertion failure (includes source location).\n"
        "\n"
        "  parse-error         [* | message] -> (aborts)\n"
        "    Abort compilation with the given error message.\n"
        "\n"
        "\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "  § 12  EVAL\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "\n"
        "  eval                [* | source-string] -> [*]\n"
        "    Parse and execute a string of L2 code in the CT VM.\n"
        "    The string is tokenized, parsed as if it were part of\n"
        "    a definition body, and the resulting ops are executed\n"
        "    immediately.\n"
        "\n"
        "      \"3 4 +\" eval   # pushes 7 onto the CT stack\n"
        "\n"
        "\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "  § 13  MACRO & TEXT MACRO DEFINITION\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "\n"
        "  macro <name> <number> <body...> ;\n"
        "    Define a text macro that expands during tokenization.\n"
        "    The number is the parameter count. The body tokens are\n"
        "    substituted literally wherever the macro is invoked.\n"
        "\n"
        "      macro BUFFER_SIZE 0 4096 ;\n"
        "      macro MAX 2 >r dup r> dup >r < if drop r> else r> drop end ;\n"
        "\n"
        "  :py { ... }\n"
        "    Embed a Python code block that runs at compile time.\n"
        "    The block receives a `ctx` (MacroContext) variable and\n"
        "    can call ctx.emit(), ctx.next_token(), etc.\n"
        "\n"
        "\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "  § 14  STRUCT & CSTRUCT\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "\n"
        "  struct <name> field <name> <size> ... end\n"
        "    Define a simple struct with manually-sized fields.\n"
        "    Generates accessor words:\n"
        "      <struct>.size           — total byte size\n"
        "      <struct>.<field>.offset — byte offset\n"
        "      <struct>.<field>.size   — field byte size\n"
        "      <struct>.<field>@       — read field (qword)\n"
        "      <struct>.<field>!       — write field (qword)\n"
        "\n"
        "  cstruct <name> cfield <name> <type> ... end\n"
        "    Define a C-compatible struct with automatic alignment\n"
        "    and padding. Field types use C names (int, long, char*,\n"
        "    struct <name>*, etc.). Generates the same accessors as\n"
        "    struct plus <struct>.align.\n"
        "\n"
        "\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "  § 15  FLOW CONTROL LABELS\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "\n"
        "  label <name>                             [immediate]\n"
        "    Emit a named label at the current position in the word\n"
        "    body. Can be targeted by `goto`.\n"
        "\n"
        "  goto <name>                              [immediate]\n"
        "    Emit an unconditional jump to the named label.\n"
        "\n"
        "  here                                     [immediate]\n"
        "    Push a \"file:line:col\" string literal for the current\n"
        "    source location. Useful for error messages and debugging.\n"
        "\n"
        "\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "  § 16  WITH (SCOPED VARIABLES)\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "\n"
        "  with <names...> in <body> end\n"
        "    Pop values from the stack into named local variables.\n"
        "    Inside the body, referencing a name reads the variable;\n"
        "    `name !` writes to it. Variables are backed by hidden\n"
        "    globals and are NOT re-entrant.\n"
        "\n"
        "      10 20 with x y in\n"
        "        x y +   # reads x (10) and y (20), adds -> 30\n"
        "      end\n"
        "\n"
        "\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "  § 17  SUMMARY TABLE\n"
        "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        "\n"
        "  Word                  Category        Stack Effect\n"
        "  ────────────────────  ──────────────  ──────────────────────────\n"
        "  nil                   Nil             [*] -> [* | nil]\n"
        "  nil?                  Nil             [* | v] -> [* | flag]\n"
        "  list-new              List            [*] -> [* | list]\n"
        "  list-clone            List            [* | list] -> [* | copy]\n"
        "  list-append           List            [*, list | v] -> [* | list]\n"
        "  list-pop              List            [* | list] -> [*, list | v]\n"
        "  list-pop-front        List            [* | list] -> [*, list | v]\n"
        "  list-peek-front       List            [* | list] -> [*, list | v]\n"
        "  list-push-front       List            [*, list | v] -> [* | list]\n"
        "  list-reverse          List            [* | list] -> [* | list]\n"
        "  list-length           List            [* | list] -> [* | n]\n"
        "  list-empty?           List            [* | list] -> [* | flag]\n"
        "  list-get              List            [*, list | i] -> [* | v]\n"
        "  list-set              List            [*, list, i | v] -> [* | list]\n"
        "  list-clear            List            [* | list] -> [* | list]\n"
        "  list-extend           List            [*, tgt | src] -> [* | tgt]\n"
        "  list-last             List            [* | list] -> [* | v]\n"
        "  map-new               Map             [*] -> [* | map]\n"
        "  map-set               Map             [*, map, k | v] -> [* | map]\n"
        "  map-get               Map             [*, map | k] -> [*, map, v | f]\n"
        "  map-has?              Map             [*, map | k] -> [*, map | f]\n"
        "  string=               String          [*, a | b] -> [* | flag]\n"
        "  string-length         String          [* | s] -> [* | n]\n"
        "  string-append         String          [*, l | r] -> [* | lr]\n"
        "  string>number         String          [* | s] -> [*, v | flag]\n"
        "  int>string            String          [* | n] -> [* | s]\n"
        "  identifier?           String          [* | v] -> [* | flag]\n"
        "  next-token            Token           [*] -> [* | tok]\n"
        "  peek-token            Token           [*] -> [* | tok]\n"
        "  token-lexeme          Token           [* | tok] -> [* | s]\n"
        "  token-from-lexeme     Token           [*, s | tmpl] -> [* | tok]\n"
        "  inject-tokens         Token           [* | list] -> [*]\n"
        "  add-token             Token           [* | s] -> [*]\n"
        "  add-token-chars       Token           [* | s] -> [*]\n"
        "  emit-definition       Token           [*, name | body] -> [*]\n"
        "  ct-control-frame-new  Control         [* | type] -> [* | frame]\n"
        "  ct-control-get        Control         [*, frame | key] -> [* | value]\n"
        "  ct-control-set        Control         [*, frame, key | value] -> [* | frame]\n"
        "  ct-control-push       Control         [* | frame] -> [*]\n"
        "  ct-control-pop        Control         [*] -> [* | frame]\n"
        "  ct-control-peek       Control         [*] -> [* | frame]\n"
        "  ct-control-depth      Control         [*] -> [* | n]\n"
        "  ct-control-add-close-op Control       [*, frame, op | data] -> [* | frame]\n"
        "  ct-new-label          Control         [* | prefix] -> [* | label]\n"
        "  ct-emit-op            Control         [*, op | data] -> [*]\n"
        "  ct-last-token-line    Control         [*] -> [* | line]\n"
        "  ct-register-block-opener Control      [* | name] -> [*]\n"
        "  ct-unregister-block-opener Control    [* | name] -> [*]\n"
        "  ct-register-control-override Control  [* | name] -> [*]\n"
        "  ct-unregister-control-override Control [* | name] -> [*]\n"
        "  set-token-hook        Hook            [* | name] -> [*]\n"
        "  clear-token-hook      Hook            [*] -> [*]\n"
        "  prelude-clear         Assembly        [*] -> [*]\n"
        "  prelude-append        Assembly        [* | line] -> [*]\n"
        "  prelude-set           Assembly        [* | list] -> [*]\n"
        "  bss-clear             Assembly        [*] -> [*]\n"
        "  bss-append            Assembly        [* | line] -> [*]\n"
        "  bss-set               Assembly        [* | list] -> [*]\n"
        "  shunt                 Expression      [* | list] -> [* | list]\n"
        "  i                     Loop            [*] -> [* | idx]\n"
        "  static_assert         Assert          [* | cond] -> [*]\n"
        "  parse-error           Assert          [* | msg] -> (aborts)\n"
        "  eval                  Eval            [* | str] -> [*]\n"
        "  lexer-new             Lexer           [* | seps] -> [* | lex]\n"
        "  lexer-pop             Lexer           [* | lex] -> [*, lex | tok]\n"
        "  lexer-peek            Lexer           [* | lex] -> [*, lex | tok]\n"
        "  lexer-expect          Lexer           [*, lex | s] -> [*, lex | tok]\n"
        "  lexer-collect-brace   Lexer           [* | lex] -> [*, lex | list]\n"
        "  lexer-push-back       Lexer           [* | lex] -> [* | lex]\n"
        "  use-l2-ct             Hook            [* | name?] -> [*]\n"
        "\n"
        "═══════════════════════════════════════════════════════════════\n"
    )

    _L2_QA_TEXT = (
        "═══════════════════════════════════════════════════════════\n"
        "              Q & A   /   T I P S   &   T R I C K S\n"
        "═══════════════════════════════════════════════════════════\n"
        "\n"
        "  HOW DO I DEBUG AN L2 PROGRAM?\n"
        "\n"
        "    Compile with --debug to embed DWARF debug info, then\n"
        "    launch with --dbg to drop straight into GDB:\n"
        "\n"
        "      python3 main.py my_program.sl --debug --dbg\n"
        "\n"
        "    Inside GDB you can:\n"
        "      - Set breakpoints on word labels  (b word_main)\n"
        "      - Inspect the data stack via r12  (x/8gx $r12)\n"
        "      - Step through asm instructions   (si / ni)\n"
        "      - View registers                  (info registers)\n"
        "      - Disassemble a word              (disas word_foo)\n"
        "\n"
        "    Tip: r12 is the stack pointer. [r12] = TOS,\n"
        "    [r12+8] = second element, etc.\n"
        "\n"
        "  HOW DO I VIEW THE GENERATED ASSEMBLY?\n"
        "\n"
        "    Use --emit-asm to stop after generating assembly:\n"
        "\n"
        "      python3 main.py my_program.sl --emit-asm\n"
        "\n"
        "    The .asm file is written to build/<name>.asm.\n"
        "    You can also use -v1 or higher for timing info,\n"
        "    -v2 for per-function details, and -v3 or -v4 for\n"
        "    full optimization tracing.\n"
        "\n"
        "  HOW DO I CALL C FUNCTIONS?\n"
        "\n"
        "    Declare them with the C-style extern syntax:\n"
        "\n"
        "      extern int printf(const char* fmt, ...)\n"
        "      extern void* malloc(size_t size)\n"
        "\n"
        "    Or use the legacy style:\n"
        "\n"
        "      extern printf 2 1\n"
        "\n"
        "    Link the library with -l:\n"
        "\n"
        "      python3 main.py my_program.sl -l c\n"
        "\n"
        "    You can also use cimport to auto-extract externs:\n"
        "\n"
        "      cimport \"my_header.h\"\n"
        "\n"
        "  HOW DO MACROS WORK?\n"
        "\n"
        "    Text macros are template expansions. Define with\n"
        "    an optional parameter count:\n"
        "\n"
        "      macro square       # 0-arg: inline expansion\n"
        "        dup *\n"
        "      ;\n"
        "\n"
        "      macro defconst 2   # 2-arg: $0 and $1 are args\n"
        "        word $0\n"
        "          $1\n"
        "        end\n"
        "      ;\n"
        "\n"
        "    Use them normally; macro args are positional:\n"
        "\n"
        "      5 square           # expands to: 5 dup *\n"
        "      defconst TEN 10    # defines: word TEN 10 end\n"
        "\n"
        "  HOW DO I RUN CODE AT COMPILE TIME?\n"
        "\n"
        "    Use --ct-run-main or --script to execute 'main' at\n"
        "    compile time. The CT VM supports most stack ops, I/O,\n"
        "    lists, hashmaps, and string manipulation.\n"
        "\n"
        "    You can also mark words as compile-time:\n"
        "\n"
        "      word generate-table\n"
        "        # ... runs during compilation\n"
        "      end\n"
        "      compile-time generate-table\n"
        "\n"
        "  WHAT IS THE --SCRIPT FLAG?\n"
        "\n"
        "    Shorthand for --no-artifact --ct-run-main. It parses\n"
        "    and runs 'main' in the compile-time VM without\n"
        "    producing a binary — useful for scripts as the name suggests.\n"
        "\n"
        "  HOW DO I USE THE BUILD CACHE?\n"
        "\n"
        "    The cache is automatic. It stores assembly output\n"
        "    and skips recompilation when source files haven't\n"
        "    changed. Disable with --no-cache if needed.\n"
        "\n"
        "  HOW DO I DUMP THE CONTROL-FLOW GRAPH?\n"
        "\n"
        "    Use --dump-cfg to produce a Graphviz DOT file:\n"
        "\n"
        "      python3 main.py prog.sl --dump-cfg\n"
        "      dot -Tpng build/prog.cfg.dot -o cfg.png\n"
        "\n"
        "  WHAT OPTIMIZATIONS DOES L2 PERFORM?\n"
        "\n"
        "    - Constant folding (--no-folding to disable)\n"
        "    - Peephole optimization (--no-peephole)\n"
        "    - Loop unrolling (--no-loop-unroll)\n"
        "    - Auto-inlining of small asm bodies (--no-auto-inline)\n"
        "    - Static list folding (--no-static-list-folding)\n"
        "    - Dead code elimination (automatic)\n"
        "    - -O0 disables all optimizations\n"
        "    - -O2 disables all optimizations AND checks\n"
        "\n"
        "══════════════════════════════════════════════════════════\n"
    )

    _L2_HOW_TEXT = (
        "═══════════════════════════════════════════════════════════════\n"
        "          H O W   L 2   W O R K S   (I N T E R N A L S)\n"
        "═══════════════════════════════════════════════════════════════\n"
        "\n"
        "  ARCHITECTURE OVERVIEW\n"
        "\n"
        "    The L2 compiler is a single-pass, single-file Python\n"
        "    program (~13K lines) with these major stages:\n"
        "\n"
        "    1. READER/TOKENIZER\n"
        "       Splits source into whitespace-delimited tokens.\n"
        "       Tracks line, column, and byte offsets per token.\n"
        "       Comment lines (starting with #) in word bodies are\n"
        "       preserved as metadata but not compiled.\n"
        "\n"
        "    2. IMPORT RESOLUTION\n"
        "       'import' and 'cimport' directives are resolved\n"
        "       recursively. Each file is loaded once. Imports are\n"
        "       concatenated into a single token stream with\n"
        "       FileSpan markers for error reporting.\n"
        "\n"
        "    3. PARSER\n"
        "       Walks the token stream and builds an IR Module of\n"
        "       Op lists (one per word definition). Key features:\n"
        "       - Word/asm/py/extern definitions -> dictionary\n"
        "       - Control flow (if/else/end, while/do/end, for)\n"
        "         compiled to label-based jumps\n"
        "       - Macro expansion (text macros with $N params)\n"
        "       - Token hooks for user-extensible syntax\n"
        "       - Compile-time VM execution of immediate words\n"
        "\n"
        "    4. ASSEMBLER / CODE GENERATOR\n"
        "       Converts the Op IR into NASM x86-64 assembly.\n"
        "       Handles calling conventions, extern C FFI with\n"
        "       full System V ABI support (register classification,\n"
        "       struct passing, SSE arguments).\n"
        "\n"
        "    5. NASM + LINKER\n"
        "       The assembly is assembled by NASM into an object\n"
        "       file, then linked (via ld or gcc) into the final\n"
        "       binary.\n"
        "\n"
        "───────────────────────────────────────────────────────────────\n"
        "\n"
        "  THE STACKS\n"
        "\n"
        "    L2 uses register r12 as the stack pointer for its data\n"
        "    stack. The stack grows downward:\n"
        "\n"
        "      push:  sub r12, 8; mov [r12], rax\n"
        "      pop:   mov rax, [r12]; add r12, 8\n"
        "\n"
        "    The return stack lives in a separate buffer with r13 as\n"
        "    its stack pointer (also grows downward). The native x86\n"
        "    call/ret stack (rsp) is used only for word call/return\n"
        "    linkage and C interop.\n"
        "\n"
        "───────────────────────────────────────────────────────────────\n"
        "\n"
        "  THE COMPILE-TIME VM\n"
        "\n"
        "    The CT VM is a stack-based interpreter that runs during\n"
        "    parsing. It maintains:\n"
        "\n"
        "      - A value stack (Python list of ints/strings/lists)\n"
        "      - A dictionary of CT-callable words\n"
        "      - A return stack for nested calls\n"
        "\n"
        "    CT words can:\n"
        "      - Emit token sequences into the compiler's stream\n"
        "      - Inspect/modify the parser state\n"
        "      - Call other CT words or builtins\n"
        "      - Perform I/O, string ops, list/hashmap manipulation\n"
        "\n"
        "    When --ct-run-main is used, the CT VM can also JIT-compile\n"
        "    and execute native x86-64 code via the Keystone assembler\n"
        "    engine (for words that need native performance).\n"
        "\n"
        "───────────────────────────────────────────────────────────────\n"
        "\n"
        "  OPTIMIZATION PASSES\n"
        "\n"
        "    CONSTANT FOLDING\n"
        "      Evaluates pure arithmetic sequences (e.g., 3 4 +\n"
        "      becomes push 7). Works across word boundaries for\n"
        "      inlined words.\n"
        "\n"
        "    PEEPHOLE OPTIMIZATION\n"
        "      Pattern-matches instruction sequences and\n"
        "      replaces them with shorter equivalents. Examples:\n"
        "        swap drop -> nip\n"
        "        swap nip  -> drop\n"
        "\n"
        "    LOOP UNROLLING\n"
        "      Small deterministic loops (e.g., '4 for ... next')\n"
        "      are unrolled into straight-line code when the\n"
        "      iteration count is known at compile time.\n"
        "\n"
        "    AUTO-INLINING\n"
        "      Small asm-body words (below a size threshold) are\n"
        "      automatically inlined at call sites, eliminating\n"
        "      call/ret overhead.\n"
        "\n"
        "    STATIC LIST FOLDING\n"
        "      List literals like [1 2 3] with all-constant\n"
        "      elements are placed in .data instead of being\n"
        "      built at runtime.\n"
        "\n"
        "    DEAD CODE ELIMINATION\n"
        "      Words that are never called (and not 'main') are\n"
        "      excluded from the final assembly output.\n"
        "\n"
        "───────────────────────────────────────────────────────────────\n"
        "\n"
        "  EXTERN C FFI\n"
        "\n"
        "    L2's extern system supports the full System V AMD64 ABI:\n"
        "\n"
        "    - Integer args -> rdi, rsi, rdx, rcx, r8, r9, then stack\n"
        "    - Float/double args -> xmm0..xmm7, then stack\n"
        "    - Struct args classified per ABI eightbyte rules\n"
        "    - Return values in rax (int), xmm0 (float), or via\n"
        "      hidden sret pointer for large structs\n"
        "    - RSP is aligned to 16 bytes before each call\n"
        "\n"
        "    The compiler auto-classifies argument types from the\n"
        "    C-style declaration and generates the correct register\n"
        "    shuffle and stack layout.\n"
        "\n"
        "───────────────────────────────────────────────────────────────\n"
        "\n"
        "  QUIRKS & GOTCHAS\n"
        "\n"
        "    - No type system: everything is a 64-bit integer on\n"
        "      the stack. Pointers, booleans, characters — all\n"
        "      just numbers. Type safety is your responsibility.\n"
        "\n"
        "    - Macro expansion depth: macros can expand macros,\n"
        "      but there's a limit (default 64, configurable via\n"
        "      --macro-expansion-limit).\n"
        "\n"
        "    - :py blocks: Python code embedded in :py { ... }\n"
        "      runs in the compiler's Python process. It has full\n"
        "      access to the parser and dictionary — powerful but\n"
        "      dangerous.\n"
        "\n"
        "    - The CT VM and native codegen share a dictionary\n"
        "      but have separate stacks. A word defined at CT\n"
        "      exists at CT only unless also compiled normally.\n"
        "\n"
        "    - The build cache tracks file mtimes and a hash of\n"
        "      compiler flags. CT side effects invalidate the\n"
        "      cache for that file.\n"
        "\n"
        "═══════════════════════════════════════════════════════════════\n"
    )

    def _parse_sig_counts(effect: str) -> Tuple[int, int]:
        """Parse stack effect to (n_args, n_returns).

        Counts all named items (excluding ``*``) on each side of ``->``.
        Items before ``|`` are deeper stack elements; items after are top.
        Both count as args/returns.

        Handles dual-return with ``||``:
          ``[* | x] -> [* | y] || [*, x | z]``
        Takes the first branch for counting.
        Returns (-1, -1) for unparseable effects.
        """
        if not effect or "->" not in effect:
            return (-1, -1)
        # Split off dual-return: take first branch
        main = effect.split("||")[0].strip()
        parts = main.split("->", 1)
        if len(parts) != 2:
            return (-1, -1)
        lhs, rhs = parts[0].strip(), parts[1].strip()

        def _count_items(side: str) -> int:
            s = side.strip()
            if s.startswith("["):
                s = s[1:]
            if s.endswith("]"):
                s = s[:-1]
            s = s.strip()
            if not s:
                return 0
            # Flatten both sides of pipe and count all non-* items
            all_items = s.replace("|", ",")
            return len([x.strip() for x in all_items.split(",")
                        if x.strip() and x.strip() != "*"])

        return (_count_items(lhs), _count_items(rhs))

    def _safe_addnstr(scr: Any, y: int, x: int, text: str, maxlen: int, attr: int = 0) -> None:
        h, w = scr.getmaxyx()
        if y < 0 or y >= h or x >= w:
            return
        maxlen = min(maxlen, w - x)
        if maxlen <= 0:
            return
        try:
            scr.addnstr(y, x, text, maxlen, attr)
        except curses.error:
            pass

    def _build_detail_lines(entry: DocEntry, width: int) -> List[str]:
        lines: List[str] = []
        lines.append(f"{'Name:':<14} {entry.name}")
        lines.append(f"{'Kind:':<14} {entry.kind}")
        if entry.stack_effect:
            lines.append(f"{'Stack effect:':<14} {entry.stack_effect}")
        else:
            lines.append(f"{'Stack effect:':<14} (none)")
        lines.append(f"{'File:':<14} {entry.path}:{entry.line}")
        lines.append("")
        if entry.description:
            lines.append("Description:")
            # Word-wrap description
            words = entry.description.split()
            current: List[str] = []
            col = 2  # indent
            for w in words:
                if current and col + 1 + len(w) > width - 2:
                    lines.append("  " + " ".join(current))
                    current = [w]
                    col = 2 + len(w)
                else:
                    current.append(w)
                    col += 1 + len(w) if current else len(w)
            if current:
                lines.append("  " + " ".join(current))
        else:
            lines.append("(no description)")
        lines.append("")
        # Show source context
        lines.append("Source context:")
        try:
            src_lines = entry.path.read_text(encoding="utf-8", errors="ignore").splitlines()
            start = max(0, entry.line - 1)
            if entry.kind == "word":
                # Depth-tracking: word/if/while/for/begin/with open blocks closed by 'end'
                _block_openers = {"word", "if", "while", "for", "begin", "with"}
                depth = 0
                end = min(len(src_lines), start + 200)
                for i in range(start, end):
                    stripped = src_lines[i].strip()
                    # Strip comments (# to end of line, but not inside strings)
                    code = stripped.split("#", 1)[0].strip() if "#" in stripped else stripped
                    # Count all block openers and 'end' tokens on the line
                    for tok in code.split():
                        if tok in _block_openers:
                            depth += 1
                        elif tok == "end":
                            depth -= 1
                    prefix = f"  {i + 1:4d}| "
                    lines.append(prefix + src_lines[i])
                    if depth <= 0 and i > start:
                        break
            elif entry.kind in ("asm", "py"):
                # Show until closing brace + a few extra lines of context
                end = min(len(src_lines), start + 200)
                found_close = False
                extra_after = 0
                for i in range(start, end):
                    prefix = f"  {i + 1:4d}| "
                    lines.append(prefix + src_lines[i])
                    stripped = src_lines[i].strip()
                    if not found_close and stripped in ("}", "};") and i > start:
                        found_close = True
                        extra_after = 0
                        continue
                    if found_close:
                        extra_after += 1
                        if extra_after >= 3 or not stripped:
                            break
            elif entry.kind == "macro":
                # Show macro body until closing ';'
                end = min(len(src_lines), start + 200)
                for i in range(start, end):
                    prefix = f"  {i + 1:4d}| "
                    lines.append(prefix + src_lines[i])
                    stripped = src_lines[i].strip()
                    if stripped.endswith(";") and i >= start:
                        break
            else:
                end = min(len(src_lines), start + 30)
                for i in range(start, end):
                    prefix = f"  {i + 1:4d}| "
                    lines.append(prefix + src_lines[i])
        except Exception:
            lines.append("  (unable to read source)")
        return lines

    def _app(stdscr: Any) -> int:
        try:
            curses.curs_set(0)
        except Exception:
            pass
        stdscr.keypad(True)

        # Initialize color pairs for kind tags
        _has_colors = False
        try:
            if curses.has_colors():
                curses.start_color()
                curses.use_default_colors()
                curses.init_pair(1, curses.COLOR_CYAN, -1)     # word
                curses.init_pair(2, curses.COLOR_GREEN, -1)    # asm
                curses.init_pair(3, curses.COLOR_YELLOW, -1)   # py
                curses.init_pair(4, curses.COLOR_MAGENTA, -1)  # macro
                _has_colors = True
        except Exception:
            pass

        _KIND_COLORS = {
            "word": curses.color_pair(1) if _has_colors else 0,
            "asm": curses.color_pair(2) if _has_colors else 0,
            "py": curses.color_pair(3) if _has_colors else 0,
            "macro": curses.color_pair(4) if _has_colors else 0,
        }

        nonlocal entries
        query = initial_query
        selected = 0
        scroll = 0
        mode = _MODE_BROWSE
        active_tab = _TAB_LIBRARY

        # Search mode state
        search_buf = query

        # Detail mode state
        detail_scroll = 0
        detail_lines: List[str] = []

        # Language reference state
        lang_selected = 0
        lang_scroll = 0
        lang_cat_filter = 0  # 0 = all
        lang_detail_scroll = 0
        lang_detail_lines: List[str] = []

        # License/philosophy scroll state
        info_scroll = 0
        info_lines: List[str] = []

        # Filter mode state
        filter_kind_idx = 0  # index into _FILTER_KINDS
        filter_field = 0  # 0=kind, 1=args, 2=returns, 3=show_private, 4=show_macros, 5=extra_path, 6=files
        filter_file_scroll = 0
        filter_file_cursor = 0
        filter_args = -1      # -1 = any
        filter_returns = -1   # -1 = any
        filter_extra_path = ""  # text input for adding paths
        filter_extra_roots: List[str] = []  # accumulated extra paths
        filter_show_private = False
        filter_show_macros = False

        # Build unique file list; all enabled by default
        all_file_paths: List[str] = sorted(set(e.path.as_posix() for e in entries))
        filter_files_enabled: Dict[str, bool] = {p: True for p in all_file_paths}

        def _rebuild_file_list() -> None:
            nonlocal all_file_paths, filter_files_enabled
            new_paths = sorted(set(e.path.as_posix() for e in entries))
            old = filter_files_enabled
            filter_files_enabled = {p: old.get(p, True) for p in new_paths}
            all_file_paths = new_paths

        def _filter_lang_ref() -> List[Dict[str, str]]:
            if lang_cat_filter == 0:
                return list(_LANG_REF_ENTRIES)
            cat = _LANG_REF_CATEGORIES[lang_cat_filter - 1]
            return [e for e in _LANG_REF_ENTRIES if e["category"] == cat]

        def _build_lang_detail_lines(entry: Dict[str, str], width: int) -> List[str]:
            lines: List[str] = []
            lines.append(f"{'Name:':<14} {entry['name']}")
            lines.append(f"{'Category:':<14} {entry['category']}")
            lines.append("")
            lines.append("Syntax:")
            for sl in entry["syntax"].split("\n"):
                lines.append(f"  {sl}")
            lines.append("")
            lines.append(f"{'Summary:':<14} {entry['summary']}")
            lines.append("")
            lines.append("Description:")
            for dl in entry["detail"].split("\n"):
                if len(dl) <= width - 4:
                    lines.append(f"  {dl}")
                else:
                    words = dl.split()
                    current: List[str] = []
                    col = 2
                    for w in words:
                        if current and col + 1 + len(w) > width - 2:
                            lines.append("  " + " ".join(current))
                            current = [w]
                            col = 2 + len(w)
                        else:
                            current.append(w)
                            col += 1 + len(w) if current else len(w)
                    if current:
                        lines.append("  " + " ".join(current))
            return lines

        def _render_tab_bar(scr: Any, y: int, width: int) -> None:
            x = 1
            for i, name in enumerate(_TAB_NAMES):
                label = f" {name} "
                attr = curses.A_REVERSE | curses.A_BOLD if i == active_tab else curses.A_DIM
                _safe_addnstr(scr, y, x, label, width - x - 1, attr)
                x += len(label) + 1
            # Right-aligned shortcuts
            shortcuts = " ? Q&A  H how  P philosophy  L license "
            if x + len(shortcuts) < width:
                _safe_addnstr(scr, y, width - len(shortcuts) - 1, shortcuts, len(shortcuts), curses.A_DIM)

        def _apply_filters(items: List[DocEntry]) -> List[DocEntry]:
            result = items
            kind = _FILTER_KINDS[filter_kind_idx]
            if kind != "all":
                result = [e for e in result if e.kind == kind]
            # File toggle filter
            if not all(filter_files_enabled.get(p, True) for p in all_file_paths):
                result = [e for e in result if filter_files_enabled.get(e.path.as_posix(), True)]
            # Signature filters
            if filter_args >= 0 or filter_returns >= 0:
                filtered = []
                for e in result:
                    n_args, n_rets = _parse_sig_counts(e.stack_effect)
                    if filter_args >= 0 and n_args != filter_args:
                        continue
                    if filter_returns >= 0 and n_rets != filter_returns:
                        continue
                    filtered.append(e)
                result = filtered
            return result

        while True:
            filtered = _apply_filters(_filter_docs(entries, query))
            if selected >= len(filtered):
                selected = max(0, len(filtered) - 1)

            height, width = stdscr.getmaxyx()
            if height < 3 or width < 10:
                stdscr.erase()
                _safe_addnstr(stdscr, 0, 0, "terminal too small", width - 1)
                stdscr.refresh()
                stdscr.getch()
                continue

            # -- DETAIL MODE --
            if mode == _MODE_DETAIL:
                stdscr.erase()
                _safe_addnstr(
                    stdscr, 0, 0,
                    f" {detail_lines[0] if detail_lines else ''} ",
                    width - 1, curses.A_BOLD,
                )
                _safe_addnstr(stdscr, 1, 0, " q/Esc: back  j/k/Up/Down: scroll  PgUp/PgDn ", width - 1, curses.A_DIM)
                body_height = max(1, height - 3)
                max_dscroll = max(0, len(detail_lines) - body_height)
                if detail_scroll > max_dscroll:
                    detail_scroll = max_dscroll
                for row in range(body_height):
                    li = detail_scroll + row
                    if li >= len(detail_lines):
                        break
                    _safe_addnstr(stdscr, 2 + row, 0, detail_lines[li], width - 1)
                pos_text = f" {detail_scroll + 1}-{min(detail_scroll + body_height, len(detail_lines))}/{len(detail_lines)} "
                _safe_addnstr(stdscr, height - 1, 0, pos_text, width - 1, curses.A_DIM)
                stdscr.refresh()
                key = stdscr.getch()
                if key in (27, ord("q"), ord("h"), curses.KEY_LEFT):
                    mode = _MODE_BROWSE
                    continue
                if key in (curses.KEY_DOWN, ord("j")):
                    if detail_scroll < max_dscroll:
                        detail_scroll += 1
                    continue
                if key in (curses.KEY_UP, ord("k")):
                    if detail_scroll > 0:
                        detail_scroll -= 1
                    continue
                if key == curses.KEY_NPAGE:
                    detail_scroll = min(max_dscroll, detail_scroll + body_height)
                    continue
                if key == curses.KEY_PPAGE:
                    detail_scroll = max(0, detail_scroll - body_height)
                    continue
                if key == ord("g"):
                    detail_scroll = 0
                    continue
                if key == ord("G"):
                    detail_scroll = max_dscroll
                    continue
                continue

            # -- FILTER MODE --
            if mode == _MODE_FILTER:
                stdscr.erase()
                _safe_addnstr(stdscr, 0, 0, " Filters ", width - 1, curses.A_BOLD)
                _safe_addnstr(stdscr, 1, 0, " Tab: next field  Space/Left/Right: change  a: all files  n: none  Enter/Esc: close ", width - 1, curses.A_DIM)

                _N_FILTER_FIELDS = 7  # kind, args, returns, show_private, show_macros, extra_path, files
                row_y = 3

                # Kind row
                kind_label = f"  Kind: < {_FILTER_KINDS[filter_kind_idx]:6} >"
                kind_attr = curses.A_REVERSE if filter_field == 0 else 0
                _safe_addnstr(stdscr, row_y, 0, kind_label, width - 1, kind_attr)
                row_y += 1

                # Args row
                args_val = "any" if filter_args < 0 else str(filter_args)
                args_label = f"  Args: < {args_val:6} >"
                args_attr = curses.A_REVERSE if filter_field == 1 else 0
                _safe_addnstr(stdscr, row_y, 0, args_label, width - 1, args_attr)
                row_y += 1

                # Returns row
                rets_val = "any" if filter_returns < 0 else str(filter_returns)
                rets_label = f"  Rets: < {rets_val:6} >"
                rets_attr = curses.A_REVERSE if filter_field == 2 else 0
                _safe_addnstr(stdscr, row_y, 0, rets_label, width - 1, rets_attr)
                row_y += 1

                # Show private row
                priv_val = "yes" if filter_show_private else "no"
                priv_label = f"  Private: < {priv_val:6} >"
                priv_attr = curses.A_REVERSE if filter_field == 3 else 0
                _safe_addnstr(stdscr, row_y, 0, priv_label, width - 1, priv_attr)
                row_y += 1

                # Show macros row
                macro_val = "yes" if filter_show_macros else "no"
                macro_label = f"  Macros: < {macro_val:6} >"
                macro_attr = curses.A_REVERSE if filter_field == 4 else 0
                _safe_addnstr(stdscr, row_y, 0, macro_label, width - 1, macro_attr)
                row_y += 1

                # Extra path row
                if filter_field == 5:
                    ep_label = f"  Path: {filter_extra_path}_"
                    ep_attr = curses.A_REVERSE
                else:
                    ep_label = f"  Path: {filter_extra_path or '(type path, Enter to add)'}"
                    ep_attr = 0
                _safe_addnstr(stdscr, row_y, 0, ep_label, width - 1, ep_attr)
                row_y += 1
                for er in filter_extra_roots:
                    _safe_addnstr(stdscr, row_y, 0, f"    + {er}", width - 1, curses.A_DIM)
                    row_y += 1
                row_y += 1

                # Files section
                files_header = "  Files:"
                files_header_attr = curses.A_BOLD if filter_field == 6 else curses.A_DIM
                _safe_addnstr(stdscr, row_y, 0, files_header, width - 1, files_header_attr)
                row_y += 1

                file_area_top = row_y
                file_area_height = max(1, height - file_area_top - 2)
                n_files = len(all_file_paths)

                if filter_field == 6:
                    # Clamp cursor and scroll
                    if filter_file_cursor >= n_files:
                        filter_file_cursor = max(0, n_files - 1)
                    if filter_file_cursor < filter_file_scroll:
                        filter_file_scroll = filter_file_cursor
                    if filter_file_cursor >= filter_file_scroll + file_area_height:
                        filter_file_scroll = filter_file_cursor - file_area_height + 1
                    max_fscroll = max(0, n_files - file_area_height)
                    if filter_file_scroll > max_fscroll:
                        filter_file_scroll = max_fscroll

                for row in range(file_area_height):
                    fi = filter_file_scroll + row
                    if fi >= n_files:
                        break
                    fp = all_file_paths[fi]
                    mark = "[x]" if filter_files_enabled.get(fp, True) else "[ ]"
                    label = f"    {mark} {fp}"
                    attr = curses.A_REVERSE if (filter_field == 6 and fi == filter_file_cursor) else 0
                    _safe_addnstr(stdscr, file_area_top + row, 0, label, width - 1, attr)

                enabled_count = sum(1 for v in filter_files_enabled.values() if v)
                preview = _apply_filters(_filter_docs(entries, query))
                status = f" {enabled_count}/{n_files} files  kind={_FILTER_KINDS[filter_kind_idx]}  args={args_val}  rets={rets_val}  {len(preview)} matches "
                _safe_addnstr(stdscr, height - 1, 0, status, width - 1, curses.A_DIM)
                stdscr.refresh()
                key = stdscr.getch()
                if key == 27:
                    mode = _MODE_BROWSE
                    selected = 0
                    scroll = 0
                    continue
                if key in (10, 13, curses.KEY_ENTER) and filter_field != 5:
                    mode = _MODE_BROWSE
                    selected = 0
                    scroll = 0
                    continue
                if key == 9:  # Tab
                    filter_field = (filter_field + 1) % _N_FILTER_FIELDS
                    continue
                if filter_field not in (5, 6):
                    if key in (curses.KEY_DOWN, ord("j")):
                        filter_field = (filter_field + 1) % _N_FILTER_FIELDS
                        continue
                    if key in (curses.KEY_UP, ord("k")):
                        filter_field = (filter_field - 1) % _N_FILTER_FIELDS
                        continue
                if filter_field == 0:
                    # Kind field
                    if key in (curses.KEY_LEFT, ord("h")):
                        filter_kind_idx = (filter_kind_idx - 1) % len(_FILTER_KINDS)
                        continue
                    if key in (curses.KEY_RIGHT, ord("l"), ord(" ")):
                        filter_kind_idx = (filter_kind_idx + 1) % len(_FILTER_KINDS)
                        continue
                elif filter_field == 1:
                    # Args field: Left/Right to adjust, -1 = any
                    if key in (curses.KEY_RIGHT, ord("l"), ord(" ")):
                        filter_args += 1
                        if filter_args > 10:
                            filter_args = -1
                        continue
                    if key in (curses.KEY_LEFT, ord("h")):
                        filter_args -= 1
                        if filter_args < -1:
                            filter_args = 10
                        continue
                elif filter_field == 2:
                    # Returns field: Left/Right to adjust
                    if key in (curses.KEY_RIGHT, ord("l"), ord(" ")):
                        filter_returns += 1
                        if filter_returns > 10:
                            filter_returns = -1
                        continue
                    if key in (curses.KEY_LEFT, ord("h")):
                        filter_returns -= 1
                        if filter_returns < -1:
                            filter_returns = 10
                        continue
                elif filter_field == 3:
                    # Show private toggle
                    if key in (curses.KEY_LEFT, curses.KEY_RIGHT, ord("h"), ord("l"), ord(" ")):
                        filter_show_private = not filter_show_private
                        if reload_fn is not None:
                            entries = reload_fn(include_private=filter_show_private, include_macros=filter_show_macros, extra_roots=filter_extra_roots)
                            _rebuild_file_list()
                        continue
                elif filter_field == 4:
                    # Show macros toggle
                    if key in (curses.KEY_LEFT, curses.KEY_RIGHT, ord("h"), ord("l"), ord(" ")):
                        filter_show_macros = not filter_show_macros
                        if reload_fn is not None:
                            entries = reload_fn(include_private=filter_show_private, include_macros=filter_show_macros, extra_roots=filter_extra_roots)
                            _rebuild_file_list()
                        continue
                elif filter_field == 5:
                    # Extra path: text input, Enter adds to roots
                    if key in (10, 13, curses.KEY_ENTER):
                        if filter_extra_path.strip():
                            filter_extra_roots.append(filter_extra_path.strip())
                            filter_extra_path = ""
                            if reload_fn is not None:
                                entries = reload_fn(
                                    include_private=filter_show_private,
                                    include_macros=filter_show_macros,
                                    extra_roots=filter_extra_roots,
                                )
                                _rebuild_file_list()
                        continue
                    if key in (curses.KEY_BACKSPACE, 127, 8):
                        filter_extra_path = filter_extra_path[:-1]
                        continue
                    if 32 <= key <= 126:
                        filter_extra_path += chr(key)
                        continue
                elif filter_field == 6:
                    # Files field
                    if key in (curses.KEY_UP, ord("k")):
                        if filter_file_cursor > 0:
                            filter_file_cursor -= 1
                        continue
                    if key in (curses.KEY_DOWN, ord("j")):
                        if filter_file_cursor + 1 < n_files:
                            filter_file_cursor += 1
                        continue
                    if key == ord(" "):
                        if 0 <= filter_file_cursor < n_files:
                            fp = all_file_paths[filter_file_cursor]
                            filter_files_enabled[fp] = not filter_files_enabled.get(fp, True)
                        continue
                    if key == ord("a"):
                        for fp in all_file_paths:
                            filter_files_enabled[fp] = True
                        continue
                    if key == ord("n"):
                        for fp in all_file_paths:
                            filter_files_enabled[fp] = False
                        continue
                    if key == curses.KEY_PPAGE:
                        filter_file_cursor = max(0, filter_file_cursor - file_area_height)
                        continue
                    if key == curses.KEY_NPAGE:
                        filter_file_cursor = min(max(0, n_files - 1), filter_file_cursor + file_area_height)
                        continue
                continue

            # -- SEARCH MODE --
            if mode == _MODE_SEARCH:
                stdscr.erase()
                prompt = f"/{search_buf}"
                _safe_addnstr(stdscr, 0, 0, prompt, width - 1, curses.A_BOLD)
                preview = _apply_filters(_filter_docs(entries, search_buf))
                _safe_addnstr(stdscr, 1, 0, f" {len(preview)} matches   (Enter: apply  Esc: cancel)", width - 1, curses.A_DIM)
                preview_height = max(1, height - 3)
                for row in range(min(preview_height, len(preview))):
                    e = preview[row]
                    effect = e.stack_effect if e.stack_effect else "(no stack effect)"
                    line = f"  {e.name:24} {effect}"
                    _safe_addnstr(stdscr, 2 + row, 0, line, width - 1)
                stdscr.refresh()
                try:
                    curses.curs_set(1)
                except Exception:
                    pass
                key = stdscr.getch()
                if key == 27:
                    # Cancel search, revert
                    search_buf = query
                    mode = _MODE_BROWSE
                    try:
                        curses.curs_set(0)
                    except Exception:
                        pass
                    continue
                if key in (10, 13, curses.KEY_ENTER):
                    query = search_buf
                    selected = 0
                    scroll = 0
                    mode = _MODE_BROWSE
                    try:
                        curses.curs_set(0)
                    except Exception:
                        pass
                    continue
                if key in (curses.KEY_BACKSPACE, 127, 8):
                    search_buf = search_buf[:-1]
                    continue
                if 32 <= key <= 126:
                    search_buf += chr(key)
                    continue
                continue

            # -- LANGUAGE REFERENCE BROWSE --
            if mode == _MODE_LANG_REF:
                lang_entries = _filter_lang_ref()
                if lang_selected >= len(lang_entries):
                    lang_selected = max(0, len(lang_entries) - 1)

                list_height = max(1, height - 5)
                if lang_selected < lang_scroll:
                    lang_scroll = lang_selected
                if lang_selected >= lang_scroll + list_height:
                    lang_scroll = lang_selected - list_height + 1
                max_ls = max(0, len(lang_entries) - list_height)
                if lang_scroll > max_ls:
                    lang_scroll = max_ls

                stdscr.erase()
                _render_tab_bar(stdscr, 0, width)
                cat_names = ["all"] + _LANG_REF_CATEGORIES
                cat_label = cat_names[lang_cat_filter]
                header = f" Language Reference  {len(lang_entries)} entries  category: {cat_label}"
                _safe_addnstr(stdscr, 1, 0, header, width - 1, curses.A_BOLD)
                hint = " c category  Enter detail  j/k nav  Tab switch  C ct-ref  ? Q&A  H how  P philosophy  q quit"
                _safe_addnstr(stdscr, 2, 0, hint, width - 1, curses.A_DIM)

                for row in range(list_height):
                    idx = lang_scroll + row
                    if idx >= len(lang_entries):
                        break
                    le = lang_entries[idx]
                    cat_tag = f"[{le['category']}]"
                    line = f"  {le['name']:<28} {le['summary']:<36} {cat_tag}"
                    attr = curses.A_REVERSE if idx == lang_selected else 0
                    _safe_addnstr(stdscr, 3 + row, 0, line, width - 1, attr)

                if lang_entries:
                    cur = lang_entries[lang_selected]
                    _safe_addnstr(stdscr, height - 1, 0, f" {cur['syntax'].split(chr(10))[0]}", width - 1, curses.A_DIM)
                stdscr.refresh()
                key = stdscr.getch()

                if key in (27, ord("q")):
                    return 0
                if key == 9:  # Tab
                    active_tab = _TAB_CT_REF
                    info_lines = _L2_CT_REF_TEXT.splitlines()
                    info_scroll = 0
                    mode = _MODE_CT_REF
                    continue
                if key == ord("c"):
                    lang_cat_filter = (lang_cat_filter + 1) % (len(_LANG_REF_CATEGORIES) + 1)
                    lang_selected = 0
                    lang_scroll = 0
                    continue
                if key in (10, 13, curses.KEY_ENTER):
                    if lang_entries:
                        lang_detail_lines = _build_lang_detail_lines(lang_entries[lang_selected], width)
                        lang_detail_scroll = 0
                        mode = _MODE_LANG_DETAIL
                    continue
                if key in (curses.KEY_UP, ord("k")):
                    if lang_selected > 0:
                        lang_selected -= 1
                    continue
                if key in (curses.KEY_DOWN, ord("j")):
                    if lang_selected + 1 < len(lang_entries):
                        lang_selected += 1
                    continue
                if key == curses.KEY_PPAGE:
                    lang_selected = max(0, lang_selected - list_height)
                    continue
                if key == curses.KEY_NPAGE:
                    lang_selected = min(max(0, len(lang_entries) - 1), lang_selected + list_height)
                    continue
                if key == ord("g"):
                    lang_selected = 0
                    lang_scroll = 0
                    continue
                if key == ord("G"):
                    lang_selected = max(0, len(lang_entries) - 1)
                    continue
                if key == ord("L"):
                    info_lines = _L2_LICENSE_TEXT.splitlines()
                    info_scroll = 0
                    mode = _MODE_LICENSE
                    continue
                if key == ord("P"):
                    info_lines = _L2_PHILOSOPHY_TEXT.splitlines()
                    info_scroll = 0
                    mode = _MODE_PHILOSOPHY
                    continue
                if key == ord("?"):
                    info_lines = _L2_QA_TEXT.splitlines()
                    info_scroll = 0
                    mode = _MODE_QA
                    continue
                if key == ord("H"):
                    info_lines = _L2_HOW_TEXT.splitlines()
                    info_scroll = 0
                    mode = _MODE_HOW
                    continue
                if key == ord("C"):
                    active_tab = _TAB_CT_REF
                    info_lines = _L2_CT_REF_TEXT.splitlines()
                    info_scroll = 0
                    mode = _MODE_CT_REF
                    continue

            # -- LANGUAGE DETAIL MODE --
            if mode == _MODE_LANG_DETAIL:
                stdscr.erase()
                _safe_addnstr(
                    stdscr, 0, 0,
                    f" {lang_detail_lines[0] if lang_detail_lines else ''} ",
                    width - 1, curses.A_BOLD,
                )
                _safe_addnstr(stdscr, 1, 0, " q/Esc: back  j/k/Up/Down: scroll  PgUp/PgDn ", width - 1, curses.A_DIM)
                body_height = max(1, height - 3)
                max_ldscroll = max(0, len(lang_detail_lines) - body_height)
                if lang_detail_scroll > max_ldscroll:
                    lang_detail_scroll = max_ldscroll
                for row in range(body_height):
                    li = lang_detail_scroll + row
                    if li >= len(lang_detail_lines):
                        break
                    _safe_addnstr(stdscr, 2 + row, 0, lang_detail_lines[li], width - 1)
                pos_text = f" {lang_detail_scroll + 1}-{min(lang_detail_scroll + body_height, len(lang_detail_lines))}/{len(lang_detail_lines)} "
                _safe_addnstr(stdscr, height - 1, 0, pos_text, width - 1, curses.A_DIM)
                stdscr.refresh()
                key = stdscr.getch()
                if key in (27, ord("q"), ord("h"), curses.KEY_LEFT):
                    mode = _MODE_LANG_REF
                    continue
                if key in (curses.KEY_DOWN, ord("j")):
                    if lang_detail_scroll < max_ldscroll:
                        lang_detail_scroll += 1
                    continue
                if key in (curses.KEY_UP, ord("k")):
                    if lang_detail_scroll > 0:
                        lang_detail_scroll -= 1
                    continue
                if key == curses.KEY_NPAGE:
                    lang_detail_scroll = min(max_ldscroll, lang_detail_scroll + body_height)
                    continue
                if key == curses.KEY_PPAGE:
                    lang_detail_scroll = max(0, lang_detail_scroll - body_height)
                    continue
                if key == ord("g"):
                    lang_detail_scroll = 0
                    continue
                if key == ord("G"):
                    lang_detail_scroll = max_ldscroll
                    continue
                continue

            # -- LICENSE / PHILOSOPHY / Q&A / HOW-IT-WORKS MODE --
            if mode in (_MODE_LICENSE, _MODE_PHILOSOPHY, _MODE_QA, _MODE_HOW):
                _info_titles = {
                    _MODE_LICENSE: "License",
                    _MODE_PHILOSOPHY: "Philosophy of L2",
                    _MODE_QA: "Q&A / Tips & Tricks",
                    _MODE_HOW: "How L2 Works (Internals)",
                }
                title = _info_titles.get(mode, "")
                stdscr.erase()
                _safe_addnstr(stdscr, 0, 0, f" {title} ", width - 1, curses.A_BOLD)
                _safe_addnstr(stdscr, 1, 0, " q/Esc: back  j/k: scroll  PgUp/PgDn ", width - 1, curses.A_DIM)
                body_height = max(1, height - 3)
                max_iscroll = max(0, len(info_lines) - body_height)
                if info_scroll > max_iscroll:
                    info_scroll = max_iscroll
                for row in range(body_height):
                    li = info_scroll + row
                    if li >= len(info_lines):
                        break
                    _safe_addnstr(stdscr, 2 + row, 0, f"  {info_lines[li]}", width - 1)
                pos_text = f" {info_scroll + 1}-{min(info_scroll + body_height, len(info_lines))}/{len(info_lines)} "
                _safe_addnstr(stdscr, height - 1, 0, pos_text, width - 1, curses.A_DIM)
                stdscr.refresh()
                key = stdscr.getch()
                prev_mode = _MODE_LANG_REF if active_tab == _TAB_LANG_REF else (_MODE_CT_REF if active_tab == _TAB_CT_REF else _MODE_BROWSE)
                if key in (27, ord("q"), ord("h"), curses.KEY_LEFT):
                    mode = prev_mode
                    # Restore info_lines when returning to CT ref
                    if prev_mode == _MODE_CT_REF:
                        info_lines = _L2_CT_REF_TEXT.splitlines()
                        info_scroll = 0
                    continue
                if key in (curses.KEY_DOWN, ord("j")):
                    if info_scroll < max_iscroll:
                        info_scroll += 1
                    continue
                if key in (curses.KEY_UP, ord("k")):
                    if info_scroll > 0:
                        info_scroll -= 1
                    continue
                if key == curses.KEY_NPAGE:
                    info_scroll = min(max_iscroll, info_scroll + body_height)
                    continue
                if key == curses.KEY_PPAGE:
                    info_scroll = max(0, info_scroll - body_height)
                    continue
                if key == ord("g"):
                    info_scroll = 0
                    continue
                if key == ord("G"):
                    info_scroll = max_iscroll
                    continue
                continue

            # -- COMPILE-TIME REFERENCE MODE --
            if mode == _MODE_CT_REF:
                stdscr.erase()
                _safe_addnstr(stdscr, 0, 0, " Compile-Time Reference ", width - 1, curses.A_BOLD)
                _render_tab_bar(stdscr, 1, width)
                _safe_addnstr(stdscr, 2, 0, " j/k scroll  PgUp/PgDn  Tab switch  ? Q&A  H how  P philosophy  L license  q quit", width - 1, curses.A_DIM)
                body_height = max(1, height - 4)
                max_iscroll = max(0, len(info_lines) - body_height)
                if info_scroll > max_iscroll:
                    info_scroll = max_iscroll
                for row in range(body_height):
                    li = info_scroll + row
                    if li >= len(info_lines):
                        break
                    _safe_addnstr(stdscr, 3 + row, 0, f"  {info_lines[li]}", width - 1)
                pos_text = f" {info_scroll + 1}-{min(info_scroll + body_height, len(info_lines))}/{len(info_lines)} "
                _safe_addnstr(stdscr, height - 1, 0, pos_text, width - 1, curses.A_DIM)
                stdscr.refresh()
                key = stdscr.getch()
                if key in (27, ord("q")):
                    return 0
                if key == 9:  # Tab
                    active_tab = _TAB_LIBRARY
                    mode = _MODE_BROWSE
                    continue
                if key in (curses.KEY_DOWN, ord("j")):
                    if info_scroll < max_iscroll:
                        info_scroll += 1
                    continue
                if key in (curses.KEY_UP, ord("k")):
                    if info_scroll > 0:
                        info_scroll -= 1
                    continue
                if key == curses.KEY_NPAGE:
                    info_scroll = min(max_iscroll, info_scroll + body_height)
                    continue
                if key == curses.KEY_PPAGE:
                    info_scroll = max(0, info_scroll - body_height)
                    continue
                if key == ord("g"):
                    info_scroll = 0
                    continue
                if key == ord("G"):
                    info_scroll = max_iscroll
                    continue
                if key == ord("L"):
                    info_lines = _L2_LICENSE_TEXT.splitlines()
                    info_scroll = 0
                    mode = _MODE_LICENSE
                    continue
                if key == ord("P"):
                    info_lines = _L2_PHILOSOPHY_TEXT.splitlines()
                    info_scroll = 0
                    mode = _MODE_PHILOSOPHY
                    continue
                if key == ord("?"):
                    info_lines = _L2_QA_TEXT.splitlines()
                    info_scroll = 0
                    mode = _MODE_QA
                    continue
                if key == ord("H"):
                    info_lines = _L2_HOW_TEXT.splitlines()
                    info_scroll = 0
                    mode = _MODE_HOW
                    continue
                if key == ord("C"):
                    active_tab = _TAB_CT_REF
                    info_lines = _L2_CT_REF_TEXT.splitlines()
                    info_scroll = 0
                    mode = _MODE_CT_REF
                    continue
                continue

            # -- BROWSE MODE --
            list_height = max(1, height - 5)
            if selected < scroll:
                scroll = selected
            if selected >= scroll + list_height:
                scroll = selected - list_height + 1
            max_scroll = max(0, len(filtered) - list_height)
            if scroll > max_scroll:
                scroll = max_scroll

            stdscr.erase()
            kind_str = _FILTER_KINDS[filter_kind_idx]
            enabled_count = sum(1 for v in filter_files_enabled.values() if v)
            filter_info = ""
            has_kind_filter = kind_str != "all"
            has_file_filter = enabled_count < len(all_file_paths)
            has_sig_filter = filter_args >= 0 or filter_returns >= 0
            if has_kind_filter or has_file_filter or has_sig_filter or filter_extra_roots or filter_show_private or filter_show_macros:
                parts = []
                if has_kind_filter:
                    parts.append(f"kind={kind_str}")
                if has_file_filter:
                    parts.append(f"files={enabled_count}/{len(all_file_paths)}")
                if filter_args >= 0:
                    parts.append(f"args={filter_args}")
                if filter_returns >= 0:
                    parts.append(f"rets={filter_returns}")
                if filter_show_private:
                    parts.append("private")
                if filter_show_macros:
                    parts.append("macros")
                if filter_extra_roots:
                    parts.append(f"+{len(filter_extra_roots)} paths")
                filter_info = "  [" + ", ".join(parts) + "]"
            header = f" L2 docs  {len(filtered)}/{len(entries)}" + (f"  search: {query}" if query else "") + filter_info
            _safe_addnstr(stdscr, 0, 0, header, width - 1, curses.A_BOLD)
            _render_tab_bar(stdscr, 1, width)
            hint = " / search  f filters  r reload  Enter detail  Tab switch  C ct-ref  ? Q&A  H how  P philosophy  L license  q quit"
            _safe_addnstr(stdscr, 2, 0, hint, width - 1, curses.A_DIM)

            for row in range(list_height):
                idx = scroll + row
                if idx >= len(filtered):
                    break
                entry = filtered[idx]
                effect = entry.stack_effect if entry.stack_effect else ""
                kind_tag = f"[{entry.kind:5}]"
                name_part = f" {entry.name:24} "
                effect_part = f"{effect:30} "
                is_sel = idx == selected
                base_attr = curses.A_REVERSE if is_sel else 0
                y = 3 + row
                # Draw name
                _safe_addnstr(stdscr, y, 0, name_part, width - 1, base_attr | curses.A_BOLD if is_sel else base_attr)
                # Draw stack effect
                x = len(name_part)
                if x < width - 1:
                    _safe_addnstr(stdscr, y, x, effect_part, width - x - 1, base_attr)
                # Draw kind tag with color
                x2 = x + len(effect_part)
                if x2 < width - 1:
                    kind_color = _KIND_COLORS.get(entry.kind, 0) if not is_sel else 0
                    _safe_addnstr(stdscr, y, x2, kind_tag, width - x2 - 1, base_attr | kind_color)

            if filtered:
                current = filtered[selected]
                detail = f" {current.path}:{current.line}"
                if current.description:
                    detail += f"  {current.description}"
                _safe_addnstr(stdscr, height - 1, 0, detail, width - 1, curses.A_DIM)
            else:
                _safe_addnstr(stdscr, height - 1, 0, " No matches", width - 1, curses.A_DIM)

            stdscr.refresh()
            key = stdscr.getch()

            if key in (27, ord("q")):
                return 0
            if key == 9:  # Tab
                active_tab = _TAB_LANG_REF
                mode = _MODE_LANG_REF
                continue
            if key == ord("L"):
                info_lines = _L2_LICENSE_TEXT.splitlines()
                info_scroll = 0
                mode = _MODE_LICENSE
                continue
            if key == ord("P"):
                info_lines = _L2_PHILOSOPHY_TEXT.splitlines()
                info_scroll = 0
                mode = _MODE_PHILOSOPHY
                continue
            if key == ord("?"):
                info_lines = _L2_QA_TEXT.splitlines()
                info_scroll = 0
                mode = _MODE_QA
                continue
            if key == ord("H"):
                info_lines = _L2_HOW_TEXT.splitlines()
                info_scroll = 0
                mode = _MODE_HOW
                continue
            if key == ord("C"):
                active_tab = _TAB_CT_REF
                info_lines = _L2_CT_REF_TEXT.splitlines()
                info_scroll = 0
                mode = _MODE_CT_REF
                continue
            if key == ord("/"):
                search_buf = query
                mode = _MODE_SEARCH
                continue
            if key == ord("f"):
                mode = _MODE_FILTER
                continue
            if key == ord("r"):
                if reload_fn is not None:
                    entries = reload_fn(include_private=filter_show_private, include_macros=filter_show_macros, extra_roots=filter_extra_roots)
                    _rebuild_file_list()
                    selected = 0
                    scroll = 0
                continue
            if key in (10, 13, curses.KEY_ENTER):
                if filtered:
                    detail_lines = _build_detail_lines(filtered[selected], width)
                    detail_scroll = 0
                    mode = _MODE_DETAIL
                continue
            if key in (curses.KEY_UP, ord("k")):
                if selected > 0:
                    selected -= 1
                continue
            if key in (curses.KEY_DOWN, ord("j")):
                if selected + 1 < len(filtered):
                    selected += 1
                continue
            if key == curses.KEY_PPAGE:
                selected = max(0, selected - list_height)
                continue
            if key == curses.KEY_NPAGE:
                selected = min(max(0, len(filtered) - 1), selected + list_height)
                continue
            if key == ord("g"):
                selected = 0
                scroll = 0
                continue
            if key == ord("G"):
                selected = max(0, len(filtered) - 1)
                continue

        return 0

    return int(curses.wrapper(_app))


def run_docs_explorer(
    *,
    source: Optional[Path],
    include_paths: Sequence[Path],
    explicit_roots: Sequence[Path],
    initial_query: str,
    include_undocumented: bool = False,
    include_private: bool = False,
    include_tests: bool = False,
) -> int:
    roots: List[Path] = [Path("."), Path("./stdlib"), Path("./libs")]
    roots.extend(include_paths)
    roots.extend(explicit_roots)
    if source is not None:
        roots.append(source.parent)
        roots.append(source)

    collect_opts: Dict[str, Any] = dict(
        include_undocumented=include_undocumented,
        include_private=include_private,
        include_tests=include_tests,
        include_macros=False,
    )

    def _reload(**overrides: Any) -> List[DocEntry]:
        extra = overrides.pop("extra_roots", [])
        opts = {**collect_opts, **overrides}
        entries = collect_docs(roots, **opts)
        # Scan extra roots directly, bypassing _iter_doc_files skip filters
        # Always include undocumented entries from user-added paths
        if extra:
            seen_names = {e.name for e in entries}
            scan_opts = dict(
                include_undocumented=True,
                include_private=True,
                include_macros=opts.get("include_macros", False),
            )
            for p in extra:
                ep = Path(p).expanduser().resolve()
                if not ep.exists():
                    continue
                if ep.is_file() and ep.suffix == ".sl":
                    for e in _scan_doc_file(ep, **scan_opts):
                        if e.name not in seen_names:
                            seen_names.add(e.name)
                            entries.append(e)
                elif ep.is_dir():
                    for sl in sorted(ep.rglob("*.sl")):
                        for e in _scan_doc_file(sl.resolve(), **scan_opts):
                            if e.name not in seen_names:
                                seen_names.add(e.name)
                                entries.append(e)
            entries.sort(key=lambda item: (item.name.lower(), str(item.path), item.line))
        return entries

    entries = _reload()
    return _run_docs_tui(entries, initial_query=initial_query, reload_fn=_reload)


def cli(argv: Sequence[str]) -> int:
    import argparse
    parser = argparse.ArgumentParser(description="L2 compiler driver")
    parser.add_argument("source", type=Path, nargs="?", default=None, help="input .sl file (optional when --clean is used)")
    parser.add_argument("-o", dest="output", type=Path, default=None, help="output path (defaults vary by artifact)")
    parser.add_argument(
        "-I",
        "--include",
        dest="include_paths",
        action="append",
        default=[],
        type=Path,
        help="add import search path (repeatable)",
    )
    parser.add_argument("--artifact", choices=["exe", "shared", "static", "obj"], default="exe", help="choose final artifact type")
    parser.add_argument("--emit-asm", action="store_true", help="stop after generating asm")
    parser.add_argument("--temp-dir", type=Path, default=Path("build"))
    parser.add_argument("--debug", action="store_true", help="compile with debug info")
    parser.add_argument("--run", action="store_true", help="run the built binary after successful build")
    parser.add_argument("--dbg", action="store_true", help="launch gdb on the built binary after successful build")
    parser.add_argument("--clean", action="store_true", help="remove the temp build directory and exit")
    parser.add_argument("--repl", action="store_true", help="interactive REPL; source file is optional")
    parser.add_argument("-l", dest="libs", action="append", default=[], help="pass library to linker (e.g. -l m or -l libc.so.6)")
    parser.add_argument("--no-folding", action="store_true", help="disable constant folding optimization")
    parser.add_argument(
        "--no-static-list-folding",
        action="store_true",
        help="disable static list-literal folding (lists stay runtime-allocated)",
    )
    parser.add_argument("--no-peephole", action="store_true", help="disable peephole optimizations")
    parser.add_argument("--no-loop-unroll", action="store_true", help="disable loop unrolling optimization")
    parser.add_argument("--no-auto-inline", action="store_true", help="disable auto-inlining of small asm bodies")
    parser.add_argument("-O0", dest="O0", action="store_true", help="disable all optimizations")
    parser.add_argument("-O2", dest="O2", action="store_true", help="fast mode: disable all optimizations and checks")
    parser.add_argument("-v", "--verbose", type=int, default=0, metavar="LEVEL", help="verbosity level (1=summary+timing, 2=per-function/DCE, 3=full debug, 4=optimization detail)")
    parser.add_argument("--no-extern-type-check", action="store_true", help="disable extern function argument count checking")
    parser.add_argument("--no-stack-check", action="store_true", help="disable stack underflow checking for builtins")
    parser.add_argument("--no-cache", action="store_true", help="disable incremental build cache")
    parser.add_argument("--ct-run-main", action="store_true", help="execute 'main' via the compile-time VM after parsing")
    parser.add_argument("--no-artifact", action="store_true", help="compile source but skip producing final output artifact")
    parser.add_argument("--docs", action="store_true", help="open searchable TUI for word/function documentation")
    parser.add_argument(
        "--docs-root",
        action="append",
        default=[],
        type=Path,
        help="extra file/directory root to scan for docs (repeatable)",
    )
    parser.add_argument(
        "--docs-query",
        default="",
        help="initial filter query for --docs mode",
    )
    parser.add_argument(
        "--docs-all",
        action="store_true",
        help="include undocumented and private symbols in docs index",
    )
    parser.add_argument(
        "--docs-include-tests",
        action="store_true",
        help="include tests/extra_tests in docs index",
    )
    parser.add_argument(
        "--script",
        action="store_true",
        help="shortcut for --no-artifact --ct-run-main",
    )
    parser.add_argument(
        "--dump-cfg",
        nargs="?",
        default=None,
        const="__AUTO__",
        metavar="PATH",
        help="write Graphviz DOT control-flow dump (default: <temp-dir>/<source>.cfg.dot)",
    )
    parser.add_argument(
        "--macro-expansion-limit",
        type=int,
        default=DEFAULT_MACRO_EXPANSION_LIMIT,
        help="maximum nested macro expansion depth (default: %(default)s)",
    )
    parser.add_argument(
        "-D",
        dest="defines",
        action="append",
        default=[],
        metavar="NAME",
        help="define a preprocessor symbol for ifdef/ifndef (repeatable)",
    )
    parser.add_argument(
        "--check",
        action="store_true",
        help="validate source without producing artifacts (parse + compile only)",
    )
    parser.add_argument(
        "-W",
        dest="warnings",
        action="append",
        default=[],
        metavar="NAME",
        help="enable warning (e.g. -W redefine, -W stack-depth, -W all)",
    )
    parser.add_argument(
        "--Werror",
        action="store_true",
        help="treat all warnings as errors",
    )

    # Parse known and unknown args to allow -l flags anywhere
    args, unknown = parser.parse_known_args(argv)
    # Collect any -l flags from unknown args (e.g. -lfoo or -l foo)
    i = 0
    while i < len(unknown):
        if unknown[i] == "-l" and i + 1 < len(unknown):
            args.libs.append(unknown[i + 1])
            i += 2
        elif unknown[i].startswith("-l"):
            args.libs.append(unknown[i][2:])
            i += 1
        else:
            i += 1

    if args.script:
        args.no_artifact = True
        args.ct_run_main = True

    if args.check:
        args.no_artifact = True

    if args.macro_expansion_limit < 1:
        parser.error("--macro-expansion-limit must be >= 1")

    artifact_kind = args.artifact
    if args.O2:
        folding_enabled = False
        static_list_folding_enabled = False
        peephole_enabled = False
        loop_unroll_enabled = False
        auto_inline_enabled = False
        extern_type_check_enabled = False
        stack_check_enabled = False
    elif args.O0:
        folding_enabled = False
        static_list_folding_enabled = False
        peephole_enabled = False
        loop_unroll_enabled = False
        auto_inline_enabled = not args.no_auto_inline
        extern_type_check_enabled = not args.no_extern_type_check
        stack_check_enabled = not args.no_stack_check
    else:
        folding_enabled = not args.no_folding
        static_list_folding_enabled = not args.no_static_list_folding
        peephole_enabled = not args.no_peephole
        loop_unroll_enabled = not args.no_loop_unroll
        auto_inline_enabled = not args.no_auto_inline
        extern_type_check_enabled = not args.no_extern_type_check
        stack_check_enabled = not args.no_stack_check
    cfg_output: Optional[Path] = None
    verbosity: int = args.verbose

    if args.ct_run_main and artifact_kind != "exe":
        parser.error("--ct-run-main requires --artifact exe")

    if artifact_kind != "exe" and (args.run or args.dbg):
        parser.error("--run/--dbg are only available when --artifact exe is selected")

    if args.no_artifact and (args.run or args.dbg):
        parser.error("--run/--dbg are not available with --no-artifact")

    if args.clean:
        try:
            if args.temp_dir.exists():
                import shutil
                shutil.rmtree(args.temp_dir)
                print(f"[info] removed {args.temp_dir}")
            else:
                print(f"[info] {args.temp_dir} does not exist")
        except Exception as exc:
            print(f"[error] failed to remove {args.temp_dir}: {exc}")
            return 1
        return 0

    if args.docs:
        return run_docs_explorer(
            source=args.source,
            include_paths=args.include_paths,
            explicit_roots=args.docs_root,
            initial_query=str(args.docs_query or ""),
            include_undocumented=args.docs_all,
            include_private=args.docs_all,
            include_tests=args.docs_include_tests,
        )

    if args.source is None and not args.repl:
        parser.error("the following arguments are required: source")

    if args.dump_cfg is not None:
        if args.repl:
            parser.error("--dump-cfg is not available with --repl")
        if args.source is None:
            parser.error("--dump-cfg requires a source file")
        if args.dump_cfg == "__AUTO__":
            cfg_output = args.temp_dir / f"{args.source.stem}.cfg.dot"
        else:
            cfg_output = Path(args.dump_cfg)

    if not args.repl and args.output is None and not args.no_artifact:
        stem = args.source.stem
        default_outputs = {
            "exe": Path("a.out"),
            "shared": Path(f"lib{stem}.so"),
            "static": Path(f"lib{stem}.a"),
            "obj": Path(f"{stem}.o"),
        }
        args.output = default_outputs[artifact_kind]

    if not args.repl and artifact_kind in {"static", "obj"} and args.libs:
        print("[warn] --libs ignored for static/object outputs")

    ct_run_libs: List[str] = list(args.libs)
    if args.source is not None:
        for lib in _load_sidecar_meta_libs(args.source):
            if lib not in args.libs:
                args.libs.append(lib)
            if lib not in ct_run_libs:
                ct_run_libs.append(lib)

    if args.ct_run_main and args.source is not None:
        import subprocess
        try:
            ct_sidecar = _build_ct_sidecar_shared(args.source, args.temp_dir)
        except subprocess.CalledProcessError as exc:
            print(f"[error] failed to build compile-time sidecar library: {exc}")
            return 1
        if ct_sidecar is not None:
            so_lib = str(ct_sidecar.resolve())
            if so_lib not in ct_run_libs:
                ct_run_libs.append(so_lib)

    compiler = Compiler(
        include_paths=[Path("."), Path("./stdlib"), *args.include_paths],
        macro_expansion_limit=args.macro_expansion_limit,
        defines=args.defines,
    )
    compiler.assembler.enable_constant_folding = folding_enabled
    compiler.assembler.enable_static_list_folding = static_list_folding_enabled
    compiler.assembler.enable_peephole_optimization = peephole_enabled
    compiler.assembler.enable_loop_unroll = loop_unroll_enabled
    compiler.assembler.enable_auto_inline = auto_inline_enabled
    compiler.assembler.enable_extern_type_check = extern_type_check_enabled
    compiler.assembler.enable_stack_check = stack_check_enabled
    compiler.assembler.verbosity = verbosity
    if args.dump_cfg is not None:
        compiler.assembler._need_cfg = True
    # Warning configuration
    warnings_set = set(args.warnings)
    werror = args.Werror
    # Support GCC-style -Werror (single dash, parsed as -W error)
    if "error" in warnings_set:
        warnings_set.discard("error")
        werror = True
    # -Werror without explicit -W categories implies -W all
    if werror and not warnings_set:
        warnings_set.add("all")
    compiler.parser._warnings_enabled = warnings_set
    compiler.parser._werror = werror
    # Route dictionary redefine warnings through the parser's _warn system
    if warnings_set or werror:
        def _dict_warn_cb(name: str, priority: int) -> None:
            compiler.parser._warn(
                compiler.parser._last_token, "redefine",
                f"redefining word {name} (priority {priority})",
            )
        compiler.parser.dictionary.warn_callback = _dict_warn_cb
    cache: Optional[BuildCache] = None
    if not args.no_cache:
        cache = BuildCache(args.temp_dir / ".l2cache")

    try:
        if args.repl:
            return run_repl(compiler, args.temp_dir, args.libs, debug=args.debug, initial_source=args.source)

        entry_mode = "program" if artifact_kind == "exe" else "library"

        # --- assembly-level cache check ---
        asm_text: Optional[str] = None
        fhash = ""
        if cache and not args.ct_run_main and args.dump_cfg is None:
            fhash = cache.flags_hash(
                args.debug,
                folding_enabled,
                static_list_folding_enabled,
                peephole_enabled,
                auto_inline_enabled,
                entry_mode,
            )
            manifest = cache.load_manifest(args.source)
            if manifest and cache.check_fresh(manifest, fhash):
                cached = cache.get_cached_asm(manifest)
                if cached is not None:
                    asm_text = cached
                    if verbosity >= 1:
                        print(f"[v1] cache hit for {args.source}")

        if asm_text is None:
            if verbosity >= 1:
                import time as _time_mod
                _compile_t0 = _time_mod.perf_counter()
            emission = compiler.compile_file(args.source, debug=args.debug, entry_mode=entry_mode)

            # Snapshot assembly text *before* ct-run-main JIT execution, which may
            # corrupt Python heap objects depending on memory layout.
            asm_text = emission.snapshot()
            if verbosity >= 1:
                _compile_dt = (_time_mod.perf_counter() - _compile_t0) * 1000
                print(f"[v1] compilation: {_compile_dt:.1f}ms")
                print(f"[v1] assembly size: {len(asm_text)} bytes")

            if cache and not args.ct_run_main:
                if not fhash:
                    fhash = cache.flags_hash(
                        args.debug,
                        folding_enabled,
                        static_list_folding_enabled,
                        peephole_enabled,
                        auto_inline_enabled,
                        entry_mode,
                    )
                has_ct = bool(compiler.parser.compile_time_vm._ct_executed)
                cache.save(args.source, compiler._loaded_files, fhash, asm_text, has_ct_effects=has_ct)

        if cfg_output is not None:
            cfg_output.parent.mkdir(parents=True, exist_ok=True)
            cfg_dot = compiler.assembler.render_last_cfg_dot()
            cfg_output.write_text(cfg_dot)
            print(f"[info] wrote {cfg_output}")

        if args.ct_run_main:
            try:
                compiler.run_compile_time_word("main", libs=ct_run_libs)
            except CompileTimeError as exc:
                print(f"[error] compile-time execution of 'main' failed: {exc}")
                return 1
    except (ParseError, CompileError, CompileTimeError) as exc:
        # Print all collected diagnostics in Rust-style format
        use_color = sys.stderr.isatty()
        diags = getattr(compiler.parser, 'diagnostics', []) if 'compiler' in dir() else []
        if diags:
            for diag in diags:
                print(diag.format(color=use_color), file=sys.stderr)
            error_count = sum(1 for d in diags if d.level == "error")
            warn_count = sum(1 for d in diags if d.level == "warning")
            summary_parts = []
            if error_count:
                summary_parts.append(f"{error_count} error(s)")
            if warn_count:
                summary_parts.append(f"{warn_count} warning(s)")
            if summary_parts:
                print(f"\n{' and '.join(summary_parts)} emitted", file=sys.stderr)
        else:
            print(f"[error] {exc}", file=sys.stderr)
        return 1
    except Exception as exc:
        print(f"[error] unexpected failure: {exc}", file=sys.stderr)
        return 1

    # Print any warnings accumulated during successful compilation
    use_color = sys.stderr.isatty()
    warnings = [d for d in compiler.parser.diagnostics if d.level == "warning"]
    if warnings:
        for diag in warnings:
            print(diag.format(color=use_color), file=sys.stderr)
        print(f"\n{len(warnings)} warning(s) emitted", file=sys.stderr)

    args.temp_dir.mkdir(parents=True, exist_ok=True)
    asm_path = args.temp_dir / (args.source.stem + ".asm")
    obj_path = args.temp_dir / (args.source.stem + ".o")

    # --- incremental: skip nasm if assembly unchanged ---
    asm_changed = True
    if asm_path.exists():
        existing_asm = asm_path.read_text()
        if existing_asm == asm_text:
            asm_changed = False
    if asm_changed:
        asm_path.write_text(asm_text)

    if args.emit_asm:
        print(f"[info] wrote {asm_path}")
        return 0

    if args.no_artifact:
        print("[info] skipped artifact generation (--no-artifact)")
        return 0

    # --- incremental: skip nasm if .o newer than .asm ---
    need_nasm = asm_changed or not obj_path.exists()
    if not need_nasm:
        try:
            need_nasm = obj_path.stat().st_mtime < asm_path.stat().st_mtime
        except OSError:
            need_nasm = True
    if need_nasm:
        run_nasm(asm_path, obj_path, debug=args.debug)
    if args.output.parent and not args.output.parent.exists():
        args.output.parent.mkdir(parents=True, exist_ok=True)

    # --- incremental: skip linker if output newer than .o AND same source ---
    # Track which .o produced the output so switching source files forces relink.
    link_stamp = args.temp_dir / f"{args.output.name}.link_src"
    need_link = need_nasm or not args.output.exists() or args.no_cache
    if not need_link:
        # Check that the output was linked from the same .o last time.
        try:
            recorded = link_stamp.read_text()
        except OSError:
            recorded = ""
        if recorded != str(obj_path.resolve()):
            need_link = True
    if not need_link:
        try:
            need_link = args.output.stat().st_mtime < obj_path.stat().st_mtime
        except OSError:
            need_link = True

    if artifact_kind == "obj":
        dest = args.output
        if obj_path.resolve() != dest.resolve():
            if need_link:
                import shutil
                shutil.copy2(obj_path, dest)
    elif artifact_kind == "static":
        if need_link:
            build_static_library(obj_path, args.output)
    else:
        if need_link:
            # Remove existing output first to avoid "Text file busy" on Linux
            # when the old binary is still mapped by a running process.
            try:
                args.output.unlink(missing_ok=True)
            except OSError:
                pass
            run_linker(
                obj_path,
                args.output,
                debug=args.debug,
                libs=args.libs,
                shared=(artifact_kind == "shared"),
            )

    if need_link:
        link_stamp.write_text(str(obj_path.resolve()))
        print(f"[info] built {args.output}")
    else:
        print(f"[info] {args.output} is up to date")

    if artifact_kind == "exe":
        import subprocess
        exe_path = Path(args.output).resolve()
        if args.dbg:
            subprocess.run(["gdb", str(exe_path)])
        elif args.run:
            subprocess.run([str(exe_path)])
    return 0


def main() -> None:
    code = cli(sys.argv[1:])
    # Flush all output then use os._exit to avoid SIGSEGV from ctypes/native
    # memory finalization during Python's shutdown sequence.
    sys.stdout.flush()
    sys.stderr.flush()
    os._exit(code)


if __name__ == "__main__":
    main()