From 7faafe9df034c12f99a50750effcf3706b015273 Mon Sep 17 00:00:00 2001
From: igor
Date: Mon, 2 Mar 2026 13:40:51 +0100
Subject: [PATCH] added an option to generate a basic control flow graph for graphviz

---
 SPEC.md |   2 +-
 main.py | 239 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 239 insertions(+), 2 deletions(-)

diff --git a/SPEC.md b/SPEC.md
index a53de48..6dbc70e 100644
--- a/SPEC.md
+++ b/SPEC.md
@@ -9,7 +9,7 @@ This document reflects the implementation that ships in this repository today (`
 - **Unsafe by design** – Memory, syscalls, inline assembly, and FFI expose raw machine power. The standard library is intentionally thin and policy-free.
 
 ## 2. Toolchain and Repository Layout
-- **Driver (`main.py`)** – Supports `python main.py source.sl -o a.out`, `--emit-asm`, `--run`, `--dbg`, `--repl`, `--temp-dir`, `--clean`, repeated `-I/--include` paths, and repeated `-l` linker flags (either `-lfoo` or `-l libc.so.6`). Unknown `-l` flags are collected and forwarded to the linker. Pass `--ct-run-main` to run the program's `main` word on the compile-time VM before NASM/ld run, which surfaces discrepancies between compile-time and runtime semantics. Pass `--no-artifact` to stop after compilation/assembly emission without building an output file, or use `--script` as shorthand for `--no-artifact --ct-run-main`. Pass `--docs` to open a searchable TUI that scans stack-effect comments and nearby docs from `.sl` files (`--docs-query` sets initial filter and `--docs-root` adds scan roots). `--no-folding` disables constant folding and `--no-peephole` disables peephole rewrites (for example `swap drop` → `nip`, `dup drop` removed, `swap over` → `tuck`, `nip drop` → `2drop`, `x 0 +` removed, `x 1 *` removed, `x -1 *` → `neg`, and `not not` removed).
+- **Driver (`main.py`)** – Supports `python main.py source.sl -o a.out`, `--emit-asm`, `--run`, `--dbg`, `--repl`, `--temp-dir`, `--clean`, `--dump-cfg[=path]`, repeated `-I/--include` paths, and repeated `-l` linker flags (either `-lfoo` or `-l libc.so.6`). Unknown `-l` flags are collected and forwarded to the linker. Pass `--ct-run-main` to run the program's `main` word on the compile-time VM before NASM/ld run, which surfaces discrepancies between compile-time and runtime semantics. Pass `--no-artifact` to stop after compilation/assembly emission without building an output file, or use `--script` as shorthand for `--no-artifact --ct-run-main`. Pass `--docs` to open a searchable TUI that scans stack-effect comments and nearby docs from `.sl` files (`--docs-query` sets initial filter and `--docs-root` adds scan roots). `--no-folding` disables constant folding and `--no-peephole` disables peephole rewrites (for example `swap drop` → `nip`, `dup drop` removed, `swap over` → `tuck`, `nip drop` → `2drop`, `x 0 +` removed, `x 1 *` removed, `x -1 *` → `neg`, and `not not` removed).
 - **REPL** – `--repl` launches a stateful session with commands such as `:help`, `:reset`, `:load`, `:call `, `:edit`, and `:show`. The REPL still emits/links entire programs for each run; it simply manages the session source for you.
 - **Imports** – `import relative/or/absolute/path.sl` inserts the referenced file textually. Resolution order: (1) absolute path, (2) relative to the importing file, (3) each include path (defaults: project root and `./stdlib`). Each file is included at most once per compilation unit. Import lines leave blank placeholders so error spans stay meaningful.
 - **Workspace** – `stdlib/` holds library modules, `tests/` contains executable samples with `.expected` outputs, `extra_tests/` houses standalone integration demos, and `libs/` collects opt-in extensions such as `libs/fn.sl` and `libs/nob.sl`.
diff --git a/main.py b/main.py
index ea3ecfe..cfb8b17 100644
--- a/main.py
+++ b/main.py
@@ -3321,6 +3321,211 @@ class Assembler:
         self.enable_static_list_folding = enable_static_list_folding
         self.enable_peephole_optimization = enable_peephole_optimization
         self.loop_unroll_threshold = loop_unroll_threshold
+        self._last_cfg_definitions: List[Definition] = []
+
+    def _copy_definition_for_cfg(self, definition: Definition) -> Definition:
+        """Return a copy of *definition* whose body holds freshly built Op nodes."""
+        return Definition(
+            name=definition.name,
+            body=[Op(op=node.op, data=node.data, loc=node.loc) for node in definition.body],
+            immediate=definition.immediate,
+            compile_only=definition.compile_only,
+            terminator=definition.terminator,
+            inline=definition.inline,
+        )
+
+    def _format_cfg_op(self, node: Op) -> str:
+        """Render one op as a single line of text for a CFG node label."""
+        kind = node._opcode
+        data = node.data
+        if kind == OP_LITERAL:
+            return f"literal {data!r}"
+        if kind == OP_WORD:
+            return f"word {data}"
+        if kind == OP_WORD_PTR:
+            return f"word_ptr {data}"
+        if kind == OP_BRANCH_ZERO:
+            return f"branch_zero {data}"
+        if kind == OP_JUMP:
+            return f"jump {data}"
+        if kind == OP_LABEL:
+            return f"label {data}"
+        if kind == OP_FOR_BEGIN:
+            return f"for_begin loop={data['loop']} end={data['end']}"
+        if kind == OP_FOR_END:
+            return f"for_end loop={data['loop']} end={data['end']}"
+        if kind == OP_LIST_BEGIN:
+            return f"list_begin {data}"
+        if kind == OP_LIST_END:
+            return f"list_end {data}"
+        if kind == OP_LIST_LITERAL:
+            return f"list_literal {data}"
+        return f"{node.op} {data!r}"
+
+    @staticmethod
+    def _dot_escape(text: str) -> str:
+        """Escape backslashes, double quotes and newlines for a quoted DOT string."""
+        return text.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
+
+    @staticmethod
+    def _dot_id(text: str) -> str:
+        """Replace every character outside [A-Za-z0-9_] with an underscore."""
+        # Lossy: distinct names can collide, so callers must add a unique prefix.
+        return re.sub(r"[^A-Za-z0-9_]", "_", text)
+
+    def _definition_cfg_blocks_and_edges(self, definition: Definition) -> Tuple[List[Tuple[int, int]], List[Tuple[int, int, str]]]:
+        """Split a definition body into basic blocks and labelled edges.
+
+        Returns ``(blocks, edges)``: blocks are half-open ``(start, end)`` index
+        ranges into the body; edges are ``(src_block, dst_block, label)`` tuples.
+        """
+        nodes = definition.body
+        if not nodes:
+            return [], []
+
+        label_positions = self._cfg_label_positions(nodes)
+        for_pairs = self._for_pairs(nodes)
+
+        leaders: Set[int] = {0}
+
+        def add_leader(idx: int) -> None:
+            if 0 <= idx < len(nodes):
+                leaders.add(idx)
+
+        for idx, node in enumerate(nodes):
+            kind = node._opcode
+            if kind == OP_LABEL:
+                leaders.add(idx)
+            elif kind == OP_BRANCH_ZERO:
+                target = label_positions.get(str(node.data))
+                if target is not None:
+                    add_leader(target)
+                add_leader(idx + 1)
+            elif kind == OP_JUMP:
+                target = label_positions.get(str(node.data))
+                if target is not None:
+                    add_leader(target)
+                add_leader(idx + 1)
+            elif kind == OP_FOR_BEGIN:
+                end_idx = for_pairs.get(idx)
+                if end_idx is not None:
+                    add_leader(end_idx + 1)
+                add_leader(idx + 1)
+            elif kind == OP_FOR_END:
+                begin_idx = for_pairs.get(idx)
+                if begin_idx is not None:
+                    add_leader(begin_idx + 1)
+                add_leader(idx + 1)
+
+        ordered = sorted(leaders)
+        blocks: List[Tuple[int, int]] = []
+        for i, start in enumerate(ordered):
+            end = ordered[i + 1] if i + 1 < len(ordered) else len(nodes)
+            if start < end:
+                blocks.append((start, end))
+
+        block_by_ip: Dict[int, int] = {}
+        for block_idx, (start, end) in enumerate(blocks):
+            for ip in range(start, end):
+                block_by_ip[ip] = block_idx
+
+        edges: List[Tuple[int, int, str]] = []
+
+        def add_edge(src_block: int, target_ip: int, label: str) -> None:
+            if target_ip < 0 or target_ip >= len(nodes):
+                return
+            dst_block = block_by_ip.get(target_ip)
+            if dst_block is None:
+                return
+            edges.append((src_block, dst_block, label))
+
+        for block_idx, (_start, end) in enumerate(blocks):
+            last_ip = end - 1
+            node = nodes[last_ip]
+            kind = node._opcode
+
+            if kind == OP_BRANCH_ZERO:
+                target = label_positions.get(str(node.data))
+                if target is not None:
+                    add_edge(block_idx, target, "zero")
+                add_edge(block_idx, last_ip + 1, "nonzero")
+                continue
+
+            if kind == OP_JUMP:
+                target = label_positions.get(str(node.data))
+                if target is not None:
+                    add_edge(block_idx, target, "jmp")
+                continue
+
+            if kind == OP_FOR_BEGIN:
+                end_idx = for_pairs.get(last_ip)
+                add_edge(block_idx, last_ip + 1, "enter")
+                if end_idx is not None:
+                    add_edge(block_idx, end_idx + 1, "empty")
+                continue
+
+            if kind == OP_FOR_END:
+                begin_idx = for_pairs.get(last_ip)
+                if begin_idx is not None:
+                    add_edge(block_idx, begin_idx + 1, "loop")
+                add_edge(block_idx, last_ip + 1, "exit")
+                continue
+
+            add_edge(block_idx, last_ip + 1, "next")
+
+        edges.sort(key=lambda item: (item[0], item[1], item[2]))
+        return blocks, edges
+
+    def _cfg_label_positions(self, nodes: Sequence[Op]) -> Dict[str, int]:
+        """Map each label name (as a string) to its index in *nodes*."""
+        positions: Dict[str, int] = {}
+        for idx, node in enumerate(nodes):
+            if node._opcode == OP_LABEL:
+                positions[str(node.data)] = idx
+        return positions
+
+    def render_last_cfg_dot(self) -> str:
+        """Return Graphviz DOT text for the definitions captured during the last emission."""
+        lines: List[str] = [
+            "digraph l2_cfg {",
+            "  rankdir=LR;",
+            "  node [shape=box, fontname=\"Courier\"];",
+            "  edge [fontname=\"Courier\"];",
+        ]
+
+        if not self._last_cfg_definitions:
+            lines.append("  empty [label=\"No runtime high-level definitions available for CFG dump\"];")
+            lines.append("}")
+            return "\n".join(lines)
+
+        for def_idx, defn in enumerate(self._last_cfg_definitions):
+            # _dot_id() is lossy (e.g. "a?" and "a!" both become "a_"), so qualify
+            # every identifier with the definition index to keep graphs separate.
+            prefix = f"d{def_idx}_{self._dot_id(defn.name)}"
+            lines.append(f"  subgraph cluster_{prefix} {{")
+            lines.append(f"    label=\"{self._dot_escape(defn.name)}\";")
+
+            blocks, edges = self._definition_cfg_blocks_and_edges(defn)
+            if not blocks:
+                lines.append(f"    {prefix}_empty [label=\"(empty)\"];")
+                lines.append("  }")
+                continue
+
+            for block_idx, (start, end) in enumerate(blocks):
+                node_id = f"{prefix}_b{block_idx}"
+                op_lines = [f"{ip}: {self._format_cfg_op(defn.body[ip])}" for ip in range(start, end)]
+                label = "\\l".join(self._dot_escape(line) for line in op_lines) + "\\l"
+                lines.append(f"    {node_id} [label=\"{label}\"];")
+
+            for src, dst, edge_label in edges:
+                src_id = f"{prefix}_b{src}"
+                dst_id = f"{prefix}_b{dst}"
+                lines.append(f"    {src_id} -> {dst_id} [label=\"{self._dot_escape(edge_label)}\"];")
+
+            lines.append("  }")
+
+        lines.append("}")
+        return "\n".join(lines)
 
     def _peephole_optimize_definition(self, definition: Definition) -> None:
         # Rewrite short stack-manipulation sequences into canonical forms.
@@ -3735,6 +3940,7 @@ class Assembler:
         is_program = entry_mode == "program"
         emission = Emission()
         self._export_all_defs = not is_program
+        self._last_cfg_definitions = []
         try:
             self._emit_externs(emission.text)
             # Determine whether user provided a top-level `:asm _start` in
@@ -3840,6 +4046,12 @@ class Assembler:
             # Inline-only definitions are expanded at call sites; skip emitting standalone labels.
             runtime_defs = [defn for defn in runtime_defs if not getattr(defn, "inline", False)]
 
+            self._last_cfg_definitions = [
+                self._copy_definition_for_cfg(defn)
+                for defn in runtime_defs
+                if isinstance(defn, Definition)
+            ]
+
             for definition in runtime_defs:
                 self._emit_definition(definition, emission.text, debug=debug)
 
@@ -7661,6 +7873,14 @@ def cli(argv: Sequence[str]) -> int:
         action="store_true",
         help="shortcut for --no-artifact --ct-run-main",
     )
+    parser.add_argument(
+        "--dump-cfg",
+        nargs="?",
+        default=None,
+        const="__AUTO__",
+        metavar="PATH",
+        help="write Graphviz DOT control-flow dump (default: <temp-dir>/<source-stem>.cfg.dot)",
+    )
 
     # Parse known and unknown args to allow -l flags anywhere
     args, unknown = parser.parse_known_args(argv)
@@ -7684,6 +7904,7 @@ def cli(argv: Sequence[str]) -> int:
     folding_enabled = not args.no_folding
     static_list_folding_enabled = not args.no_static_list_folding
     peephole_enabled = not args.no_peephole
+    cfg_output: Optional[Path] = None
 
     if args.ct_run_main and artifact_kind != "exe":
         parser.error("--ct-run-main requires --artifact exe")
@@ -7720,6 +7941,16 @@ def cli(argv: Sequence[str]) -> int:
     if args.source is None and not args.repl:
         parser.error("the following arguments are required: source")
 
+    if args.dump_cfg is not None:
+        if args.repl:
+            parser.error("--dump-cfg is not available with --repl")
+        if args.source is None:
+            parser.error("--dump-cfg requires a source file")
+        if args.dump_cfg == "__AUTO__":
+            cfg_output = args.temp_dir / f"{args.source.stem}.cfg.dot"
+        else:
+            cfg_output = Path(args.dump_cfg)
+
     if not args.repl and args.output is None and not args.no_artifact:
         stem = args.source.stem
         default_outputs = {
@@ -7770,7 +8001,7 @@ def cli(argv: Sequence[str]) -> int:
         # --- assembly-level cache check ---
         asm_text: Optional[str] = None
         fhash = ""
-        if cache and not args.ct_run_main:
+        if cache and not args.ct_run_main and args.dump_cfg is None:
             fhash = cache.flags_hash(
                 args.debug,
                 folding_enabled,
@@ -7803,6 +8034,12 @@ def cli(argv: Sequence[str]) -> int:
                 has_ct = bool(compiler.parser.compile_time_vm._ct_executed)
                 cache.save(args.source, compiler._loaded_files, fhash, asm_text, has_ct_effects=has_ct)
 
+        if cfg_output is not None:
+            cfg_output.parent.mkdir(parents=True, exist_ok=True)
+            cfg_dot = compiler.assembler.render_last_cfg_dot()
+            cfg_output.write_text(cfg_dot, encoding="utf-8")
+            print(f"[info] wrote {cfg_output}")
+
         if args.ct_run_main:
             try:
                 compiler.run_compile_time_word("main", libs=ct_run_libs)