diff --git a/__pycache__/main.cpython-314.pyc b/__pycache__/main.cpython-314.pyc
index 8c2e145..a3aa38d 100644
Binary files a/__pycache__/main.cpython-314.pyc and b/__pycache__/main.cpython-314.pyc differ
diff --git a/a.out b/a.out
index 723c48f..c3e95ec 100755
Binary files a/a.out and b/a.out differ
diff --git a/a.sl b/a.sl
index 9f3cb37..f18d4fc 100644
--- a/a.sl
+++ b/a.sl
@@ -2,4 +2,5 @@ import stdlib.sl
 
 : main
     "hello world" puts
-;
\ No newline at end of file
+;
+compile-time main
\ No newline at end of file
diff --git a/build/a.asm b/build/a.asm
index f676e86..12c9dff 100644
--- a/build/a.asm
+++ b/build/a.asm
@@ -19,7 +19,7 @@ _start:
     mov rax, 60
     syscall
 word_puts:
-	; detects string if top is len>=0 and next is a pointer in [data_start, data_end)
+	; detects string if top is len>=0 and next is a pointer in [data_start, data_end]
 	mov rax, [r12]      ; len or int value
 	mov rbx, [r12 + 8]  ; possible address
 	cmp rax, 0
@@ -114,6 +114,64 @@ word_swap:
 	mov [r12], rbx
 	mov [r12 + 8], rax
     ret
+word_rot:                   ; ( x1 x2 x3 -- x2 x3 x1 ): the third cell is rotated up to the top
+	mov rax, [r12]       ; x3
+	mov rbx, [r12 + 8]   ; x2
+	mov rcx, [r12 + 16]  ; x1
+	mov [r12], rcx       ; top = x1
+	mov [r12 + 8], rax   ; next = x3
+	mov [r12 + 16], rbx  ; third = x2
+    ret
+word__2drot:                ; ( x1 x2 x3 -- x3 x1 x2 ): the top cell is rotated down to third place
+	mov rax, [r12]       ; x3
+	mov rbx, [r12 + 8]   ; x2
+	mov rcx, [r12 + 16]  ; x1
+	mov [r12], rbx       ; top = x2
+	mov [r12 + 8], rcx   ; next = x1
+	mov [r12 + 16], rax  ; third = x3
+    ret
+word_nip:                   ; ( x1 x2 -- x2 ): removes the cell beneath the top
+	mov rax, [r12]
+	add r12, 8           ; drop lower element
+	mov [r12], rax       ; keep original top
+    ret
+word_tuck:                  ; ( x1 x2 -- x2 x1 x2 ): copies the top cell beneath the second
+	mov rax, [r12]       ; x2
+	mov rbx, [r12 + 8]   ; x1
+	sub r12, 8           ; make room
+	mov [r12], rax       ; x2
+	mov [r12 + 8], rbx   ; x1
+	mov [r12 + 16], rax  ; x2
+    ret
+word_2dup:                  ; ( a b -- a b a b ): duplicates the top pair
+	mov rax, [r12]       ; b
+	mov rbx, [r12 + 8]   ; a
+	sub r12, 8
+	mov [r12], rbx       ; push a
+	sub r12, 8
+	mov [r12], rax       ; push b
+    ret
+word_2drop:                 ; ( a b -- ): discards the top pair
+	add r12, 16          ; two 8-byte cells
+    ret
+word_2swap:                 ; ( a b c d -- c d a b ): exchanges the top two pairs
+	mov rax, [r12]        ; d
+	mov rbx, [r12 + 8]    ; c
+	mov rcx, [r12 + 16]   ; b
+	mov rdx, [r12 + 24]   ; a
+	mov [r12], rcx        ; top = b
+	mov [r12 + 8], rdx    ; next = a
+	mov [r12 + 16], rax   ; third = d
+	mov [r12 + 24], rbx   ; fourth = c
+    ret
+word_2over:                 ; ( a b c d -- a b c d a b ): copies the second pair on top
+	mov rax, [r12 + 16]   ; b
+	mov rbx, [r12 + 24]   ; a
+	sub r12, 8
+	mov [r12], rbx        ; push a
+	sub r12, 8
+	mov [r12], rax        ; push b
+    ret
 word__2b:
 	mov rax, [r12]
 	add r12, 8
@@ -242,6 +300,39 @@ word_exit:
 	mov rax, 60
 	syscall
     ret
+word_and:                   ; ( a b -- flag ): logical AND; flag = 1 only when a and b are both non-zero
+	mov rax, [r12]       ; b (top cell)
+	add r12, 8           ; pop b; the result overwrites a's cell
+	mov rbx, [r12]       ; a
+	test rax, rax
+	setnz cl             ; cl = (b != 0); setz here made the word compute NOR
+	test rbx, rbx
+	setnz dl             ; dl = (a != 0)
+	movzx rcx, cl
+	movzx rdx, dl
+	and rcx, rdx
+	mov [r12], rcx
+    ret
+word_or:                    ; ( a b -- flag ): logical OR; flag = 1 when a or b is non-zero
+	mov rax, [r12]       ; b (top cell)
+	add r12, 8           ; pop b; the result overwrites a's cell
+	mov rbx, [r12]       ; a
+	test rax, rax
+	setnz cl             ; cl = (b != 0); setz here made the word compute NAND
+	test rbx, rbx
+	setnz dl             ; dl = (a != 0)
+	movzx rcx, cl
+	movzx rdx, dl
+	or rcx, rdx
+	mov [r12], rcx
+    ret
+word_not:                   ; ( x -- flag ): flag = 1 when x is zero, otherwise 0
+	mov rax, [r12]
+	test rax, rax
+	setz al              ; al = (x == 0)
+	movzx rax, al        ; widen the 0/1 byte to a full cell
+	mov [r12], rax
+    ret
 word__3er:
 	mov rax, [r12]
 	add r12, 8
diff --git a/build/a.o b/build/a.o
index fb71081..9d06821 100644
Binary files a/build/a.o and b/build/a.o differ
diff --git a/build/call_syntax_parens.asm b/build/call_syntax_parens.asm
index 031a9dd..e644000 100644
--- a/build/call_syntax_parens.asm
+++ b/build/call_syntax_parens.asm
@@ -19,7 +19,7 @@ _start:
     mov rax, 60
     syscall
 word_puts:
-	; detects string if top is len>=0 and next is a pointer in [data_start, data_end)
+	; detects string if top is len>=0 and next is a pointer in [data_start, data_end]
 	mov rax, [r12]      ; len or int value
 	mov rbx, [r12 + 8]  ; possible address
 	cmp rax, 0
@@ -114,6 +114,64 @@ word_swap:
 	mov [r12], rbx
 	mov [r12 + 8], rax
     ret
+word_rot:                   ; ( x1 x2 x3 -- x2 x3 x1 ): the third cell is rotated up to the top
+	mov rax, [r12]       ; x3
+	mov rbx, [r12 + 8]   ; x2
+	mov rcx, [r12 + 16]  ; x1
+	mov [r12], rcx       ; top = x1
+	mov [r12 + 8], rax   ; next = x3
+	mov [r12 + 16], rbx  ; third = x2
+    ret
+word__2drot:                ; ( x1 x2 x3 -- x3 x1 x2 ): the top cell is rotated down to third place
+	mov rax, [r12]       ; x3
+	mov rbx, [r12 + 8]   ; x2
+	mov rcx, [r12 + 16]  ; x1
+	mov [r12], rbx       ; top = x2
+	mov [r12 + 8], rcx   ; next = x1
+	mov [r12 + 16], rax  ; third = x3
+    ret
+word_nip:                   ; ( x1 x2 -- x2 ): removes the cell beneath the top
+	mov rax, [r12]
+	add r12, 8           ; drop lower element
+	mov [r12], rax       ; keep original top
+    ret
+word_tuck:                  ; ( x1 x2 -- x2 x1 x2 ): copies the top cell beneath the second
+	mov rax, [r12]       ; x2
+	mov rbx, [r12 + 8]   ; x1
+	sub r12, 8           ; make room
+	mov [r12], rax       ; x2
+	mov [r12 + 8], rbx   ; x1
+	mov [r12 + 16], rax  ; x2
+    ret
+word_2dup:                  ; ( a b -- a b a b ): duplicates the top pair
+	mov rax, [r12]       ; b
+	mov rbx, [r12 + 8]   ; a
+	sub r12, 8
+	mov [r12], rbx       ; push a
+	sub r12, 8
+	mov [r12], rax       ; push b
+    ret
+word_2drop:                 ; ( a b -- ): discards the top pair
+	add r12, 16          ; two 8-byte cells
+    ret
+word_2swap:                 ; ( a b c d -- c d a b ): exchanges the top two pairs
+	mov rax, [r12]        ; d
+	mov rbx, [r12 + 8]    ; c
+	mov rcx, [r12 + 16]   ; b
+	mov rdx, [r12 + 24]   ; a
+	mov [r12], rcx        ; top = b
+	mov [r12 + 8], rdx    ; next = a
+	mov [r12 + 16], rax   ; third = d
+	mov [r12 + 24], rbx   ; fourth = c
+    ret
+word_2over:                 ; ( a b c d -- a b c d a b ): copies the second pair on top
+	mov rax, [r12 + 16]   ; b
+	mov rbx, [r12 + 24]   ; a
+	sub r12, 8
+	mov [r12], rbx        ; push a
+	sub r12, 8
+	mov [r12], rax        ; push b
+    ret
 word__2b:
 	mov rax, [r12]
 	add r12, 8
@@ -242,6 +300,39 @@ word_exit:
 	mov rax, 60
 	syscall
     ret
+word_and:                   ; ( a b -- flag ): logical AND; flag = 1 only when a and b are both non-zero
+	mov rax, [r12]       ; b (top cell)
+	add r12, 8           ; pop b; the result overwrites a's cell
+	mov rbx, [r12]       ; a
+	test rax, rax
+	setnz cl             ; cl = (b != 0); setz here made the word compute NOR
+	test rbx, rbx
+	setnz dl             ; dl = (a != 0)
+	movzx rcx, cl
+	movzx rdx, dl
+	and rcx, rdx
+	mov [r12], rcx
+    ret
+word_or:                    ; ( a b -- flag ): logical OR; flag = 1 when a or b is non-zero
+	mov rax, [r12]       ; b (top cell)
+	add r12, 8           ; pop b; the result overwrites a's cell
+	mov rbx, [r12]       ; a
+	test rax, rax
+	setnz cl             ; cl = (b != 0); setz here made the word compute NAND
+	test rbx, rbx
+	setnz dl             ; dl = (a != 0)
+	movzx rcx, cl
+	movzx rdx, dl
+	or rcx, rdx
+	mov [r12], rcx
+    ret
+word_not:                   ; ( x -- flag ): flag = 1 when x is zero, otherwise 0
+	mov rax, [r12]
+	test rax, rax
+	setz al              ; al = (x == 0)
+	movzx rax, al        ; widen the 0/1 byte to a full cell
+	mov [r12], rax
+    ret
 word__3er:
 	mov rax, [r12]
 	add r12, 8
diff --git a/build/call_syntax_parens.o b/build/call_syntax_parens.o
index 4cfec6c..1c32cdf 100644
Binary files a/build/call_syntax_parens.o and b/build/call_syntax_parens.o differ
diff --git a/build/hello.asm b/build/hello.asm
new file mode 100644
index 0000000..1d5188d
--- /dev/null
+++ b/build/hello.asm
@@ -0,0 +1,388 @@
+section .text
+%define DSTK_BYTES 65536
+%define RSTK_BYTES 65536
+%define PRINT_BUF_BYTES 128
+global _start
+_start:
+    ; set up both stack pointers; r15 remembers where the empty data stack sits
+    lea r13, [rel rstack_top]
+    lea r12, [rel dstack_top]
+    mov r15, r12
+    call word_main
+    xor eax, eax                ; exit status defaults to 0
+    cmp r12, r15
+    je .no_exit_value           ; data stack empty: keep the default
+    mov rax, [r12]              ; otherwise the top cell becomes the status
+    add r12, 8
+.no_exit_value:
+    mov rdi, rax
+    mov rax, 60                 ; same syscall number word_exit uses
+    syscall
+word_puts:                  ; ( addr len -- ) prints a string, or ( n -- ) prints an integer; a newline follows either
+	; detects string if top is len>=0 and next is a pointer in [data_start, data_end)
+	mov rax, [r12]      ; len or int value
+	mov rbx, [r12 + 8]  ; possible address
+	cmp rax, 0
+	jl puts_print_int   ; a negative top cell cannot be a length
+	lea r8, [rel data_start]
+	lea r9, [rel data_end]
+	cmp rbx, r8
+	jl puts_print_int   ; below data_start: not an address inside the data section
+	cmp rbx, r9
+	jge puts_print_int  ; at or past data_end: likewise not a string address
+	; treat as string: (addr below len)
+	mov rdx, rax        ; len
+	mov rsi, rbx        ; addr
+	add r12, 16         ; pop len + addr
+	test rdx, rdx
+	jz puts_str_newline_only  ; empty string: skip straight to the newline
+	mov rax, 1          ; write(fd 1, rsi, rdx)
+	mov rdi, 1
+	syscall
+puts_str_newline_only:
+	mov byte [rel print_buf], 10  ; stage '\n' in print_buf
+	mov rax, 1
+	mov rdi, 1
+	lea rsi, [rel print_buf]
+	mov rdx, 1
+	syscall
+	ret
+
+puts_print_int:             ; ( n -- ): prints n as signed decimal plus '\n'; entered by jump from word_puts
+	mov rax, [r12]
+	add r12, 8
+	mov rbx, rax
+	mov r8, 0           ; r8 = 1 when a leading '-' is needed
+	cmp rbx, 0
+	jge puts_abs
+	neg rbx             ; magnitude; the unsigned div below also copes with the most negative value
+	mov r8, 1
+puts_abs:
+	lea rsi, [rel print_buf_end - 1]  ; keep the last buffer byte for '\n' (it used to land one past print_buf)
+	mov rcx, 0          ; rcx = characters produced so far
+	mov r10, 10
+	cmp rbx, 0
+	jne puts_digits
+	dec rsi
+	mov byte [rsi], '0'
+	inc rcx
+	jmp puts_sign
+puts_digits:
+puts_loop:
+	xor rdx, rdx
+	mov rax, rbx
+	div r10             ; rax = quotient, rdx = next least-significant digit
+	add dl, '0'
+	dec rsi             ; digits are laid down right to left
+	mov [rsi], dl
+	inc rcx
+	mov rbx, rax
+	test rbx, rbx
+	jne puts_loop
+puts_sign:
+	cmp r8, 0
+	je puts_finish_digits
+	dec rsi
+	mov byte [rsi], '-'
+	inc rcx
+puts_finish_digits:
+	mov byte [rsi + rcx], 10  ; rsi + rcx == print_buf_end - 1, the reserved byte
+	inc rcx
+	mov rax, 1          ; write(fd 1, rsi, rcx)
+	mov rdi, 1
+	mov rdx, rcx
+	mov r9, rsi
+	mov rsi, r9
+	syscall
+    ret
+word_dup:                   ; ( x -- x x ): pushes a copy of the top cell
+	mov rax, [r12]
+	sub r12, 8
+	mov [r12], rax
+    ret
+word_drop:                  ; ( x -- ): discards the top cell
+	add r12, 8
+    ret
+word_over:                  ; ( x1 x2 -- x1 x2 x1 ): pushes a copy of the second cell
+	mov rax, [r12 + 8]
+	sub r12, 8
+	mov [r12], rax
+    ret
+word_swap:                  ; ( x1 x2 -- x2 x1 ): exchanges the top two cells
+	mov rax, [r12]
+	mov rbx, [r12 + 8]
+	mov [r12], rbx
+	mov [r12 + 8], rax
+    ret
+word_rot:                   ; ( x1 x2 x3 -- x2 x3 x1 ): the third cell is rotated up to the top
+	mov rax, [r12]       ; x3
+	mov rbx, [r12 + 8]   ; x2
+	mov rcx, [r12 + 16]  ; x1
+	mov [r12], rcx       ; top = x1
+	mov [r12 + 8], rax   ; next = x3
+	mov [r12 + 16], rbx  ; third = x2
+    ret
+word__2drot:                ; ( x1 x2 x3 -- x3 x1 x2 ): the top cell is rotated down to third place
+	mov rax, [r12]       ; x3
+	mov rbx, [r12 + 8]   ; x2
+	mov rcx, [r12 + 16]  ; x1
+	mov [r12], rbx       ; top = x2
+	mov [r12 + 8], rcx   ; next = x1
+	mov [r12 + 16], rax  ; third = x3
+    ret
+word_nip:                   ; ( x1 x2 -- x2 ): removes the cell beneath the top
+	mov rax, [r12]
+	add r12, 8           ; drop lower element
+	mov [r12], rax       ; keep original top
+    ret
+word_tuck:                  ; ( x1 x2 -- x2 x1 x2 ): copies the top cell beneath the second
+	mov rax, [r12]       ; x2
+	mov rbx, [r12 + 8]   ; x1
+	sub r12, 8           ; make room
+	mov [r12], rax       ; x2
+	mov [r12 + 8], rbx   ; x1
+	mov [r12 + 16], rax  ; x2
+    ret
+word_2dup:                  ; ( a b -- a b a b ): duplicates the top pair
+	mov rax, [r12]       ; b
+	mov rbx, [r12 + 8]   ; a
+	sub r12, 8
+	mov [r12], rbx       ; push a
+	sub r12, 8
+	mov [r12], rax       ; push b
+    ret
+word_2drop:                 ; ( a b -- ): discards the top pair
+	add r12, 16          ; two 8-byte cells
+    ret
+word_2swap:                 ; ( a b c d -- c d a b ): exchanges the top two pairs
+	mov rax, [r12]        ; d
+	mov rbx, [r12 + 8]    ; c
+	mov rcx, [r12 + 16]   ; b
+	mov rdx, [r12 + 24]   ; a
+	mov [r12], rcx        ; top = b
+	mov [r12 + 8], rdx    ; next = a
+	mov [r12 + 16], rax   ; third = d
+	mov [r12 + 24], rbx   ; fourth = c
+    ret
+word_2over:                 ; ( a b c d -- a b c d a b ): copies the second pair on top
+	mov rax, [r12 + 16]   ; b
+	mov rbx, [r12 + 24]   ; a
+	sub r12, 8
+	mov [r12], rbx        ; push a
+	sub r12, 8
+	mov [r12], rax        ; push b
+    ret
+word__2b:                   ; ( a b -- a+b ): adds the top cell into the one beneath it
+	mov rax, [r12]
+	add r12, 8
+	add qword [r12], rax
+    ret
+word__2d:                   ; ( a b -- a-b ): subtracts the top cell from the one beneath it
+	mov rax, [r12]
+	add r12, 8
+	sub qword [r12], rax
+    ret
+word__2a:                   ; ( a b -- a*b ): keeps the low 64 bits of the product
+	mov rax, [r12]
+	add r12, 8
+	imul qword [r12]     ; one-operand form: rdx:rax = rax * [r12], so rdx is overwritten
+	mov [r12], rax
+    ret
+word__2f:                   ; ( a b -- q ): signed quotient of a / b; idiv traps when b is 0
+	mov rbx, [r12]       ; b (divisor)
+	add r12, 8
+	mov rax, [r12]       ; a (dividend)
+	cqo                  ; sign-extend rax into rdx:rax for idiv
+	idiv rbx
+	mov [r12], rax
+    ret
+word__25:                   ; ( a b -- r ): signed remainder of a / b; idiv traps when b is 0
+	mov rbx, [r12]       ; b (divisor)
+	add r12, 8
+	mov rax, [r12]       ; a (dividend)
+	cqo                  ; sign-extend rax into rdx:rax for idiv
+	idiv rbx
+	mov [r12], rdx       ; idiv leaves the remainder in rdx
+    ret
+word__3d_3d:                ; ( a b -- flag ): flag = 1 when a == b, otherwise 0
+	mov rax, [r12]       ; b
+	add r12, 8
+	mov rbx, [r12]       ; a
+	cmp rbx, rax
+	sete bl
+	movzx ebx, bl        ; widen the 0/1 byte; the 32-bit write clears the upper half of rbx
+	mov [r12], rbx
+    ret
+word__21_3d:                ; ( a b -- flag ): flag = 1 when a != b, otherwise 0
+	mov rax, [r12]       ; b
+	add r12, 8
+	mov rbx, [r12]       ; a
+	cmp rbx, rax
+	setne bl
+	movzx ebx, bl        ; widen the 0/1 byte; the 32-bit write clears the upper half of rbx
+	mov [r12], rbx
+    ret
+word__3c:                   ; ( a b -- flag ): flag = 1 when a < b (signed), otherwise 0
+	mov rax, [r12]       ; b
+	add r12, 8
+	mov rbx, [r12]       ; a
+	cmp rbx, rax
+	setl bl
+	movzx ebx, bl        ; widen the 0/1 byte; the 32-bit write clears the upper half of rbx
+	mov [r12], rbx
+    ret
+word__3e:                   ; ( a b -- flag ): flag = 1 when a > b (signed), otherwise 0
+	mov rax, [r12]       ; b
+	add r12, 8
+	mov rbx, [r12]       ; a
+	cmp rbx, rax
+	setg bl
+	movzx ebx, bl        ; widen the 0/1 byte; the 32-bit write clears the upper half of rbx
+	mov [r12], rbx
+    ret
+word__3c_3d:                ; ( a b -- flag ): flag = 1 when a <= b (signed), otherwise 0
+	mov rax, [r12]       ; b
+	add r12, 8
+	mov rbx, [r12]       ; a
+	cmp rbx, rax
+	setle bl
+	movzx ebx, bl        ; widen the 0/1 byte; the 32-bit write clears the upper half of rbx
+	mov [r12], rbx
+    ret
+word__3e_3d:                ; ( a b -- flag ): flag = 1 when a >= b (signed), otherwise 0
+	mov rax, [r12]       ; b
+	add r12, 8
+	mov rbx, [r12]       ; a
+	cmp rbx, rax
+	setge bl
+	movzx ebx, bl        ; widen the 0/1 byte; the 32-bit write clears the upper half of rbx
+	mov [r12], rbx
+    ret
+word__40:                   ; ( addr -- x ): replaces an address with the 8-byte cell stored there
+	mov rax, [r12]
+	mov rax, [rax]
+	mov [r12], rax
+    ret
+word__21:                   ; ( x addr -- ): stores x into the 8-byte cell at addr
+	mov rax, [r12]       ; addr
+	add r12, 8
+	mov rbx, [r12]       ; x
+	mov [rax], rbx
+	add r12, 8
+    ret
+word_mmap:                  ; ( addr len prot flags fd off -- result ): raw syscall 9, rax stored unchecked
+	mov r9, [r12]        ; off
+	add r12, 8
+	mov r8, [r12]        ; fd
+	add r12, 8
+	mov r10, [r12]       ; flags (r10 carries the 4th syscall argument)
+	add r12, 8
+	mov rdx, [r12]       ; prot
+	add r12, 8
+	mov rsi, [r12]       ; len
+	add r12, 8
+	mov rdi, [r12]       ; addr
+	mov rax, 9
+	syscall
+	mov [r12], rax       ; result overwrites the addr cell
+    ret
+word_munmap:                ; ( addr len -- result ): raw syscall 11, rax stored unchecked
+	mov rsi, [r12]       ; len
+	add r12, 8
+	mov rdi, [r12]       ; addr
+	mov rax, 11
+	syscall
+	mov [r12], rax       ; result overwrites the addr cell
+    ret
+word_exit:                  ; ( status -- ): issues syscall 60 with status in rdi
+	mov rdi, [r12]
+	add r12, 8
+	mov rax, 60
+	syscall
+    ret                     ; only reached if the syscall returns
+word_and:                   ; ( a b -- flag ): logical AND; flag = 1 only when a and b are both non-zero
+	mov rax, [r12]       ; b (top cell)
+	add r12, 8           ; pop b; the result overwrites a's cell
+	mov rbx, [r12]       ; a
+	test rax, rax
+	setnz cl             ; cl = (b != 0); setz here made the word compute NOR
+	test rbx, rbx
+	setnz dl             ; dl = (a != 0)
+	movzx rcx, cl
+	movzx rdx, dl
+	and rcx, rdx
+	mov [r12], rcx
+    ret
+word_or:                    ; ( a b -- flag ): logical OR; flag = 1 when a or b is non-zero
+	mov rax, [r12]       ; b (top cell)
+	add r12, 8           ; pop b; the result overwrites a's cell
+	mov rbx, [r12]       ; a
+	test rax, rax
+	setnz cl             ; cl = (b != 0); setz here made the word compute NAND
+	test rbx, rbx
+	setnz dl             ; dl = (a != 0)
+	movzx rcx, cl
+	movzx rdx, dl
+	or rcx, rdx
+	mov [r12], rcx
+    ret
+word_not:                   ; ( x -- flag ): flag = 1 when x is zero, otherwise 0
+	mov rax, [r12]
+	test rax, rax
+	setz al              ; al = (x == 0)
+	movzx rax, al        ; widen the 0/1 byte to a full cell
+	mov [r12], rax
+    ret
+word__3er:                  ; ( x -- ) ( R: -- x ): moves the top data cell onto the r13 (return) stack
+	mov rax, [r12]
+	add r12, 8
+	sub r13, 8
+	mov [r13], rax
+    ret
+word_r_3e:                  ; ( -- x ) ( R: x -- ): moves the top r13 (return) stack cell onto the data stack
+	mov rax, [r13]
+	add r13, 8
+	sub r12, 8
+	mov [r12], rax
+    ret
+word_rdrop:                 ; ( R: x -- ): discards the top r13 (return) stack cell
+	add r13, 8
+    ret
+word_pick:                  ; ( ... n -- ... x ): pops n, then pushes a copy of the cell n places below the new top
+	mov rcx, [r12]       ; n
+	add r12, 8
+	mov rax, [r12 + rcx * 8]  ; n = 0 selects the new top; n is not range-checked
+	sub r12, 8
+	mov [r12], rax
+    ret
+word_rpick:                 ; ( n -- x ): pops n, then pushes a copy of cell n of the r13 (return) stack
+	mov rcx, [r12]       ; n
+	add r12, 8
+	mov rax, [r13 + rcx * 8]  ; n = 0 selects the r13 top; n is not range-checked
+	sub r12, 8
+	mov [r12], rax
+    ret
+word_main:                  ; ( -- ): pushes str_0 and its length, then calls word_puts
+    ; push str_0 RIP-relative (an absolute imm32 address is rejected when linking as PIE)
+    sub r12, 8
+    lea rax, [rel str_0]
+    mov [r12], rax
+    sub r12, 8
+    mov qword [r12], str_0_len  ; push the length via its equ instead of a bare 11
+    call word_puts
+    ret
+section .data
+data_start:
+str_0: db 104, 101, 108, 108, 111, 32, 119, 111, 114, 108, 100, 0
+str_0_len equ 11
+data_end:
+section .bss
+align 16
+dstack: resb DSTK_BYTES
+dstack_top:
+align 16
+rstack: resb RSTK_BYTES
+rstack_top:
+align 16
+print_buf: resb PRINT_BUF_BYTES
+print_buf_end:
\ No newline at end of file
diff --git a/build/hello.o b/build/hello.o
new file mode 100644
index 0000000..5ba708d
Binary files /dev/null and b/build/hello.o differ
diff --git a/build/loops_and_cmp.asm b/build/loops_and_cmp.asm
index 02100b5..4127d7b 100644
--- a/build/loops_and_cmp.asm
+++ b/build/loops_and_cmp.asm
@@ -19,7 +19,7 @@ _start:
     mov rax, 60
     syscall
 word_puts:
-	; detects string if top is len>=0 and next is a pointer in [data_start, data_end)
+	; detects string if top is len>=0 and next is a pointer in [data_start, data_end]
 	mov rax, [r12]      ; len or int value
 	mov rbx, [r12 + 8]  ; possible address
 	cmp rax, 0
@@ -114,6 +114,64 @@ word_swap:
 	mov [r12], rbx
 	mov [r12 + 8], rax
     ret
+word_rot:                   ; ( x1 x2 x3 -- x2 x3 x1 ): the third cell is rotated up to the top
+	mov rax, [r12]       ; x3
+	mov rbx, [r12 + 8]   ; x2
+	mov rcx, [r12 + 16]  ; x1
+	mov [r12], rcx       ; top = x1
+	mov [r12 + 8], rax   ; next = x3
+	mov [r12 + 16], rbx  ; third = x2
+    ret
+word__2drot:                ; ( x1 x2 x3 -- x3 x1 x2 ): the top cell is rotated down to third place
+	mov rax, [r12]       ; x3
+	mov rbx, [r12 + 8]   ; x2
+	mov rcx, [r12 + 16]  ; x1
+	mov [r12], rbx       ; top = x2
+	mov [r12 + 8], rcx   ; next = x1
+	mov [r12 + 16], rax  ; third = x3
+    ret
+word_nip:                   ; ( x1 x2 -- x2 ): removes the cell beneath the top
+	mov rax, [r12]
+	add r12, 8           ; drop lower element
+	mov [r12], rax       ; keep original top
+    ret
+word_tuck:                  ; ( x1 x2 -- x2 x1 x2 ): copies the top cell beneath the second
+	mov rax, [r12]       ; x2
+	mov rbx, [r12 + 8]   ; x1
+	sub r12, 8           ; make room
+	mov [r12], rax       ; x2
+	mov [r12 + 8], rbx   ; x1
+	mov [r12 + 16], rax  ; x2
+    ret
+word_2dup:                  ; ( a b -- a b a b ): duplicates the top pair
+	mov rax, [r12]       ; b
+	mov rbx, [r12 + 8]   ; a
+	sub r12, 8
+	mov [r12], rbx       ; push a
+	sub r12, 8
+	mov [r12], rax       ; push b
+    ret
+word_2drop:                 ; ( a b -- ): discards the top pair
+	add r12, 16          ; two 8-byte cells
+    ret
+word_2swap:                 ; ( a b c d -- c d a b ): exchanges the top two pairs
+	mov rax, [r12]        ; d
+	mov rbx, [r12 + 8]    ; c
+	mov rcx, [r12 + 16]   ; b
+	mov rdx, [r12 + 24]   ; a
+	mov [r12], rcx        ; top = b
+	mov [r12 + 8], rdx    ; next = a
+	mov [r12 + 16], rax   ; third = d
+	mov [r12 + 24], rbx   ; fourth = c
+    ret
+word_2over:                 ; ( a b c d -- a b c d a b ): copies the second pair on top
+	mov rax, [r12 + 16]   ; b
+	mov rbx, [r12 + 24]   ; a
+	sub r12, 8
+	mov [r12], rbx        ; push a
+	sub r12, 8
+	mov [r12], rax        ; push b
+    ret
 word__2b:
 	mov rax, [r12]
 	add r12, 8
@@ -242,6 +300,39 @@ word_exit:
 	mov rax, 60
 	syscall
     ret
+word_and:                   ; ( a b -- flag ): logical AND; flag = 1 only when a and b are both non-zero
+	mov rax, [r12]       ; b (top cell)
+	add r12, 8           ; pop b; the result overwrites a's cell
+	mov rbx, [r12]       ; a
+	test rax, rax
+	setnz cl             ; cl = (b != 0); setz here made the word compute NOR
+	test rbx, rbx
+	setnz dl             ; dl = (a != 0)
+	movzx rcx, cl
+	movzx rdx, dl
+	and rcx, rdx
+	mov [r12], rcx
+    ret
+word_or:                    ; ( a b -- flag ): logical OR; flag = 1 when a or b is non-zero
+	mov rax, [r12]       ; b (top cell)
+	add r12, 8           ; pop b; the result overwrites a's cell
+	mov rbx, [r12]       ; a
+	test rax, rax
+	setnz cl             ; cl = (b != 0); setz here made the word compute NAND
+	test rbx, rbx
+	setnz dl             ; dl = (a != 0)
+	movzx rcx, cl
+	movzx rdx, dl
+	or rcx, rdx
+	mov [r12], rcx
+    ret
+word_not:                   ; ( x -- flag ): flag = 1 when x is zero, otherwise 0
+	mov rax, [r12]
+	test rax, rax
+	setz al              ; al = (x == 0)
+	movzx rax, al        ; widen the 0/1 byte to a full cell
+	mov [r12], rax
+    ret
 word__3er:
 	mov rax, [r12]
 	add r12, 8
diff --git a/build/loops_and_cmp.o b/build/loops_and_cmp.o
index 9fbed6f..e0f75f3 100644
Binary files a/build/loops_and_cmp.o and b/build/loops_and_cmp.o differ
diff --git a/build/main.asm b/build/main.asm
index 031a9dd..e644000 100644
--- a/build/main.asm
+++ b/build/main.asm
@@ -19,7 +19,7 @@ _start:
     mov rax, 60
     syscall
 word_puts:
-	; detects string if top is len>=0 and next is a pointer in [data_start, data_end)
+	; detects string if top is len>=0 and next is a pointer in [data_start, data_end]
 	mov rax, [r12]      ; len or int value
 	mov rbx, [r12 + 8]  ; possible address
 	cmp rax, 0
@@ -114,6 +114,64 @@ word_swap:
 	mov [r12], rbx
 	mov [r12 + 8], rax
     ret
+word_rot:                   ; ( x1 x2 x3 -- x2 x3 x1 ): the third cell is rotated up to the top
+	mov rax, [r12]       ; x3
+	mov rbx, [r12 + 8]   ; x2
+	mov rcx, [r12 + 16]  ; x1
+	mov [r12], rcx       ; top = x1
+	mov [r12 + 8], rax   ; next = x3
+	mov [r12 + 16], rbx  ; third = x2
+    ret
+word__2drot:                ; ( x1 x2 x3 -- x3 x1 x2 ): the top cell is rotated down to third place
+	mov rax, [r12]       ; x3
+	mov rbx, [r12 + 8]   ; x2
+	mov rcx, [r12 + 16]  ; x1
+	mov [r12], rbx       ; top = x2
+	mov [r12 + 8], rcx   ; next = x1
+	mov [r12 + 16], rax  ; third = x3
+    ret
+word_nip:                   ; ( x1 x2 -- x2 ): removes the cell beneath the top
+	mov rax, [r12]
+	add r12, 8           ; drop lower element
+	mov [r12], rax       ; keep original top
+    ret
+word_tuck:                  ; ( x1 x2 -- x2 x1 x2 ): copies the top cell beneath the second
+	mov rax, [r12]       ; x2
+	mov rbx, [r12 + 8]   ; x1
+	sub r12, 8           ; make room
+	mov [r12], rax       ; x2
+	mov [r12 + 8], rbx   ; x1
+	mov [r12 + 16], rax  ; x2
+    ret
+word_2dup:                  ; ( a b -- a b a b ): duplicates the top pair
+	mov rax, [r12]       ; b
+	mov rbx, [r12 + 8]   ; a
+	sub r12, 8
+	mov [r12], rbx       ; push a
+	sub r12, 8
+	mov [r12], rax       ; push b
+    ret
+word_2drop:                 ; ( a b -- ): discards the top pair
+	add r12, 16          ; two 8-byte cells
+    ret
+word_2swap:                 ; ( a b c d -- c d a b ): exchanges the top two pairs
+	mov rax, [r12]        ; d
+	mov rbx, [r12 + 8]    ; c
+	mov rcx, [r12 + 16]   ; b
+	mov rdx, [r12 + 24]   ; a
+	mov [r12], rcx        ; top = b
+	mov [r12 + 8], rdx    ; next = a
+	mov [r12 + 16], rax   ; third = d
+	mov [r12 + 24], rbx   ; fourth = c
+    ret
+word_2over:                 ; ( a b c d -- a b c d a b ): copies the second pair on top
+	mov rax, [r12 + 16]   ; b
+	mov rbx, [r12 + 24]   ; a
+	sub r12, 8
+	mov [r12], rbx        ; push a
+	sub r12, 8
+	mov [r12], rax        ; push b
+    ret
 word__2b:
 	mov rax, [r12]
 	add r12, 8
@@ -242,6 +300,39 @@ word_exit:
 	mov rax, 60
 	syscall
     ret
+word_and:                   ; ( a b -- flag ): logical AND; flag = 1 only when a and b are both non-zero
+	mov rax, [r12]       ; b (top cell)
+	add r12, 8           ; pop b; the result overwrites a's cell
+	mov rbx, [r12]       ; a
+	test rax, rax
+	setnz cl             ; cl = (b != 0); setz here made the word compute NOR
+	test rbx, rbx
+	setnz dl             ; dl = (a != 0)
+	movzx rcx, cl
+	movzx rdx, dl
+	and rcx, rdx
+	mov [r12], rcx
+    ret
+word_or:                    ; ( a b -- flag ): logical OR; flag = 1 when a or b is non-zero
+	mov rax, [r12]       ; b (top cell)
+	add r12, 8           ; pop b; the result overwrites a's cell
+	mov rbx, [r12]       ; a
+	test rax, rax
+	setnz cl             ; cl = (b != 0); setz here made the word compute NAND
+	test rbx, rbx
+	setnz dl             ; dl = (a != 0)
+	movzx rcx, cl
+	movzx rdx, dl
+	or rcx, rdx
+	mov [r12], rcx
+    ret
+word_not:                   ; ( x -- flag ): flag = 1 when x is zero, otherwise 0
+	mov rax, [r12]
+	test rax, rax
+	setz al              ; al = (x == 0)
+	movzx rax, al        ; widen the 0/1 byte to a full cell
+	mov [r12], rax
+    ret
 word__3er:
 	mov rax, [r12]
 	add r12, 8
diff --git a/build/main.o b/build/main.o
index b867902..5f778c4 100644
Binary files a/build/main.o and b/build/main.o differ
diff --git a/build/override_dup_compile_time.asm b/build/override_dup_compile_time.asm
index 98ad08c..a8380b5 100644
--- a/build/override_dup_compile_time.asm
+++ b/build/override_dup_compile_time.asm
@@ -19,7 +19,7 @@ _start:
     mov rax, 60
     syscall
 word_puts:
-	; detects string if top is len>=0 and next is a pointer in [data_start, data_end)
+	; detects string if top is len>=0 and next is a pointer in [data_start, data_end]
 	mov rax, [r12]      ; len or int value
 	mov rbx, [r12 + 8]  ; possible address
 	cmp rax, 0
@@ -114,6 +114,64 @@ word_swap:
 	mov [r12], rbx
 	mov [r12 + 8], rax
     ret
+word_rot:                   ; ( x1 x2 x3 -- x2 x3 x1 ): the third cell is rotated up to the top
+	mov rax, [r12]       ; x3
+	mov rbx, [r12 + 8]   ; x2
+	mov rcx, [r12 + 16]  ; x1
+	mov [r12], rcx       ; top = x1
+	mov [r12 + 8], rax   ; next = x3
+	mov [r12 + 16], rbx  ; third = x2
+    ret
+word__2drot:                ; ( x1 x2 x3 -- x3 x1 x2 ): the top cell is rotated down to third place
+	mov rax, [r12]       ; x3
+	mov rbx, [r12 + 8]   ; x2
+	mov rcx, [r12 + 16]  ; x1
+	mov [r12], rbx       ; top = x2
+	mov [r12 + 8], rcx   ; next = x1
+	mov [r12 + 16], rax  ; third = x3
+    ret
+word_nip:                   ; ( x1 x2 -- x2 ): removes the cell beneath the top
+	mov rax, [r12]
+	add r12, 8           ; drop lower element
+	mov [r12], rax       ; keep original top
+    ret
+word_tuck:                  ; ( x1 x2 -- x2 x1 x2 ): copies the top cell beneath the second
+	mov rax, [r12]       ; x2
+	mov rbx, [r12 + 8]   ; x1
+	sub r12, 8           ; make room
+	mov [r12], rax       ; x2
+	mov [r12 + 8], rbx   ; x1
+	mov [r12 + 16], rax  ; x2
+    ret
+word_2dup:                  ; ( a b -- a b a b ): duplicates the top pair
+	mov rax, [r12]       ; b
+	mov rbx, [r12 + 8]   ; a
+	sub r12, 8
+	mov [r12], rbx       ; push a
+	sub r12, 8
+	mov [r12], rax       ; push b
+    ret
+word_2drop:                 ; ( a b -- ): discards the top pair
+	add r12, 16          ; two 8-byte cells
+    ret
+word_2swap:                 ; ( a b c d -- c d a b ): exchanges the top two pairs
+	mov rax, [r12]        ; d
+	mov rbx, [r12 + 8]    ; c
+	mov rcx, [r12 + 16]   ; b
+	mov rdx, [r12 + 24]   ; a
+	mov [r12], rcx        ; top = b
+	mov [r12 + 8], rdx    ; next = a
+	mov [r12 + 16], rax   ; third = d
+	mov [r12 + 24], rbx   ; fourth = c
+    ret
+word_2over:                 ; ( a b c d -- a b c d a b ): copies the second pair on top
+	mov rax, [r12 + 16]   ; b
+	mov rbx, [r12 + 24]   ; a
+	sub r12, 8
+	mov [r12], rbx        ; push a
+	sub r12, 8
+	mov [r12], rax        ; push b
+    ret
 word__2b:
 	mov rax, [r12]
 	add r12, 8
@@ -242,6 +300,39 @@ word_exit:
 	mov rax, 60
 	syscall
     ret
+word_and:                   ; ( a b -- flag ): logical AND; flag = 1 only when a and b are both non-zero
+	mov rax, [r12]       ; b (top cell)
+	add r12, 8           ; pop b; the result overwrites a's cell
+	mov rbx, [r12]       ; a
+	test rax, rax
+	setnz cl             ; cl = (b != 0); setz here made the word compute NOR
+	test rbx, rbx
+	setnz dl             ; dl = (a != 0)
+	movzx rcx, cl
+	movzx rdx, dl
+	and rcx, rdx
+	mov [r12], rcx
+    ret
+word_or:                    ; ( a b -- flag ): logical OR; flag = 1 when a or b is non-zero
+	mov rax, [r12]       ; b (top cell)
+	add r12, 8           ; pop b; the result overwrites a's cell
+	mov rbx, [r12]       ; a
+	test rax, rax
+	setnz cl             ; cl = (b != 0); setz here made the word compute NAND
+	test rbx, rbx
+	setnz dl             ; dl = (a != 0)
+	movzx rcx, cl
+	movzx rdx, dl
+	or rcx, rdx
+	mov [r12], rcx
+    ret
+word_not:                   ; ( x -- flag ): flag = 1 when x is zero, otherwise 0
+	mov rax, [r12]
+	test rax, rax
+	setz al              ; al = (x == 0)
+	movzx rax, al        ; widen the 0/1 byte to a full cell
+	mov [r12], rax
+    ret
 word__3er:
 	mov rax, [r12]
 	add r12, 8
diff --git a/build/override_dup_compile_time.o b/build/override_dup_compile_time.o
index 2b38b61..85ecd1a 100644
Binary files a/build/override_dup_compile_time.o and b/build/override_dup_compile_time.o differ
diff --git a/build/string_puts.asm b/build/string_puts.asm
index e868593..d9d9ed8 100644
--- a/build/string_puts.asm
+++ b/build/string_puts.asm
@@ -19,7 +19,7 @@ _start:
     mov rax, 60
     syscall
 word_puts:
-	; detects string if top is len>=0 and next is a pointer in [data_start, data_end)
+	; detects string if top is len>=0 and next is a pointer in [data_start, data_end]
 	mov rax, [r12]      ; len or int value
 	mov rbx, [r12 + 8]  ; possible address
 	cmp rax, 0
@@ -114,6 +114,64 @@ word_swap:
 	mov [r12], rbx
 	mov [r12 + 8], rax
     ret
+word_rot:                   ; ( x1 x2 x3 -- x2 x3 x1 ): the third cell is rotated up to the top
+	mov rax, [r12]       ; x3
+	mov rbx, [r12 + 8]   ; x2
+	mov rcx, [r12 + 16]  ; x1
+	mov [r12], rcx       ; top = x1
+	mov [r12 + 8], rax   ; next = x3
+	mov [r12 + 16], rbx  ; third = x2
+    ret
+word__2drot:                ; ( x1 x2 x3 -- x3 x1 x2 ): the top cell is rotated down to third place
+	mov rax, [r12]       ; x3
+	mov rbx, [r12 + 8]   ; x2
+	mov rcx, [r12 + 16]  ; x1
+	mov [r12], rbx       ; top = x2
+	mov [r12 + 8], rcx   ; next = x1
+	mov [r12 + 16], rax  ; third = x3
+    ret
+word_nip:                   ; ( x1 x2 -- x2 ): removes the cell beneath the top
+	mov rax, [r12]
+	add r12, 8           ; drop lower element
+	mov [r12], rax       ; keep original top
+    ret
+word_tuck:                  ; ( x1 x2 -- x2 x1 x2 ): copies the top cell beneath the second
+	mov rax, [r12]       ; x2
+	mov rbx, [r12 + 8]   ; x1
+	sub r12, 8           ; make room
+	mov [r12], rax       ; x2
+	mov [r12 + 8], rbx   ; x1
+	mov [r12 + 16], rax  ; x2
+    ret
+word_2dup:                  ; ( a b -- a b a b ): duplicates the top pair
+	mov rax, [r12]       ; b
+	mov rbx, [r12 + 8]   ; a
+	sub r12, 8
+	mov [r12], rbx       ; push a
+	sub r12, 8
+	mov [r12], rax       ; push b
+    ret
+word_2drop:                 ; ( a b -- ): discards the top pair
+	add r12, 16          ; two 8-byte cells
+    ret
+word_2swap:                 ; ( a b c d -- c d a b ): exchanges the top two pairs
+	mov rax, [r12]        ; d
+	mov rbx, [r12 + 8]    ; c
+	mov rcx, [r12 + 16]   ; b
+	mov rdx, [r12 + 24]   ; a
+	mov [r12], rcx        ; top = b
+	mov [r12 + 8], rdx    ; next = a
+	mov [r12 + 16], rax   ; third = d
+	mov [r12 + 24], rbx   ; fourth = c
+    ret
+word_2over:                 ; ( a b c d -- a b c d a b ): copies the second pair on top
+	mov rax, [r12 + 16]   ; b
+	mov rbx, [r12 + 24]   ; a
+	sub r12, 8
+	mov [r12], rbx        ; push a
+	sub r12, 8
+	mov [r12], rax        ; push b
+    ret
 word__2b:
 	mov rax, [r12]
 	add r12, 8
@@ -242,6 +300,39 @@ word_exit:
 	mov rax, 60
 	syscall
     ret
+word_and:                   ; ( a b -- flag ): logical AND; flag = 1 only when a and b are both non-zero
+	mov rax, [r12]       ; b (top cell)
+	add r12, 8           ; pop b; the result overwrites a's cell
+	mov rbx, [r12]       ; a
+	test rax, rax
+	setnz cl             ; cl = (b != 0); setz here made the word compute NOR
+	test rbx, rbx
+	setnz dl             ; dl = (a != 0)
+	movzx rcx, cl
+	movzx rdx, dl
+	and rcx, rdx
+	mov [r12], rcx
+    ret
+word_or:                    ; ( a b -- flag ): logical OR; flag = 1 when a or b is non-zero
+	mov rax, [r12]       ; b (top cell)
+	add r12, 8           ; pop b; the result overwrites a's cell
+	mov rbx, [r12]       ; a
+	test rax, rax
+	setnz cl             ; cl = (b != 0); setz here made the word compute NAND
+	test rbx, rbx
+	setnz dl             ; dl = (a != 0)
+	movzx rcx, cl
+	movzx rdx, dl
+	or rcx, rdx
+	mov [r12], rcx
+    ret
+word_not:                   ; ( x -- flag ): flag = 1 when x is zero, otherwise 0
+	mov rax, [r12]
+	test rax, rax
+	setz al              ; al = (x == 0)
+	movzx rax, al        ; widen the 0/1 byte to a full cell
+	mov [r12], rax
+    ret
 word__3er:
 	mov rax, [r12]
 	add r12, 8
diff --git a/build/string_puts.o b/build/string_puts.o
index a845daf..4e953ac 100644
Binary files a/build/string_puts.o and b/build/string_puts.o differ
diff --git a/build/test.asm b/build/test.asm
index 522ba94..e368921 100644
--- a/build/test.asm
+++ b/build/test.asm
@@ -19,7 +19,7 @@ _start:
     mov rax, 60
     syscall
 word_puts:
-	; detects string if top is len>=0 and next is a pointer in [data_start, data_end)
+	; detects string if top is len>=0 and next is a pointer in [data_start, data_end]
 	mov rax, [r12]      ; len or int value
 	mov rbx, [r12 + 8]  ; possible address
 	cmp rax, 0
@@ -114,6 +114,64 @@ word_swap:
 	mov [r12], rbx
 	mov [r12 + 8], rax
     ret
+word_rot:                   ; ( x1 x2 x3 -- x2 x3 x1 ): the third cell is rotated up to the top
+	mov rax, [r12]       ; x3
+	mov rbx, [r12 + 8]   ; x2
+	mov rcx, [r12 + 16]  ; x1
+	mov [r12], rcx       ; top = x1
+	mov [r12 + 8], rax   ; next = x3
+	mov [r12 + 16], rbx  ; third = x2
+    ret
+word__2drot:                ; ( x1 x2 x3 -- x3 x1 x2 ): the top cell is rotated down to third place
+	mov rax, [r12]       ; x3
+	mov rbx, [r12 + 8]   ; x2
+	mov rcx, [r12 + 16]  ; x1
+	mov [r12], rbx       ; top = x2
+	mov [r12 + 8], rcx   ; next = x1
+	mov [r12 + 16], rax  ; third = x3
+    ret
+word_nip:                   ; ( x1 x2 -- x2 ): removes the cell beneath the top
+	mov rax, [r12]
+	add r12, 8           ; drop lower element
+	mov [r12], rax       ; keep original top
+    ret
+word_tuck:                  ; ( x1 x2 -- x2 x1 x2 ): copies the top cell beneath the second
+	mov rax, [r12]       ; x2
+	mov rbx, [r12 + 8]   ; x1
+	sub r12, 8           ; make room
+	mov [r12], rax       ; x2
+	mov [r12 + 8], rbx   ; x1
+	mov [r12 + 16], rax  ; x2
+    ret
+word_2dup:                  ; ( a b -- a b a b ): duplicates the top pair
+	mov rax, [r12]       ; b
+	mov rbx, [r12 + 8]   ; a
+	sub r12, 8
+	mov [r12], rbx       ; push a
+	sub r12, 8
+	mov [r12], rax       ; push b
+    ret
+word_2drop:                 ; ( a b -- ): discards the top pair
+	add r12, 16          ; two 8-byte cells
+    ret
+word_2swap:                 ; ( a b c d -- c d a b ): exchanges the top two pairs
+	mov rax, [r12]        ; d
+	mov rbx, [r12 + 8]    ; c
+	mov rcx, [r12 + 16]   ; b
+	mov rdx, [r12 + 24]   ; a
+	mov [r12], rcx        ; top = b
+	mov [r12 + 8], rdx    ; next = a
+	mov [r12 + 16], rax   ; third = d
+	mov [r12 + 24], rbx   ; fourth = c
+    ret
+word_2over:                 ; ( a b c d -- a b c d a b ): copies the second pair on top
+	mov rax, [r12 + 16]   ; b
+	mov rbx, [r12 + 24]   ; a
+	sub r12, 8
+	mov [r12], rbx        ; push a
+	sub r12, 8
+	mov [r12], rax        ; push b
+    ret
 word__2b:
 	mov rax, [r12]
 	add r12, 8
@@ -242,6 +300,39 @@ word_exit:
 	mov rax, 60
 	syscall
     ret
+word_and:                   ; ( a b -- flag ): logical AND; flag = 1 only when a and b are both non-zero
+	mov rax, [r12]       ; b (top cell)
+	add r12, 8           ; pop b; the result overwrites a's cell
+	mov rbx, [r12]       ; a
+	test rax, rax
+	setnz cl             ; cl = (b != 0); setz here made the word compute NOR
+	test rbx, rbx
+	setnz dl             ; dl = (a != 0)
+	movzx rcx, cl
+	movzx rdx, dl
+	and rcx, rdx
+	mov [r12], rcx
+    ret
+word_or:                    ; ( a b -- flag ): logical OR; flag = 1 when a or b is non-zero
+	mov rax, [r12]       ; b (top cell)
+	add r12, 8           ; pop b; the result overwrites a's cell
+	mov rbx, [r12]       ; a
+	test rax, rax
+	setnz cl             ; cl = (b != 0); setz here made the word compute NAND
+	test rbx, rbx
+	setnz dl             ; dl = (a != 0)
+	movzx rcx, cl
+	movzx rdx, dl
+	or rcx, rdx
+	mov [r12], rcx
+    ret
+word_not:                   ; ( x -- flag ): flag = 1 when x is zero, otherwise 0
+	mov rax, [r12]
+	test rax, rax
+	setz al              ; al = (x == 0)
+	movzx rax, al        ; widen the 0/1 byte to a full cell
+	mov [r12], rax
+    ret
 word__3er:
 	mov rax, [r12]
 	add r12, 8
diff --git a/build/test.o b/build/test.o
index b4a5541..450ae02 100644
Binary files a/build/test.o and b/build/test.o differ
diff --git a/fn.sl b/fn.sl
index 45b0c4b..6b736c0 100644
--- a/fn.sl
+++ b/fn.sl
@@ -100,27 +100,29 @@ compile-only
 compile-only
 
 : fn-lexemes-from-tokens
-	list-new >r                 # tokens    (r: acc)
-	0                           # tokens idx
+	>r                   # (r: tokens)
+	list-new             # acc
 begin
-	over list-length over swap >= if   # stop when idx >= len
-		drop drop                # drop idx and tokens (flag consumed by if)
-		r> exit                  # return acc
+	0 rpick list-empty? if     # acc   (r: tokens)  stop once tokens is empty
+		rdrop exit             # acc   discard tokens; acc is the result
 	then
-	over over list-get token-lexeme   # tokens idx lex
-	r> swap list-append >r            # tokens idx
-	1 +                               # tokens idx+1
+	0 rpick list-pop-front     # acc tokens' first
+	rdrop                      # acc tokens' first   (r: old tokens entry dropped)
+	swap                       # acc first tokens'
+	>r                         # acc first   (r: tokens')
+	token-lexeme          # acc lex
+	list-append           # acc'
 again
 ;
 compile-only
 
 : fn-validate-body
 	dup list-length 0 == if "empty function body" parse-error then
-	dup >r 0 r> swap list-get "return" string= 0 == if "function body must start with 'return'" parse-error then
+	dup 0 list-get token-lexeme "return" string= 0 == if "function body must start with 'return'" parse-error then   # NOTE(review): the list-last check below has no token-lexeme; confirm it still compares a lexeme
 	dup list-last ";" string= 0 == if "function body must terminate with ';'" parse-error then
-	list-clone                     # work on a copy
-	list-pop drop                  # drop trailing ';'
-	list-pop-front drop            # drop leading 'return'
+	list-clone                     # body body'
+	list-pop drop                  # body expr' (trim trailing ';')
+	list-pop-front drop            # body expr  (trim leading 'return')
 	dup list-length 0 == if "missing return expression" parse-error then
 ;
 compile-only
diff --git a/main.py b/main.py
index dd0d125..5e8da15 100644
--- a/main.py
+++ b/main.py
@@ -11,6 +11,8 @@ This file now contains working scaffolding for:
 from __future__ import annotations
 
 import argparse
+import ctypes
+import mmap
 import subprocess
 import sys
 import textwrap
@@ -18,6 +20,13 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Set, Union, Tuple
 
+try:  # lazy optional import; required for compile-time :asm execution
+	from keystone import Ks, KsError, KS_ARCH_X86, KS_MODE_64
+except Exception:  # pragma: no cover - optional dependency
+	Ks = None
+	KsError = Exception
+	KS_ARCH_X86 = KS_MODE_64 = None
+
 
 class ParseError(Exception):
 	"""Raised when the source stream cannot be parsed."""
@@ -764,11 +773,13 @@ class CompileTimeVM:
 		self.stack: List[Any] = []
 		self.return_stack: List[Any] = []
 		self.loop_stack: List[Dict[str, Any]] = []
+		self._handles = _CTHandleTable()
 
 	def reset(self) -> None:
 		self.stack.clear()
 		self.return_stack.clear()
 		self.loop_stack.clear()
+		self._handles.clear()
 
 	def push(self, value: Any) -> None:
 		self.stack.append(value)
@@ -778,6 +789,22 @@ class CompileTimeVM:
 			raise ParseError("compile-time stack underflow")
 		return self.stack.pop()
 
+	def _resolve_handle(self, value: Any) -> Any:
+		"""Return the object registered in self._handles.objects for an int ``value``; anything unresolved is returned as-is."""
+		if isinstance(value, int):
+			for delta in (0, -1, 1):  # exact key first, then one below / one above. NOTE(review): confirm the +/-1 tolerance is intended
+				candidate = value + delta
+				if candidate in self._handles.objects:
+					obj = self._handles.objects[candidate]
+					self._handles.objects[value] = obj  # also register the key that was actually seen
+					return obj
+			# A raw id() of a stored object may appear on the stack instead of a key; recover it if the object is still held.
+			for obj in self._handles.objects.values():
+				if id(obj) == value:
+					self._handles.objects[value] = obj
+					return obj
+		return value
+
 	def peek(self) -> Any:
 		if not self.stack:
 			raise ParseError("compile-time stack underflow")
@@ -790,19 +817,24 @@ class CompileTimeVM:
 		return value
 
 	def pop_str(self) -> str:
-		value = self.pop()
+		value = self._resolve_handle(self.pop())
 		if not isinstance(value, str):
 			raise ParseError("expected string on compile-time stack")
 		return value
 
 	def pop_list(self) -> List[Any]:
-		value = self.pop()
+		value = self._resolve_handle(self.pop())
 		if not isinstance(value, list):
-			raise ParseError("expected list on compile-time stack")
+			known = value in self._handles.objects if isinstance(value, int) else False
+			handles_size = len(self._handles.objects)
+			handle_keys = list(self._handles.objects.keys())
+			raise ParseError(
+				f"expected list on compile-time stack, got {type(value).__name__} value={value!r} known_handle={known} handles={handles_size}:{handle_keys!r} stack={self.stack!r}"
+			)
 		return value
 
 	def pop_token(self) -> Token:
-		value = self.pop()
+		value = self._resolve_handle(self.pop())
 		if not isinstance(value, Token):
 			raise ParseError("expected token on compile-time stack")
 		return value
@@ -826,9 +858,210 @@ class CompileTimeVM:
 		if definition is None:
 			raise ParseError(f"word '{word.name}' has no compile-time definition")
 		if isinstance(definition, AsmDefinition):
-			raise ParseError(f"word '{word.name}' cannot run at compile time")
+			self._run_asm_definition(word)
+			return
 		self._execute_nodes(definition.body)
 
+	def _run_asm_definition(self, word: Word) -> None:
+		definition = word.definition
+		if Ks is None:
+			raise ParseError("keystone is required for compile-time :asm execution; install keystone-engine")
+		if not isinstance(definition, AsmDefinition):  # pragma: no cover - defensive
+			raise ParseError(f"word '{word.name}' has no asm body")
+		asm_body = definition.body.strip("\n")
+
+		string_mode = word.name == "puts"
+
+		handles = self._handles
+
+		non_int_data = any(not isinstance(v, int) for v in self.stack)
+		non_int_return = any(not isinstance(v, int) for v in self.return_stack)
+
+		# Collect all strings present on data and return stacks so we can point
+		# puts() at a real buffer and pass its range check (data_start..data_end).
+		strings: List[str] = []
+		if string_mode:
+			for v in self.stack + self.return_stack:
+				if isinstance(v, str):
+					strings.append(v)
+		data_blob = b""
+		string_addrs: Dict[str, Tuple[int, int]] = {}
+		if strings:
+			offset = 0
+			parts: List[bytes] = []
+			seen: Dict[str, Tuple[int, int]] = {}
+			for s in strings:
+				if s in seen:
+					string_addrs[s] = seen[s]
+					continue
+				encoded = s.encode("utf-8") + b"\x00"
+				parts.append(encoded)
+				addr = offset
+				length = len(encoded) - 1
+				seen[s] = (addr, length)
+				string_addrs[s] = (addr, length)
+				offset += len(encoded)
+			data_blob = b"".join(parts)
+		string_buffer: Optional[ctypes.Array[Any]] = None
+		data_start = 0
+		data_end = 0
+		if data_blob:
+			string_buffer = ctypes.create_string_buffer(data_blob)
+			data_start = ctypes.addressof(string_buffer)
+			data_end = data_start + len(data_blob)
+			handles.refs.append(string_buffer)
+			for s, (off, _len) in string_addrs.items():
+				handles.objects[data_start + off] = s
+
+		PRINT_BUF_BYTES = 128
+		print_buffer = ctypes.create_string_buffer(PRINT_BUF_BYTES)
+		handles.refs.append(print_buffer)
+		print_buf = ctypes.addressof(print_buffer)
+
+		wrapper_lines = []
+		wrapper_lines.extend([
+			"_ct_entry:",
+			"    push rbx",
+			"    push r12",
+			"    push r13",
+			"    push r14",
+			"    push r15",
+			"    mov r12, rdi",  # data stack pointer
+			"    mov r13, rsi",  # return stack pointer
+			"    mov r14, rdx",  # out ptr for r12
+			"    mov r15, rcx",  # out ptr for r13
+		])
+		if asm_body:
+			patched_body = []
+			for line in asm_body.splitlines():
+				line = line.strip()
+				if line == "ret":
+					line = "jmp _ct_save"
+				if "lea r8, [rel data_start]" in line:
+					line = line.replace("lea r8, [rel data_start]", f"mov r8, {data_start}")
+				if "lea r9, [rel data_end]" in line:
+					line = line.replace("lea r9, [rel data_end]", f"mov r9, {data_end}")
+				if "mov byte [rel print_buf]" in line or "mov byte ptr [rel print_buf]" in line:
+					patched_body.append(f"mov rax, {print_buf}")
+					patched_body.append("mov byte ptr [rax], 10")
+					continue
+				if "lea rsi, [rel print_buf_end]" in line:
+					line = f"mov rsi, {print_buf + PRINT_BUF_BYTES}"
+				if "lea rsi, [rel print_buf]" in line:
+					line = f"mov rsi, {print_buf}"
+				patched_body.append(line)
+			wrapper_lines.extend(patched_body)
+		wrapper_lines.extend([
+			"_ct_save:",
+			"    mov [r14], r12",
+			"    mov [r15], r13",
+			"    pop r15",
+			"    pop r14",
+			"    pop r13",
+			"    pop r12",
+			"    pop rbx",
+			"    ret",
+		])
+		def _normalize_sizes(line: str) -> str:
+			for size in ("qword", "dword", "word", "byte"):
+				line = line.replace(f"{size} [", f"{size} ptr [")
+			return line
+
+		def _strip_comment(line: str) -> str:
+			return line.split(";", 1)[0].rstrip()
+
+		normalized_lines = []
+		for raw in wrapper_lines:
+			stripped = _strip_comment(raw)
+			if not stripped.strip():
+				continue
+			normalized_lines.append(_normalize_sizes(stripped))
+		ks = Ks(KS_ARCH_X86, KS_MODE_64)
+		try:
+			encoding, _ = ks.asm("\n".join(normalized_lines))
+		except KsError as exc:
+			debug_lines = "\n".join(normalized_lines)
+			raise ParseError(
+				f"keystone failed for word '{word.name}': {exc}\n--- asm ---\n{debug_lines}\n--- end asm ---"
+			) from exc
+		if encoding is None:
+			raise ParseError(
+				f"keystone produced no code for word '{word.name}' (lines: {len(wrapper_lines)})"
+			)
+
+		code = bytes(encoding)
+		code_buf = mmap.mmap(-1, len(code), prot=mmap.PROT_READ | mmap.PROT_WRITE | mmap.PROT_EXEC)
+		code_buf.write(code)
+		code_ptr = ctypes.addressof(ctypes.c_char.from_buffer(code_buf))
+		func_type = ctypes.CFUNCTYPE(None, ctypes.c_uint64, ctypes.c_uint64, ctypes.c_uint64, ctypes.c_uint64)
+		func = func_type(code_ptr)
+
+		handles = self._handles
+
+		def _marshal_stack(py_stack: List[Any]) -> Tuple[int, int, int, Any]:
+			"""Copy py_stack into a descending native stack; returns (sp, top, base, buffer)."""
+			# A string takes two cells in string_mode, so size for the worst case and
+			# still leave 16 free cells below for whatever the asm word pushes.
+			capacity = 2 * len(py_stack) + 16
+			buffer = (ctypes.c_int64 * capacity)()
+			base = ctypes.addressof(buffer)
+			top = base + capacity * 8
+			sp = top
+			for value in py_stack:
+				sp -= 8
+				cell = ctypes.c_int64.from_address(sp)
+				if isinstance(value, int):
+					cell.value = value
+				elif string_mode and isinstance(value, str):
+					offset, strlen = string_addrs.get(value, (0, 0))
+					# puts expects (len, addr) with len on top
+					cell.value = data_start + offset if data_start else handles.store(value)
+					sp -= 8
+					ctypes.c_int64.from_address(sp).value = strlen
+				else:
+					cell.value = handles.store(value)  # opaque handle keyed by id()
+			return sp, top, base, buffer
+
+		# r12/r13 must point at the top element (or top of buffer if empty)
+		buffers: List[Any] = []
+		d_sp, d_top, d_base, d_buf = _marshal_stack(self.stack)
+		buffers.append(d_buf)
+		r_sp, r_top, r_base, r_buf = _marshal_stack(self.return_stack)
+		buffers.append(r_buf)
+		out_d = ctypes.c_uint64(0)
+		out_r = ctypes.c_uint64(0)
+		func(d_sp, r_sp, ctypes.addressof(out_d), ctypes.addressof(out_r))
+
+		new_d = out_d.value
+		new_r = out_r.value
+		if not (d_base <= new_d <= d_top):
+			raise ParseError(f"compile-time asm '{word.name}' corrupted data stack pointer")
+		if not (r_base <= new_r <= r_top):
+			raise ParseError(f"compile-time asm '{word.name}' corrupted return stack pointer")
+
+		def _unmarshal_stack(sp: int, top: int, table: _CTHandleTable) -> List[Any]:
+			"""Rebuild the Python stack (bottom first) from the native cells in [sp, top)."""
+			values: List[Any] = []
+			addr = top - 8
+			while addr >= sp:
+				raw = ctypes.c_int64.from_address(addr).value
+				obj = table.objects.get(raw, raw)  # cells that are not handles stay ints
+				values.append(obj)
+				addr -= 8
+				# _marshal_stack writes a string_mode string as its address and then, one
+				# cell nearer the top, its length: fold that length cell into the string
+				# instead of dropping whatever value happened to sit below the address.
+				if not (string_mode and isinstance(obj, str) and addr >= sp):
+					continue
+				if not (data_start <= raw < data_end):
+					continue
+				if ctypes.c_int64.from_address(addr).value == len(obj.encode("utf-8")):
+					addr -= 8
+			return values
+
+		self.stack = _unmarshal_stack(new_d, d_top, handles)
+		self.return_stack = _unmarshal_stack(new_r, r_top, handles)
+
 	def _call_word_by_name(self, name: str) -> None:
 		word = self.dictionary.lookup(name)
 		if word is None:
@@ -1085,6 +1318,27 @@ def _parse_string_literal(token: Token) -> Optional[str]:
 	return "".join(result)
 
 
+class _CTHandleTable:
+	"""Maps integer handles to Python objects and keeps them alive across compile-time asm calls."""
+
+	def __init__(self) -> None:
+		self.objects: Dict[int, Any] = {}  # handle (id() or native buffer address) -> object
+		self.refs: List[Any] = []  # extra strong references (objects, ctypes buffers)
+		self.string_buffers: List[ctypes.Array[Any]] = []
+
+	def clear(self) -> None:
+		for bucket in (self.objects, self.refs, self.string_buffers):
+			bucket.clear()
+
+	def store(self, value: Any) -> int:
+		addr = id(value)
+		if addr not in self.objects or self.objects[addr] is not value:  # register once only
+			self.refs.append(value)
+		self.objects[addr] = value
+		return addr
+
+
+
 class Assembler:
 	def __init__(self, dictionary: Dictionary) -> None:
 		self.dictionary = dictionary
@@ -1298,6 +1552,24 @@ def macro_compile_only(ctx: MacroContext) -> Optional[List[ASTNode]]:
 	return None
 
 
+def macro_compile_time(ctx: MacroContext) -> Optional[List[ASTNode]]:
+	"""Run the next word in the compile-time VM; inside a definition, also emit it."""
+	parser = ctx.parser
+	if parser._eof():
+		raise ParseError("word name missing after 'compile-time'")
+	name = parser.next_token().lexeme
+	word = parser.dictionary.lookup(name)
+	if word is None:
+		raise ParseError(f"unknown word '{name}' for compile-time")
+	if word.compile_only:
+		raise ParseError(f"word '{name}' is compile-time only")
+	parser.compile_time_vm.invoke(word)
+	# Top-level uses only execute now; a runtime WordRef is added only within a Definition.
+	if isinstance(parser.context_stack[-1], Definition):
+		parser.emit_node(WordRef(name=name))
+	return None
+
+
 def macro_begin_text_macro(ctx: MacroContext) -> Optional[List[ASTNode]]:
 	parser = ctx.parser
 	if parser._eof():
@@ -1447,14 +1719,6 @@ def _ensure_lexer(value: Any) -> SplitLexer:
 	return value
 
 
-def _truthy(value: Any) -> bool:
-	if isinstance(value, bool):
-		return value
-	if isinstance(value, int):
-		return value != 0
-	return value is not None
-
-
 def _coerce_str(value: Any) -> str:
 	if isinstance(value, str):
 		return value
@@ -1473,217 +1737,21 @@ def _default_template(template: Optional[Token]) -> Token:
 	return template
 
 
-def _trunc_divmod(a: int, b: int) -> Tuple[int, int]:
-	if b == 0:
-		raise ParseError("division by zero")
-	quot = abs(a) // abs(b)
-	if (a < 0) ^ (b < 0):
-		quot = -quot
-	rem = a - quot * b
-	return quot, rem
-
-
-def _ct_dup(vm: CompileTimeVM) -> None:
-	vm.push(vm.peek())
-
-
-def _ct_drop(vm: CompileTimeVM) -> None:
-	vm.pop()
-
-
-def _ct_swap(vm: CompileTimeVM) -> None:
-	a = vm.pop()
-	b = vm.pop()
-	vm.push(a)
-	vm.push(b)
-
-
-def _ct_over(vm: CompileTimeVM) -> None:
-	if len(vm.stack) < 2:
-		raise ParseError("over requires two stack values")
-	vm.push(vm.stack[-2])
-
-
-def _ct_rot(vm: CompileTimeVM) -> None:
-	if len(vm.stack) < 3:
-		raise ParseError("rot requires three stack values")
-	vm.stack[-3], vm.stack[-2], vm.stack[-1] = vm.stack[-2], vm.stack[-1], vm.stack[-3]
-
-
-def _ct_nip(vm: CompileTimeVM) -> None:
-	if len(vm.stack) < 2:
-		raise ParseError("nip requires two stack values")
-	top = vm.pop()
-	vm.pop()
-	vm.push(top)
-
-
-def _ct_tuck(vm: CompileTimeVM) -> None:
-	if len(vm.stack) < 2:
-		raise ParseError("tuck requires two stack values")
-	first = vm.pop()
-	second = vm.pop()
-	vm.push(first)
-	vm.push(second)
-	vm.push(first)
-
-
-def _ct_2dup(vm: CompileTimeVM) -> None:
-	if len(vm.stack) < 2:
-		raise ParseError("2dup requires two stack values")
-	second = vm.pop()
-	first = vm.pop()
-	vm.push(first)
-	vm.push(second)
-	vm.push(first)
-	vm.push(second)
-
-
-def _ct_2drop(vm: CompileTimeVM) -> None:
-	if len(vm.stack) < 2:
-		raise ParseError("2drop requires two stack values")
-	vm.pop()
-	vm.pop()
-
-
-def _ct_2swap(vm: CompileTimeVM) -> None:
-	if len(vm.stack) < 4:
-		raise ParseError("2swap requires four stack values")
-	a = vm.pop()
-	b = vm.pop()
-	c = vm.pop()
-	d = vm.pop()
-	vm.push(a)
-	vm.push(b)
-	vm.push(c)
-	vm.push(d)
-
-
-def _ct_2over(vm: CompileTimeVM) -> None:
-	if len(vm.stack) < 4:
-		raise ParseError("2over requires four stack values")
-	vm.push(vm.stack[-4])
-	vm.push(vm.stack[-3])
-
-
-def _ct_minus_rot(vm: CompileTimeVM) -> None:
-	if len(vm.stack) < 3:
-		raise ParseError("-rot requires three stack values")
-	vm.stack[-3], vm.stack[-2], vm.stack[-1] = vm.stack[-1], vm.stack[-3], vm.stack[-2]
-
-
-def _ct_binary_int(vm: CompileTimeVM, func: Callable[[int, int], int]) -> None:
-	b = vm.pop_int()
-	a = vm.pop_int()
-	vm.push(func(a, b))
-
-
-def _ct_add(vm: CompileTimeVM) -> None:
-	_ct_binary_int(vm, lambda a, b: a + b)
-
-
-def _ct_sub(vm: CompileTimeVM) -> None:
-	_ct_binary_int(vm, lambda a, b: a - b)
-
-
-def _ct_mul(vm: CompileTimeVM) -> None:
-	_ct_binary_int(vm, lambda a, b: a * b)
-
-
-def _ct_div(vm: CompileTimeVM) -> None:
-	divisor = vm.pop_int()
-	dividend = vm.pop_int()
-	quot, _ = _trunc_divmod(dividend, divisor)
-	vm.push(quot)
-
-
-def _ct_mod(vm: CompileTimeVM) -> None:
-	divisor = vm.pop_int()
-	dividend = vm.pop_int()
-	_, rem = _trunc_divmod(dividend, divisor)
-	vm.push(rem)
-
-
-def _ct_compare(vm: CompileTimeVM, predicate: Callable[[Any, Any], bool]) -> None:
-	b = vm.pop()
-	a = vm.pop()
-	vm.push(1 if predicate(a, b) else 0)
-
-
-def _ct_eq(vm: CompileTimeVM) -> None:
-	_ct_compare(vm, lambda a, b: a == b)
-
-
-def _ct_ne(vm: CompileTimeVM) -> None:
-	_ct_compare(vm, lambda a, b: a != b)
-
-
-def _ct_lt(vm: CompileTimeVM) -> None:
-	_ct_compare(vm, lambda a, b: a < b)
-
-
-def _ct_le(vm: CompileTimeVM) -> None:
-	_ct_compare(vm, lambda a, b: a <= b)
-
-
-def _ct_gt(vm: CompileTimeVM) -> None:
-	_ct_compare(vm, lambda a, b: a > b)
-
-
-def _ct_ge(vm: CompileTimeVM) -> None:
-	_ct_compare(vm, lambda a, b: a >= b)
-
-
-def _ct_and(vm: CompileTimeVM) -> None:
-	b = _truthy(vm.pop())
-	a = _truthy(vm.pop())
-	vm.push(1 if (a and b) else 0)
-
-
-def _ct_or(vm: CompileTimeVM) -> None:
-	b = _truthy(vm.pop())
-	a = _truthy(vm.pop())
-	vm.push(1 if (a or b) else 0)
-
-
-def _ct_not(vm: CompileTimeVM) -> None:
-	vm.push(1 if not _truthy(vm.pop()) else 0)
-
-
-def _ct_to_r(vm: CompileTimeVM) -> None:
-	vm.return_stack.append(vm.pop())
-
-
-def _ct_r_from(vm: CompileTimeVM) -> None:
-	if not vm.return_stack:
-		raise ParseError("return stack underflow")
-	vm.push(vm.return_stack.pop())
-
-
-def _ct_rdrop(vm: CompileTimeVM) -> None:
-	if not vm.return_stack:
-		raise ParseError("return stack underflow")
-	vm.return_stack.pop()
-
-
-def _ct_rpick(vm: CompileTimeVM) -> None:
-	index = vm.pop_int()
-	if index < 0 or index >= len(vm.return_stack):
-		raise ParseError("rpick index out of range")
-	vm.push(vm.return_stack[-1 - index])
-
-
-def _ct_pick(vm: CompileTimeVM) -> None:
-	index = vm.pop_int()
-	if index < 0 or index >= len(vm.stack):
-		raise ParseError("pick index out of range")
-	vm.push(vm.stack[-1 - index])
-
-
 def _ct_nil(vm: CompileTimeVM) -> None:
 	vm.push(None)
 
 
+def _ct_puts(vm: CompileTimeVM) -> None:
+	"""Pop the top compile-time value and print it.
+
+	Raises ParseError unless the popped value is a str or an int.
+	"""
+	value = vm.pop()
+	if not isinstance(value, (str, int)):
+		raise ParseError("puts expects string or integer at compile time")
+	print(value)
+
+
 def _ct_nil_p(vm: CompileTimeVM) -> None:
 	vm.push(1 if vm.pop() is None else 0)
 
@@ -1704,6 +1772,12 @@ def _ct_list_append(vm: CompileTimeVM) -> None:
 	vm.push(lst)
 
 
+def _ct_drop(vm: CompileTimeVM) -> None:
+	"""Discard the top compile-time value; an empty stack is left untouched."""
+	if vm.stack:
+		vm.pop()
+
+
 def _ct_list_pop(vm: CompileTimeVM) -> None:
 	lst = _ensure_list(vm.pop())
 	if not lst:
@@ -1723,7 +1797,7 @@ def _ct_list_pop_front(vm: CompileTimeVM) -> None:
 
 
 def _ct_list_length(vm: CompileTimeVM) -> None:
-	lst = _ensure_list(vm.pop())
+	lst = vm.pop_list()
 	vm.push(len(lst))
 
 
@@ -1955,13 +2029,24 @@ def _ct_int_to_string(vm: CompileTimeVM) -> None:
 
 
 def _ct_identifier_p(vm: CompileTimeVM) -> None:
-	value = vm.pop_str()
+	value = vm._resolve_handle(vm.pop())
+	if isinstance(value, Token):
+		value = value.lexeme
+	if not isinstance(value, str):
+		vm.push(0)
+		return
 	vm.push(1 if _is_identifier(value) else 0)
 
 
 def _ct_token_lexeme(vm: CompileTimeVM) -> None:
-	token = vm.pop_token()
-	vm.push(token.lexeme)
+	value = vm._resolve_handle(vm.pop())
+	if isinstance(value, Token):
+		vm.push(value.lexeme)
+		return
+	if isinstance(value, str):
+		vm.push(value)
+		return
+	raise ParseError("expected token or string on compile-time stack")
 
 
 def _ct_token_from_lexeme(vm: CompileTimeVM) -> None:
@@ -2068,43 +2153,12 @@ def _register_compile_time_primitives(dictionary: Dictionary) -> None:
 		if compile_only:
 			word.compile_only = True
 
-	register("dup", _ct_dup)
-	register("drop", _ct_drop)
-	register("swap", _ct_swap)
-	register("over", _ct_over)
-	register("rot", _ct_rot)
-	register("nip", _ct_nip)
-	register("tuck", _ct_tuck)
-	register("2dup", _ct_2dup)
-	register("2drop", _ct_2drop)
-	register("2swap", _ct_2swap)
-	register("2over", _ct_2over)
-	register("-rot", _ct_minus_rot)
-	register("+", _ct_add)
-	register("-", _ct_sub)
-	register("*", _ct_mul)
-	register("/", _ct_div)
-	register("%", _ct_mod)
-	register("==", _ct_eq)
-	register("!=", _ct_ne)
-	register("<", _ct_lt)
-	register("<=", _ct_le)
-	register(">", _ct_gt)
-	register(">=", _ct_ge)
-	register("and", _ct_and)
-	register("or", _ct_or)
-	register("not", _ct_not)
-	register(">r", _ct_to_r)
-	register("r>", _ct_r_from)
-	register("rdrop", _ct_rdrop)
-	register("rpick", _ct_rpick)
-	register("pick", _ct_pick)
-
 	register("nil", _ct_nil, compile_only=True)
 	register("nil?", _ct_nil_p, compile_only=True)
 	register("list-new", _ct_list_new, compile_only=True)
 	register("list-clone", _ct_list_clone, compile_only=True)
 	register("list-append", _ct_list_append, compile_only=True)
+	register("drop", _ct_drop)
 	register("list-pop", _ct_list_pop, compile_only=True)
 	register("list-pop-front", _ct_list_pop_front, compile_only=True)
 	register("list-length", _ct_list_length, compile_only=True)
@@ -2239,6 +2293,7 @@ def bootstrap_dictionary() -> Dictionary:
 	dictionary = Dictionary()
 	dictionary.register(Word(name="immediate", immediate=True, macro=macro_immediate))
 	dictionary.register(Word(name="compile-only", immediate=True, macro=macro_compile_only))
+	dictionary.register(Word(name="compile-time", immediate=True, macro=macro_compile_time))
 	dictionary.register(Word(name="macro:", immediate=True, macro=macro_begin_text_macro))
 	dictionary.register(Word(name=";macro", immediate=True, macro=macro_end_text_macro))
 	dictionary.register(Word(name="struct:", immediate=True, macro=macro_struct_begin))
diff --git a/stdlib.sl b/stdlib.sl
index 25a96b6..a25fa30 100644
--- a/stdlib.sl
+++ b/stdlib.sl
@@ -1,5 +1,5 @@
 :asm puts {
-	; detects string if top is len>=0 and next is a pointer in [data_start, data_end)
+	; detects string if top is len>=0 and next is a pointer in [data_start, data_end]
 	mov rax, [r12]      ; len or int value
 	mov rbx, [r12 + 8]  ; possible address
 	cmp rax, 0
@@ -104,6 +104,80 @@ puts_finish_digits:
 }
 ;
 
+:asm rot {
+	mov rax, [r12]       ; x3 (top)      rot: x1 x2 x3 -> x2 x3 x1
+	mov rbx, [r12 + 8]   ; x2
+	mov rcx, [r12 + 16]  ; x1 (deepest of the three)
+	mov [r12], rcx       ; top = x1
+	mov [r12 + 8], rax   ; next = x3
+	mov [r12 + 16], rbx  ; third = x2
+}
+;
+
+:asm -rot {
+	mov rax, [r12]       ; x3 (top)      -rot: x1 x2 x3 -> x3 x1 x2
+	mov rbx, [r12 + 8]   ; x2
+	mov rcx, [r12 + 16]  ; x1 (deepest of the three)
+	mov [r12], rbx       ; top = x2
+	mov [r12 + 8], rcx   ; next = x1
+	mov [r12 + 16], rax  ; third = x3
+}
+;
+
+:asm nip {
+	mov rax, [r12]       ; x2 (top)      nip: x1 x2 -> x2
+	add r12, 8           ; shrink the stack by one cell
+	mov [r12], rax       ; overwrite x1 with the original top
+}
+;
+
+:asm tuck {
+	mov rax, [r12]       ; x2 (top)      tuck: x1 x2 -> x2 x1 x2
+	mov rbx, [r12 + 8]   ; x1
+	sub r12, 8           ; make room for one more cell
+	mov [r12], rax       ; top = x2
+	mov [r12 + 8], rbx   ; next = x1
+	mov [r12 + 16], rax  ; third = copy of x2
+}
+;
+
+:asm 2dup {
+	mov rax, [r12]       ; b (top)       2dup: a b -> a b a b
+	mov rbx, [r12 + 8]   ; a
+	sub r12, 8
+	mov [r12], rbx       ; push a
+	sub r12, 8
+	mov [r12], rax       ; push b (new top)
+}
+;
+
+:asm 2drop {
+	add r12, 16          ; pop two 8-byte cells
+}
+;
+
+:asm 2swap {
+	mov rax, [r12]        ; d (top)      2swap: a b c d -> c d a b
+	mov rbx, [r12 + 8]    ; c
+	mov rcx, [r12 + 16]   ; b
+	mov rdx, [r12 + 24]   ; a (deepest of the four)
+	mov [r12], rcx        ; top = b
+	mov [r12 + 8], rdx    ; next = a
+	mov [r12 + 16], rax   ; third = d
+	mov [r12 + 24], rbx   ; fourth = c
+}
+;
+
+:asm 2over {
+	mov rax, [r12 + 16]   ; b            2over: a b c d -> a b c d a b
+	mov rbx, [r12 + 24]   ; a
+	sub r12, 8
+	mov [r12], rbx        ; push a
+	sub r12, 8
+	mov [r12], rax        ; push b (new top)
+}
+;
+
 :asm + {
 	mov rax, [r12]
 	add r12, 8
@@ -264,6 +338,45 @@ puts_finish_digits:
 }
 ;
 
+:asm and {
+	mov rax, [r12]       ; b (top)       and: a b -> flag
+	add r12, 8           ; pop b, the result overwrites a
+	mov rbx, [r12]       ; a
+	test rax, rax
+	setnz cl             ; cl = 1 when b != 0
+	test rbx, rbx
+	setnz dl             ; dl = 1 when a != 0
+	movzx rcx, cl
+	movzx rdx, dl
+	and rcx, rdx         ; 1 only when both inputs are non-zero
+	mov [r12], rcx
+}
+;
+
+:asm or {
+	mov rax, [r12]       ; b (top)       or: a b -> flag
+	add r12, 8           ; pop b, the result overwrites a
+	mov rbx, [r12]       ; a
+	test rax, rax
+	setnz cl             ; cl = 1 when b != 0
+	test rbx, rbx
+	setnz dl             ; dl = 1 when a != 0
+	movzx rcx, cl
+	movzx rdx, dl
+	or rcx, rdx          ; 1 when at least one input is non-zero
+	mov [r12], rcx
+}
+;
+
+:asm not {
+	mov rax, [r12]       ; x (top)       not: x -> flag
+	test rax, rax
+	setz al              ; al = 1 when x == 0
+	movzx rax, al        ; widen the flag to a full cell
+	mov [r12], rax       ; replace x with 0 or 1
+}
+;
+
 :asm >r {
 	mov rax, [r12]
 	add r12, 8
diff --git a/tests/__pycache__/run_tests.cpython-314.pyc b/tests/__pycache__/run_tests.cpython-314.pyc
new file mode 100644
index 0000000..b51fee4
Binary files /dev/null and b/tests/__pycache__/run_tests.cpython-314.pyc differ