From 6ea160b38858235b13b663a157ab01d4c4c33114 Mon Sep 17 00:00:00 2001 From: IgorCielniak Date: Tue, 13 Jan 2026 18:41:05 +0100 Subject: [PATCH] update to the C FFI --- main.py | 311 ++++++++++++++------ tests/__pycache__/run_tests.cpython-314.pyc | Bin 5770 -> 0 bytes 2 files changed, 216 insertions(+), 95 deletions(-) delete mode 100644 tests/__pycache__/run_tests.cpython-314.pyc diff --git a/main.py b/main.py index 8559872..b56e4bc 100644 --- a/main.py +++ b/main.py @@ -556,104 +556,143 @@ class Parser: if self._eof(): raise ParseError(f"extern missing name at {token.line}:{token.column}") - # Heuristic: check if the first token is a likely C type - c_types = {"void", "int", "long", "char", "bool", "size_t", "float", "double"} - - # Peek at tokens to decide mode - t1 = self._consume() - is_c_decl = False - - # If t1 is a type (or type*), and next is name, and next is '(', it's C-style. - # But since we already consumed t1, we have to proceed carefully. - - # Check if t1 looks like a type - base_type = t1.lexeme.rstrip("*") - if base_type in c_types: - # Likely C-style, but let's confirm next token is not a number (which would mean t1 was the name in raw mode) - peek = self.peek_token() - if peek is not None and peek.lexeme != "(" and not peek.lexeme.isdigit(): - # t1=type, peek=name. Confirm peek2='('? - # Actually, if t1 is "int", it's extremely unlikely to be a function name in L2. - is_c_decl = True + first_token = self._consume() + if self._try_parse_c_extern(first_token): + return + self._parse_legacy_extern(first_token) - if is_c_decl: - # C-Style Parsing - ret_type = t1.lexeme - if self._eof(): - raise ParseError("extern missing name after return type") - name_token = self._consume() - name = name_token.lexeme - - # Handle pointers in name token if tokenizer didn't split them (e.g. *name) - while name.startswith("*"): - name = name[1:] - - if self._eof() or self._consume().lexeme != "(": - raise ParseError(f"expected '(' after extern function name '{name}'") - - inputs = 0 - arg_types: List[str] = [] - while True: - if self._eof(): - raise ParseError("extern unclosed '('") - peek = self.peek_token() - if peek.lexeme == ")": - self._consume() - break - - # Parse argument type - arg_type_tok = self._consume() - arg_type = arg_type_tok.lexeme - - # Handle "type *" sequence - peek_ptr = self.peek_token() - while peek_ptr and peek_ptr.lexeme == "*": - self._consume() - arg_type += "*" - peek_ptr = self.peek_token() + def _parse_legacy_extern(self, name_token: Token) -> None: + name = name_token.lexeme + word = self.dictionary.lookup(name) + if word is None: + word = Word(name=name) + self.dictionary.register(word) + word.is_extern = True - if arg_type != "void": - inputs += 1 - arg_types.append(arg_type) - # Optional argument name - peek_name = self.peek_token() - if peek_name and peek_name.lexeme not in (",", ")"): - self._consume() # Consume arg name - - peek_sep = self.peek_token() - if peek_sep and peek_sep.lexeme == ",": - self._consume() - - outputs = 0 if ret_type == "void" else 1 - - word = self.dictionary.lookup(name) - if word is None: - word = Word(name=name) - self.dictionary.register(word) - word.is_extern = True - word.extern_inputs = inputs - word.extern_outputs = outputs - word.extern_signature = (arg_types, ret_type) - - else: - # Raw/Legacy Parsing - name = t1.lexeme - word = self.dictionary.lookup(name) - if word is None: - word = Word(name=name) - self.dictionary.register(word) - word.is_extern = True - - # Check for optional inputs/outputs + peek = self.peek_token() + if peek is not None and peek.lexeme.isdigit(): + word.extern_inputs = int(self._consume().lexeme) peek = self.peek_token() if peek is not None and peek.lexeme.isdigit(): - word.extern_inputs = int(self._consume().lexeme) - peek = self.peek_token() - if peek is not None and peek.lexeme.isdigit(): - word.extern_outputs = int(self._consume().lexeme) + word.extern_outputs = int(self._consume().lexeme) else: - word.extern_inputs = 0 word.extern_outputs = 0 + else: + word.extern_inputs = 0 + word.extern_outputs = 0 + + def _try_parse_c_extern(self, first_token: Token) -> bool: + saved_pos = self.pos + prefix_tokens: List[str] = [first_token.lexeme] + + while True: + if self._eof(): + self.pos = saved_pos + return False + lookahead = self._consume() + if lookahead.lexeme == "(": + break + if lookahead.lexeme.isdigit(): + self.pos = saved_pos + return False + prefix_tokens.append(lookahead.lexeme) + + if not prefix_tokens: + raise ParseError("extern missing return type/name before '('") + + name_lexeme = prefix_tokens.pop() + if not _is_identifier(name_lexeme): + prefix_name, suffix_name = _split_trailing_identifier(name_lexeme) + if suffix_name is None: + raise ParseError(f"extern expected identifier before '(' but got '{name_lexeme}'") + name_lexeme = suffix_name + if prefix_name: + prefix_tokens.append(prefix_name) + + if not _is_identifier(name_lexeme): + raise ParseError(f"extern expected identifier before '(' but got '{name_lexeme}'") + + ret_type = _normalize_c_type_tokens(prefix_tokens, allow_default=True) + inputs, arg_types = self._parse_c_param_list() + outputs = 0 if ret_type == "void" else 1 + self._register_c_extern(name_lexeme, inputs, outputs, arg_types, ret_type) + return True + + def _parse_c_param_list(self) -> Tuple[int, List[str]]: + inputs = 0 + arg_types: List[str] = [] + + if self._eof(): + raise ParseError("extern unclosed '('") + peek = self.peek_token() + if peek.lexeme == ")": + self._consume() + return inputs, arg_types + + while True: + lexemes = self._collect_c_param_lexemes() + arg_type = _normalize_c_type_tokens(lexemes, allow_default=False) + if arg_type == "void" and inputs == 0: + if self._eof(): + raise ParseError("extern unclosed '(' after 'void'") + closing = self._consume() + if closing.lexeme != ")": + raise ParseError("expected ')' after 'void' in extern parameter list") + return 0, [] + inputs += 1 + arg_types.append(arg_type) + if self._eof(): + raise ParseError("extern unclosed '('") + separator = self._consume() + if separator.lexeme == ")": + break + if separator.lexeme != ",": + raise ParseError( + f"expected ',' or ')' in extern parameter list, got '{separator.lexeme}'" + ) + return inputs, arg_types + + def _collect_c_param_lexemes(self) -> List[str]: + lexemes: List[str] = [] + while True: + if self._eof(): + raise ParseError("extern unclosed '('") + peek = self.peek_token() + if peek.lexeme in (",", ")"): + break + lexemes.append(self._consume().lexeme) + + if not lexemes: + raise ParseError("missing parameter type in extern declaration") + + if len(lexemes) > 1 and _is_identifier(lexemes[-1]): + lexemes.pop() + return lexemes + + prefix, suffix = _split_trailing_identifier(lexemes[-1]) + if suffix is not None: + if prefix: + lexemes[-1] = prefix + else: + lexemes.pop() + return lexemes + + def _register_c_extern( + self, + name: str, + inputs: int, + outputs: int, + arg_types: List[str], + ret_type: str, + ) -> None: + word = self.dictionary.lookup(name) + if word is None: + word = Word(name=name) + self.dictionary.register(word) + word.is_extern = True + word.extern_inputs = inputs + word.extern_outputs = outputs + word.extern_signature = (arg_types, ret_type) def _handle_token(self, token: Token) -> None: if self._try_literal(token): @@ -1576,6 +1615,70 @@ def _is_identifier(text: str) -> bool: return all(ch.isalnum() or ch == "_" for ch in text) +_C_TYPE_IGNORED_QUALIFIERS = { + "const", + "volatile", + "register", + "restrict", + "static", + "extern", + "_Atomic", +} + + +def _split_trailing_identifier(text: str) -> Tuple[str, Optional[str]]: + if not text: + return text, None + idx = len(text) + while idx > 0 and (text[idx - 1].isalnum() or text[idx - 1] == "_"): + idx -= 1 + if idx == 0 or idx == len(text): + return text, None + prefix = text[:idx] + suffix = text[idx:] + if any(not ch.isalnum() and ch != "_" for ch in prefix): + return prefix, suffix + return text, None + + +def _normalize_c_type_tokens(tokens: Sequence[str], *, allow_default: bool) -> str: + pointer_count = 0 + parts: List[str] = [] + for raw in tokens: + text = raw.strip() + if not text: + continue + if set(text) == {"*"}: + pointer_count += len(text) + continue + while text.startswith("*"): + pointer_count += 1 + text = text[1:] + while text.endswith("*"): + pointer_count += 1 + text = text[:-1] + if not text: + continue + if text in _C_TYPE_IGNORED_QUALIFIERS: + continue + parts.append(text) + if not parts: + if allow_default: + base = "int" + else: + raise ParseError("expected C type before parameter name") + else: + base = " ".join(parts) + return base + ("*" * pointer_count) + + +def _ctype_uses_sse(type_name: Optional[str]) -> bool: + if type_name is None: + return False + base = type_name.rstrip("*") + return base in {"float", "double"} + + def _parse_string_literal(token: Token) -> Optional[str]: text = token.lexeme if len(text) < 2 or text[0] != '"' or text[-1] != '"': @@ -1998,7 +2101,7 @@ class Assembler: outputs = getattr(word, "extern_outputs", 0) signature = getattr(word, "extern_signature", None) - if inputs > 0 or outputs > 0: + if signature is not None or inputs > 0 or outputs > 0: regs = ["rdi", "rsi", "rdx", "rcx", "r8", "r9"] xmm_regs = [f"xmm{i}" for i in range(8)] @@ -2048,7 +2151,7 @@ class Assembler: builder.emit(" leave") # Handle Return Value - if ret_type in ("float", "double"): + if _ctype_uses_sse(ret_type): # Result in xmm0, move to stack builder.emit(" sub r12, 8") builder.emit(" movq rax, xmm0") @@ -3320,7 +3423,25 @@ def run_linker(obj_path: Path, exe_path: Path, debug: bool = False, libs=None): "-dynamic-linker", "/lib64/ld-linux-x86-64.so.2", ]) for lib in libs: - cmd.append(f"-l:{lib}" if ".so" in lib else f"-l{lib}") + if not lib: + continue + lib = str(lib) + if lib.startswith(("-L", "-l", "-Wl,")): + cmd.append(lib) + continue + if lib.startswith(":"): + cmd.append(f"-l{lib}") + continue + if os.path.isabs(lib) or lib.startswith("./") or lib.startswith("../"): + cmd.append(lib) + continue + if os.path.sep in lib or lib.endswith(".a"): + cmd.append(lib) + continue + if ".so" in lib: + cmd.append(f"-l:{lib}") + continue + cmd.append(f"-l{lib}") if debug: cmd.append("-g") diff --git a/tests/__pycache__/run_tests.cpython-314.pyc b/tests/__pycache__/run_tests.cpython-314.pyc deleted file mode 100644 index b51fee4164b2e03b65a19f51a06a7fd31590b66b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5770 zcmcH-TWlN0agTR=9Zyf|MTr#8GHaW$Dao`fTXvk-aU4joVk>7sNo!Gu6M3?(A@A6` zr7baNARtAfplG8sA90ETanS;GfqVpLQKVlD(4PX$2USaXv1f5;- zL`s%W#BGn@?d;6#-0bYk?Cc(?uj3Is?_GFYo2o>A-PM_V^`)7FDM(6S@mVITHU9&W@x%wwLy?vcQ-fQ1N3^7~P; zZUE$yXDnQ9Yu^Kv&RBOXwhT`(M?0%n+?Z^XJVygHm?kRUJr=Fv9LV`5-k0>XqWwtn zwj;?m;40Idui}s1Qky9@slm<@- zp=8H?+cV`AY?SG8;|ql&7Yj$nZ^w*ty-d@1D$lQn@U!+jJ%mr0huNQbuOo-u$mSvoHp{fR~(;JD}s4VzhS%t z<1wJ74Fg-)RB~d<$mX;ReAmz^WhOIEg{gCDDi3;5Bz!ELPl4EJO;^nMUOvXy0Y%ph ztYFR1t;^8&rxmQEGKytEhZ|GyENMqI3m3duMbmq8^C`z?NLyK0=Kdgb7Els`0RqA` z)rMIJ4JOG+E^2H7v0D&FAqWo1a{$G>wr?CvcLD5UvoIf<)0M1hd&)+!8ys`UWpn%n zU5?iwXjjY=`Z9`s-RjFE`iKWu zec%Ezg}_**7o^Zv8vs@_yk0|}^L4L|tO*UTURrDIC~+O;K8ML}PAQp;Y|ZOfnUiyh zsp?k2$AhaGCKd~Rus}we?zJ)n_l!=Od7eKll7-<#coO13zj%a#xjeQgpw409qh*m4 zGh-Nsj~x{gvF(ig{288~(ZO|%!DbGZ&s1^V)I~)+B2J&-zt0!k0}1{( zA7f3D7)_ElO_FTvx{P7uEHHFhPGxfid7Fm)RMrU+9~2^5AFHUWgR0wAIw77A&z=>x z=5j(D*w!a17d$8UV|+hv2aOx5X+m(x>3mMEM1+jBtXlY*=m1zMeGmp!sBuHjPz$PA z4aY0v(`wn6Jusk0%v)+alZdCvn!%Kb#*Qek&lBLV;1Cq8YZ=;Nh)_K~V;b3b1|l%4 zQfvl-D6XqFX~(*$l(nH)C6|LWq6AIbohwWio0a z*C1~P%8JT_QW)K~C_S@kCS!;<4KtJOsam^-#LcluoivGBd<{&+2W!UjbP#M=tI1ns;hg*z)5EsPzX$*vC4*7;iZj$5-w+Z zji5=+p^d@#olkYMIm5_I-j!$Nt?ehrOjb$|geDkA;$GjBn1ri#GtRlb(P-C>(d>k4 ztfrK3^oFk!gYS688g?L{>ujPDd$fiqagzIF#b#ieCWkah5+!|{dDhU#YVJXs*s`?b zsg9Un9Vxx;95+caF@4559H`pvM-!anmAsQ%rIh45Sy{83d%`^ys`5I}6gma2lzxpk z?BrHqHYPgsYCCqI3OkVGceDcweB;reQPh!CZU0M&VpWm@)kr+#;C8o$!#T_RCrj_T zLd>4l_e^;H57y@$eILpCp8vV^J-hi$jgD38?`jQ)bCv;X?cx~_v4@@YtG3jC-+4s; z4eXhN*u+-kJ&JWDVRvn7Gl( zFU!>K2vYq4B%&k85q}->VF+oi2nvHV@*92Z3>!uo6KgDZu0eH~6bmdV{W#`l~6mUw*R5lIvEuXw( zQ?>FLkVcCCGHiK4%t2Bviek9}@ev{|T+RZCRv+hKu@}%f=U;qgY2fXlH;0yv6#4EI zF0$@#D*2;-^+!wogDd`ncMh!YYrn<5HFP`r>)0=1x4Z6KC`S5M_w_IOb`qX0MvkrS zJGSUs^EH-yUCX|%TTj1x>77fZP<%NQzZ>d%Ke`$k{II>~>ss-ReA(Xl^TD@X{?&!| ze7}6|cgNqK`|a5eqaU$<9{x*Rv29{8upT_{Zuif+*M#=9uE<(b>w06$rr+JkFV=4e zNa!pDqRWBk9cCpETNfJMc=q*YOG0Q_2o=L8R)oQAs9RH|aAG-}SP_oziEj=OuEFK- z;N9kvMd9R^t&z8Y*SOU2qvYb1S{>T>MV{!;RtkvA0rB?vmB6>Q2`+|Dt_Y{L zp>BPr6z*LP_pS(id*VR{;3AsdZ9ZNUj(;9#Ed@f$fzWNPBn~W#11o`nZG(%E(;p1{ zap(_29~}8mDu#zwgpqCZTgf})rTF>f`1#fF`4!=*?Zg-F@TJ2;%ZG2>)0 z^KZqALS(~_gcA?WBL2|kFoWFu;>D%t+p#xeOI<}i{4v-4V1xl0xW~UNQiJ{eQeZUd z{wNw49q@0fuSt1D3s&ORd)3!{4Aj=F*##^nxpp#1AyO%(%?Q9Xl{BrL>FYKjki#n} zpjk$|M_!9*+LSA2plv-QxTzwe06^uuMQB9N)FYHblRzy$6R4WkFEt|MN8M=B?NFr} zszelb6`HTXxtZRxSYi%tFVuAMGR!9kLrfBECdsZbhK4qsN+l#Ya*8s~0r58ct+I*! z*4Vnpt@|36T7FW0k3~&wUvhqk%8$7SofNryAj4!i)@gUiaMaY(8o5DHGR_HuT&KYG zf>)N)MhXD9qnOFlJS>}JJ1`H*Rh%ng-r!&L#c%zh#=PPiZ=le-^f8m@)NhE~3bpavmjcG)WD2m}|M#57Mq zLrrt8lC2&uU#mP}lJz2+nx(+|pBpU0Fq<65a9;_CIq(H)`V{e>qK41V{?AayXQ=(} z-oWDUtFM&2;bm|5t~XNh9$NMuy2Gq`A6wwo0u4*C)j-#R?-PIBV(Jb3b^WeCyx`gJ zAXngli)Dfv;6%auEOa+W_g+16*R9p<-*7Q4kkW+3msfqAMQ_&zVnVck^T+i?Z~F$S z?_f6Ejg8z!4+{2d9%^xW?mb3_h8`X&gqRJs4tPT-7~S081iVqod-!4AF2WlE-qR@9 zz4@dFyiY+qf=-Ux0*x)$6EuiQs3FpTRf?1J;