| 1 | //! Machine IR — low-level representation between SSA IR and ARM64 assembly. |
| 2 | //! |
//! Uses virtual registers (VReg) that the register allocator later maps
//! to physical registers. Until allocation has run, every vreg is
//! treated as spilled.
| 5 | |
/// Virtual register identifier.
///
/// A dense index newtype: ids are handed out from 0 by
/// `MachineFunction::new_vreg`, so a `VRegId` can double as an index
/// into `MachineFunction::vregs`. `Ord`/`Hash` are derived so ids can
/// key maps and be sorted for deterministic iteration.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct VRegId(pub u32);
| 9 | |
/// Virtual register with type class.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct VReg {
    /// Dense identifier, unique within one `MachineFunction`.
    pub id: VRegId,
    /// Register class constraining which physical registers may hold it.
    pub class: RegClass,
}
| 16 | |
/// Register class — determines which physical registers can hold this value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RegClass {
    /// General purpose (X0-X28, excluding x18/x29/x30).
    /// x29/x30 are FP/LR (see `PhysReg::FP` / `PhysReg::LR`); x18 is
    /// excluded as platform-reserved.
    Gp64,
    /// 32-bit general purpose (W0-W28).
    Gp32,
    /// FP/SIMD double (D0-D31).
    Fp64,
    /// FP/SIMD single (S0-S31).
    Fp32,
    /// 128-bit NEON vector (Q0-Q31). Covers 4×f32, 2×f64, 4×i32,
    /// 2×i64, etc. — every shape in `IrType::Vector`. Codegen
    /// shares the same physical bank as Fp32/Fp64 (the V registers
    /// are the 128-bit form of D/S), so the regalloc assigns them
    /// from the same pool but at 128-bit width.
    V128,
}
| 35 | |
/// ARM64 opcodes that we emit.
///
/// One variant per A64 instruction (or a short fixed sequence such as
/// ADRP+LDR). Operand layout is described per variant; the operands
/// themselves are carried in `MachineInst::operands`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ArmOpcode {
    // ---- Integer arithmetic ----
    AddReg,  // ADD Xd, Xn, Xm
    AddsReg, // ADDS Xd, Xn, Xm (sets flags)
    AdcReg,  // ADC Xd, Xn, Xm (add with carry)
    AddImm,  // ADD Xd, Xn, #imm
    SubReg,  // SUB Xd, Xn, Xm
    SubsReg, // SUBS Xd, Xn, Xm (sets flags)
    SbcReg,  // SBC Xd, Xn, Xm (subtract with borrow)
    SubImm,  // SUB Xd, Xn, #imm
    Mul,     // MUL Xd, Xn, Xm
    Sdiv,    // SDIV Xd, Xn, Xm (signed divide)
    Madd,    // MADD Xd, Xn, Xm, Xa (Xa + Xn*Xm; produced by mul-add peephole)
    Msub,    // MSUB Xd, Xn, Xm, Xa (Xa - Xn*Xm; for imod and the mul-sub peephole)
    Neg,     // NEG Xd, Xm (alias: SUB Xd, XZR, Xm)

    // ---- Logic ----
    AndReg, // AND Xd, Xn, Xm
    OrrReg, // ORR Xd, Xn, Xm
    EorReg, // EOR Xd, Xn, Xm (exclusive or)
    OrnReg, // ORN Xd, Xn, Xm (or-not; MVN is ORN Xd, XZR, Xm)

    // ---- Shifts ----
    LslReg, // LSL Xd, Xn, Xm (logical shift left)
    LsrReg, // LSR Xd, Xn, Xm (logical shift right)
    AsrReg, // ASR Xd, Xn, Xm (arithmetic shift right)

    // ---- Bit manipulation ----
    Mvn,  // MVN Xd, Xm (bitwise NOT, alias: ORN Xd, XZR, Xm)
    Clz,  // CLZ Xd, Xn (count leading zeros)
    Rbit, // RBIT Xd, Xn (reverse bits)

    // ---- Comparison & select ----
    CmpReg,  // CMP Xn, Xm (alias: SUBS XZR, Xn, Xm)
    CmpImm,  // CMP Xn, #imm
    Cset,    // CSET Xd, cond
    CselReg, // CSEL Xd, Xn, Xm, cond
    FCmpReg, // FCMP Dn, Dm
    // CSET Xd, cond (after FCMP).
    // NOTE(review): encodes like Cset; presumably kept distinct so emit
    // can tell integer- from float-flag consumers — confirm in emitter.
    FCset,
    FcselReg, // FCSEL Dd, Dn, Dm, cond

    // ---- Float arithmetic ----
    // S suffix = single precision (Sn regs), D suffix = double (Dn regs).
    FaddS,
    FaddD,
    FsubS,
    FsubD,
    FmulS,
    FmulD,
    FdivS,
    FdivD,
    FnegS,
    FnegD,
    FabsS,
    FabsD,
    FsqrtS,
    FsqrtD,
    // Fused multiply-add/subtract (FMADD/FMSUB/FNMSUB).
    // 3-source: dest = Sa ± Sn*Sm.
    // NOTE(review): the FmsubS/FmsubD/FnmsubS/FnmsubD variants of this
    // family live further down (after Mov16B); variant order is kept
    // as-is here rather than regrouping them.
    FmaddS,
    FmaddD, // FMADD: dest = Sa + Sn*Sm

    // ---- NEON SIMD vector arithmetic (Sprint 12 Stage 2) ----
    //
    // Each opcode encodes the lane shape so emit/encoding stays
    // table-driven. Naming convention: `<Op><LaneCount><LaneType>`.
    // Examples: `FaddV4S` is "fadd Vd.4s, Vn.4s, Vm.4s", `FmlaV2D`
    // is "fmla Vd.2d, Vn.2d, Vm.2d".
    //
    // Operands across this family: dest VReg of class V128 plus the
    // expected source operands for that op. Lane shape is implicit
    // in the opcode; emit just dispatches.
    AddV4S, // ADD Vd.4s, Vn.4s, Vm.4s (integer)
    AddV2D, // ADD Vd.2d, Vn.2d, Vm.2d
    SubV4S,
    SubV2D,
    MulV4S, // MUL Vd.4s, Vn.4s, Vm.4s (integer; 2D not in NEON)
    NegV4S,
    NegV2D,
    FaddV4S, // FADD Vd.4s, Vn.4s, Vm.4s
    FaddV2D, // FADD Vd.2d, Vn.2d, Vm.2d
    FsubV4S,
    FsubV2D,
    FmulV4S,
    FmulV2D,
    FdivV4S,
    FdivV2D,
    FnegV4S,
    FnegV2D,
    FabsV4S,
    FabsV2D,
    FsqrtV4S,
    FsqrtV2D,
    FmlaV4S, // FMLA Vd.4s, Vn.4s, Vm.4s (Vd += Vn*Vm)
    FmlaV2D,
    /// BSL Vd.16B, Vn.16B, Vm.16B — bit select. Per-bit:
    /// `Vd[i] = Vd[i] ? Vn[i] : Vm[i]`. Vd is destructive
    /// (input mask + output). Used to lower VSelect.
    BslV16B,
    /// Vector compare (per-lane all-ones / all-zeros result).
    FcmgtV4S,
    FcmgtV2D,
    FcmgeV4S,
    FcmgeV2D,
    FcmeqV4S,
    FcmeqV2D,
    CmgtV4S, // integer per-lane compares (signed >, >=, ==)
    CmgeV4S,
    CmeqV4S,
    FminV4S, // per-lane float min/max
    FminV2D,
    FmaxV4S,
    FmaxV2D,
    SminV4S, // SMIN (signed integer)
    SmaxV4S,
    UminV4S, // UMIN/UMAX (unsigned integer)
    UmaxV4S,

    // Cross-lane reductions
    FaddpV2S, // FADDP Sd, Vn.2s (pair-add → scalar; 2-lane f32)
    /// `FADDP.4S Vd, Vn, Vm` — 3-operand pairwise add over four
    /// f32 lanes. For cross-lane f32 sum reduction we use this with
    /// `Vn = Vm = v_src` then follow with FaddpV2S to fold the
    /// remaining two lanes (NEON has no `faddv.4s`).
    FaddpV4S,
    FaddpV2D, // FADDP Dd, Vn.2d (pair-add → scalar; 2-lane f64)
    // NOTE(review): `Faddv4S`/`Sminv4S`/`Smaxv4S`/`Uminv4S`/`Umaxv4S`
    // naming is inconsistent with `FmaxvV4S`/`FminvV4S` below.
    Faddv4S, // FADDV Sd, Vn.4s (across 4 f32 lanes → scalar)
    Sminv4S, // SMINV Sd, Vn.4s
    Smaxv4S,
    /// `FMAXV.4S Sd, Vn` — across-lane f32 max reduction → scalar.
    FmaxvV4S,
    /// `FMINV.4S Sd, Vn` — across-lane f32 min reduction → scalar.
    FminvV4S,
    /// `FMAXP.2D Dd, Vn` — pairwise f64 max reduction (2 lanes → scalar).
    /// NEON has no `fmaxv.2d`; for two f64 lanes the pairwise form is
    /// the across-lane reduction.
    FmaxpV2DScalar,
    /// `FMINP.2D Dd, Vn` — pairwise f64 min reduction (2 lanes → scalar).
    FminpV2DScalar,
    /// `ADDP.2D Vd, Vn, Vm` — pairwise integer add over two i64 lanes.
    /// Used for i64 cross-lane reduction: `addp.2d v_dst, v_src, v_src`
    /// puts the sum of the two lanes in v_dst[0].
    AddpV2D,
    Uminv4S, // UMINV/UMAXV Sd, Vn.4s (unsigned reductions)
    Umaxv4S,
    Addv4S, // integer cross-lane add over 4×i32

    // Lane move / broadcast
    DupGen4S, // DUP Vd.4s, Wn (broadcast scalar to 4 lanes)
    DupGen2D, // DUP Vd.2d, Xn
    DupEl4S,  // DUP Vd.4s, Vn.s[0] (broadcast lane 0 to 4 lanes)
    DupEl2D,
    Ins4S, // INS Vd.s[lane], Wn (insert scalar into one lane)
    Ins2D,
    Umov4S, // UMOV Wd, Vn.s[lane] (extract lane to scalar)
    Umov2D,
    FmovEl4S, // FMOV Sd, Vn.s[lane] (extract f32 lane)
    FmovEl2D,

    // Vector load/store (128-bit Q register)
    LdrQ, // LDR Qt, [Xn, #imm]
    StrQ, // STR Qt, [Xn, #imm]
    /// `mov.16b vN, vM` — 128-bit register-to-register copy.
    /// Used by regalloc when moving a V128 vreg between physical
    /// regs; FmovReg only handles the low 64 bits and would corrupt
    /// the upper lanes of a V128.
    Mov16B,
    // Remaining scalar fused multiply variants (see FmaddS/FmaddD above).
    FmsubS,
    FmsubD, // FMSUB: dest = Sa - Sn*Sm
    FnmsubS,
    FnmsubD, // FNMSUB: dest = Sn*Sm - Sa

    // ---- Conversions ----
    // Suffix pattern: <dst reg kind><src reg kind>,
    // S/D = float single/double, W/X = 32/64-bit integer.
    ScvtfSW,
    ScvtfDW, // signed int32 → float
    ScvtfSX,
    ScvtfDX, // signed int64 → float
    FcvtzsWS,
    FcvtzsWD, // float → int32 (round toward zero)
    FcvtzsXS,
    FcvtzsXD, // float → int64 (round toward zero)
    FcvtSD,
    FcvtDS, // float↔double

    // ---- Move ----
    Movz,    // MOVZ Xd, #imm16, LSL #shift
    Movk,    // MOVK Xd, #imm16, LSL #shift
    Movn,    // MOVN Xd, #imm16, LSL #shift
    MovReg,  // MOV Xd, Xm (alias: ORR Xd, XZR, Xm)
    FmovReg, // FMOV Dd, Dm

    // ---- Memory ----
    StrImm,   // STR Xt, [Xn, #imm]
    LdrImm,   // LDR Xt, [Xn, #imm]
    StrhImm,  // STRH Wt, [Xn, #imm] (store 16-bit half)
    LdrshImm, // LDRSH Wt, [Xn, #imm] (load 16-bit half, sign-extended)
    StrbImm,  // STRB Wt, [Xn, #imm] (store 8-bit byte)
    LdrsbImm, // LDRSB Wt, [Xn, #imm] (load 8-bit byte, sign-extended)
    StrFpImm, // STR Dt, [Xn, #imm] (float store)
    LdrFpImm, // LDR Dt, [Xn, #imm] (float load)
    // Register-offset loads/stores: address = base + index << shift.
    // Operands: [dest, base, idx, Imm(shift)]. Shift ∈ {0,1,2,3}.
    // Sprint 05: emitted by `scaled_addressing_fusion` from a
    // Movz+Mul+AddReg+Ldr/Str sequence when elem_size ∈ {1,2,4,8}.
    LdrReg,    // LDR Xt|Wt, [Xn, Xm, lsl #shift]
    StrReg,    // STR Xt|Wt, [Xn, Xm, lsl #shift]
    LdrFpReg,  // LDR Dt|St, [Xn, Xm, lsl #shift]
    StrFpReg,  // STR Dt|St, [Xn, Xm, lsl #shift]
    StpPre,    // STP Xt1, Xt2, [Xn, #imm]! (pre-index)
    LdpPost,   // LDP Xt1, Xt2, [Xn], #imm (post-index)
    StpOffset, // STP Xt1, Xt2, [Xn, #imm] (signed offset, no writeback)
    LdpOffset, // LDP Xt1, Xt2, [Xn, #imm] (signed offset, no writeback)
    AdrpLdr,   // ADRP + LDR sequence (load value from PC-relative address)
    AdrpAdd,   // ADRP + ADD sequence (compute PC-relative address)

    // ---- Branch ----
    B,     // B label
    BCond, // B.cond label
    // Compare-and-branch (single-instruction zero check). Operands:
    // [VReg|PhysReg of register to test, BlockRef target]
    // Width inferred from the test register's class (Gp32 → cbz w; Gp64 → cbz x).
    // ±1MB range (19-bit signed × 4), same as BCond — relaxed identically.
    Cbz,
    Cbnz,
    // Test-bit-and-branch. Operands:
    // [VReg|PhysReg of test reg, Imm(bit_index 0..63), BlockRef target]
    // ±32KB range (14-bit signed × 4), tighter than BCond — needs its own relax bound.
    Tbz,
    Tbnz,
    Bl,  // BL label (call)
    Blr, // BLR reg (indirect call)
    Ret, // RET

    // ---- Extend ----
    Sxtw, // SXTW Xd, Wn (sign-extend 32→64)
    Sxth, // SXTH Wd|Xd, Wn (sign-extend 16→32 or 16→64)
    Sxtb, // SXTB Wd|Xd, Wn (sign-extend 8→32 or 8→64)

    // ---- Special ----
    Nop,
    Brk, // BRK #imm16 (debug trap)
}
| 279 | |
/// ARM64 condition codes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ArmCond {
    /// Equal.
    Eq,
    /// Not equal.
    Ne,
    /// Unsigned >= ("higher or same").
    Hs,
    /// Unsigned < ("lower").
    Lo,
    /// Negative ("minus").
    Mi,
    /// Positive or zero ("plus").
    Pl,
    /// Unsigned >.
    Hi,
    /// Unsigned <=.
    Ls,
    /// Signed >=.
    Ge,
    /// Signed <.
    Lt,
    /// Signed >.
    Gt,
    /// Signed <=.
    Le,
}

impl ArmCond {
    /// The condition that takes the opposite branch — used by the
    /// branch-relaxation pass when expanding a far `B.cond` into a
    /// short `B.{!cond}` over an unconditional `B`. Inversion pairs
    /// up EQ/NE, HS/LO, MI/PL, HI/LS, GE/LT and GT/LE, so the mapping
    /// is an involution: `c.inverse().inverse() == c` for every
    /// variant.
    pub fn inverse(self) -> ArmCond {
        use ArmCond::*;
        match self {
            Eq => Ne,
            Ne => Eq,
            Hs => Lo,
            Lo => Hs,
            Mi => Pl,
            Pl => Mi,
            Hi => Ls,
            Ls => Hi,
            Ge => Lt,
            Lt => Ge,
            Gt => Le,
            Le => Gt,
        }
    }
}
| 320 | |
/// A machine operand.
#[derive(Debug, Clone, PartialEq)]
pub enum MachineOperand {
    /// Virtual register.
    VReg(VRegId),
    /// Physical register (post-allocation or fixed registers like SP, FP, LR).
    PhysReg(PhysReg),
    /// Immediate value.
    Imm(i64),
    /// Stack frame slot (offset from FP; negative for locals).
    FrameSlot(i32),
    /// Condition code.
    Cond(ArmCond),
    /// Reference to a machine block (branch target).
    BlockRef(MBlockId),
    /// External symbol name (for BL to functions).
    Extern(String),
    /// Module-level global by name. Used by ADRP+ADD for SAVE'd
    /// locals and module variables, where the operand resolves to
    /// `_globalname@PAGE` / `_globalname@PAGEOFF` at emit time.
    GlobalLabel(String),
    /// Constant pool entry index (into `MachineFunction::const_pool`).
    ConstPool(u32),
    /// Shift amount for MOVZ/MOVK (0, 16, 32, 48).
    Shift(u8),
}
| 347 | |
/// Physical register reference.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PhysReg {
    /// 64-bit general purpose register (X0-X30).
    Gp(u8),
    /// 32-bit general purpose register (W0-W30).
    Gp32(u8),
    /// 64-bit FP/SIMD register (D0-D31).
    Fp(u8),
    /// 32-bit FP/SIMD register (S0-S31).
    Fp32(u8),
    /// Stack pointer.
    Sp,
    /// Zero register (64-bit context).
    Xzr,
    /// Zero register (32-bit context).
    Wzr,
}

impl PhysReg {
    /// Frame pointer, x29.
    pub const FP: PhysReg = PhysReg::Gp(29);
    /// Link register, x30.
    pub const LR: PhysReg = PhysReg::Gp(30);
}
| 371 | |
/// Machine block identifier.
///
/// Allocated densely by `MachineFunction::new_block` /
/// `next_block_id`; id 0 is the entry block.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct MBlockId(pub u32);
| 375 | |
/// A machine instruction.
#[derive(Debug, Clone)]
pub struct MachineInst {
    pub opcode: ArmOpcode,
    /// Operand list; interpretation is opcode-specific (see the
    /// per-variant comments on `ArmOpcode`).
    pub operands: Vec<MachineOperand>,
    /// Virtual register defined by this instruction (if any).
    pub def: Option<VRegId>,
}
| 384 | |
| 385 | /// A machine basic block. |
| 386 | #[derive(Debug, Clone)] |
| 387 | pub struct MachineBlock { |
| 388 | pub id: MBlockId, |
| 389 | pub label: String, |
| 390 | pub insts: Vec<MachineInst>, |
| 391 | } |
| 392 | |
| 393 | impl MachineBlock { |
| 394 | pub fn new(id: MBlockId, label: String) -> Self { |
| 395 | Self { |
| 396 | id, |
| 397 | label, |
| 398 | insts: Vec::new(), |
| 399 | } |
| 400 | } |
| 401 | } |
| 402 | |
/// Constant pool entry.
#[derive(Debug, Clone)]
pub enum ConstPoolEntry {
    /// 32-bit float literal.
    F32(f32),
    /// 64-bit float literal.
    F64(f64),
    /// 64-bit integer literal.
    I64(i64),
    /// Raw byte payload.
    Bytes(Vec<u8>),
}
| 411 | |
/// Stack frame layout.
///
/// Tracks local-variable slots (at negative offsets from FP), the
/// outgoing stack-argument area, and the total 16-byte-aligned frame
/// size.
#[derive(Debug, Clone)]
pub struct StackFrame {
    /// Slots for local variables (name → offset from FP).
    pub locals: Vec<FrameSlot>,
    /// Total frame size in bytes (16-byte aligned).
    pub size: u32,
    /// Running (positive) distance from FP down to the lowest
    /// allocated local; the next slot is placed below this.
    next_offset: i32,
    /// Maximum outgoing stack argument area reserved at the bottom of the frame.
    outgoing_arg_size: u32,
}

/// A stack frame slot.
#[derive(Debug, Clone)]
pub struct FrameSlot {
    pub offset: i32, // negative offset from FP
    pub size: u32,   // size in bytes
}

impl StackFrame {
    /// Create an empty frame holding only the saved FP/LR pair.
    ///
    /// Apple ARM64 frame layout:
    ///   FP points at saved FP/LR (top of frame);
    ///   locals sit at negative offsets from FP;
    ///   SP is at the bottom of the frame.
    ///
    ///   [FP+0]  = saved x29
    ///   [FP+8]  = saved x30
    ///   [FP-8]  = first local
    ///   [FP-16] = second local
    ///   ...
    ///   [SP]    = bottom of frame
    ///
    /// Prologue:  sub sp, sp, #FRAME_SIZE
    ///            stp x29, x30, [sp, #FRAME_SIZE - 16]
    ///            add x29, sp, #FRAME_SIZE - 16
    /// Epilogue:  ldp x29, x30, [sp, #FRAME_SIZE - 16]
    ///            add sp, sp, #FRAME_SIZE
    ///            ret
    pub fn new() -> Self {
        StackFrame {
            locals: Vec::new(),
            size: 16, // just the FP/LR save area
            next_offset: 0,
            outgoing_arg_size: 0,
        }
    }

    /// Allocate a local variable slot. Returns a negative offset from FP.
    /// Locals grow downward from FP: first local at [FP-8], etc.
    pub fn alloc_local(&mut self, size: u32) -> i32 {
        let align = Self::slot_alignment(size);
        // Advance past the new slot, then round the cumulative depth up
        // so the slot's (negative) start offset is `align`-aligned.
        let depth = self.next_offset + size as i32;
        self.next_offset = (depth + align - 1) & !(align - 1);
        let offset = -self.next_offset; // negative from FP
        self.locals.push(FrameSlot { offset, size });
        self.recompute_size();
        offset
    }

    /// Alignment ladder 4 → 8 → 16 by slot size. The 16 case matters
    /// for 128-bit NEON vector spills — `LDR Q` / `STR Q` need
    /// 16-byte-aligned addresses; capping at 8 would silently produce
    /// addresses that may fault on slow paths.
    fn slot_alignment(size: u32) -> i32 {
        if size >= 16 {
            16
        } else if size >= 8 {
            8
        } else {
            4
        }
    }

    /// Frame size = 16 (FP+LR) + locals + outgoing args, 16-byte aligned.
    fn recompute_size(&mut self) {
        let unaligned = 16 + self.next_offset as u32 + self.outgoing_arg_size;
        self.size = (unaligned + 15) & !15;
    }

    /// Reserve the maximum outgoing stack argument area this function needs.
    pub fn reserve_outgoing_args(&mut self, size: u32) {
        if size > self.outgoing_arg_size {
            self.outgoing_arg_size = size;
            self.recompute_size();
        }
    }
}

impl Default for StackFrame {
    fn default() -> Self {
        Self::new()
    }
}
| 505 | |
/// A machine function — the codegen output for one IR function.
#[derive(Debug, Clone)]
pub struct MachineFunction {
    pub name: String,
    /// Basic blocks; `blocks[0]` is the entry block (see `new`).
    pub blocks: Vec<MachineBlock>,
    pub frame: StackFrame,
    /// All virtual registers, indexed by `VRegId` (ids are dense).
    pub vregs: Vec<VReg>,
    pub const_pool: Vec<ConstPoolEntry>,
    // NOTE(review): presumably marks functions not exported from the
    // module (symbol visibility at emit time) — confirm in emitter.
    pub internal_only: bool,
    /// Next unallocated vreg id.
    next_vreg: u32,
    /// Next unallocated block id (0 is taken by the entry block).
    next_block: u32,
}
| 518 | |
| 519 | impl MachineFunction { |
| 520 | pub fn new(name: String) -> Self { |
| 521 | let entry = MachineBlock::new(MBlockId(0), format!("_{}", name)); |
| 522 | Self { |
| 523 | name, |
| 524 | blocks: vec![entry], |
| 525 | frame: StackFrame::new(), |
| 526 | vregs: Vec::new(), |
| 527 | const_pool: Vec::new(), |
| 528 | internal_only: false, |
| 529 | next_vreg: 0, |
| 530 | next_block: 1, |
| 531 | } |
| 532 | } |
| 533 | |
| 534 | /// Allocate a new virtual register. |
| 535 | pub fn new_vreg(&mut self, class: RegClass) -> VRegId { |
| 536 | let id = VRegId(self.next_vreg); |
| 537 | self.next_vreg += 1; |
| 538 | self.vregs.push(VReg { id, class }); |
| 539 | id |
| 540 | } |
| 541 | |
| 542 | /// Create a new machine block. |
| 543 | pub fn new_block(&mut self, label: &str) -> MBlockId { |
| 544 | let id = MBlockId(self.next_block); |
| 545 | self.next_block += 1; |
| 546 | self.blocks.push(MachineBlock::new(id, label.into())); |
| 547 | id |
| 548 | } |
| 549 | |
| 550 | /// Allocate a fresh block-id without inserting a block. The |
| 551 | /// caller is responsible for placing the block at the right |
| 552 | /// position in `self.blocks`. Used by passes that need physical |
| 553 | /// block adjacency (e.g. branch relaxation, which inserts a |
| 554 | /// skip block immediately after the source block). |
| 555 | pub fn next_block_id(&mut self) -> u32 { |
| 556 | let id = self.next_block; |
| 557 | self.next_block += 1; |
| 558 | id |
| 559 | } |
| 560 | |
| 561 | /// Get a block by ID. |
| 562 | pub fn block(&self, id: MBlockId) -> &MachineBlock { |
| 563 | self.blocks |
| 564 | .iter() |
| 565 | .find(|b| b.id == id) |
| 566 | .expect("machine block not found") |
| 567 | } |
| 568 | |
| 569 | /// Get a mutable block by ID. |
| 570 | pub fn block_mut(&mut self, id: MBlockId) -> &mut MachineBlock { |
| 571 | self.blocks |
| 572 | .iter_mut() |
| 573 | .find(|b| b.id == id) |
| 574 | .expect("machine block not found") |
| 575 | } |
| 576 | |
| 577 | /// Add a constant pool entry, return its index. |
| 578 | pub fn add_const(&mut self, entry: ConstPoolEntry) -> u32 { |
| 579 | let idx = self.const_pool.len() as u32; |
| 580 | self.const_pool.push(entry); |
| 581 | idx |
| 582 | } |
| 583 | |
| 584 | /// Allocate a local stack slot. |
| 585 | pub fn alloc_local(&mut self, size: u32) -> i32 { |
| 586 | self.frame.alloc_local(size) |
| 587 | } |
| 588 | |
| 589 | /// Reserve outgoing stack argument space for calls made by this function. |
| 590 | pub fn reserve_outgoing_args(&mut self, size: u32) { |
| 591 | self.frame.reserve_outgoing_args(size) |
| 592 | } |
| 593 | } |
| 594 | |
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn stack_frame_alignment() {
        // The frame must stay 16-byte aligned after every allocation.
        let mut frame = StackFrame::new();
        for size in [4u32, 8, 1] {
            frame.alloc_local(size); // i32, i64, bool
            assert_eq!(
                frame.size % 16,
                0,
                "frame size {} not 16-byte aligned",
                frame.size
            );
        }
    }

    #[test]
    fn stack_slots_dont_overlap() {
        let mut frame = StackFrame::new();
        let first = frame.alloc_local(4);
        let second = frame.alloc_local(4);
        let third = frame.alloc_local(8);
        assert_ne!(first, second);
        assert_ne!(second, third);
        // Every local lives below FP.
        assert!(first < 0 && second < 0 && third < 0);
        // No overlap: each slot's range is [offset, offset+size).
        assert!(second + 4 <= first || first + 4 <= second);
    }

    #[test]
    fn vreg_allocation() {
        let mut mf = MachineFunction::new("test".into());
        assert_eq!(mf.new_vreg(RegClass::Gp64), VRegId(0));
        assert_eq!(mf.new_vreg(RegClass::Fp64), VRegId(1));
        assert_eq!(mf.vregs.len(), 2);
        assert_eq!(mf.vregs[0].class, RegClass::Gp64);
        assert_eq!(mf.vregs[1].class, RegClass::Fp64);
    }

    #[test]
    fn const_pool() {
        let mut mf = MachineFunction::new("test".into());
        assert_eq!(mf.add_const(ConstPoolEntry::F64(3.14)), 0);
        assert_eq!(mf.add_const(ConstPoolEntry::F32(2.0)), 1);
    }

    #[test]
    fn frame_size_starts_at_16() {
        // A fresh frame holds only the saved FP/LR pair.
        assert_eq!(StackFrame::new().size, 16);
    }

    #[test]
    fn reserve_outgoing_args_grows_frame() {
        let mut frame = StackFrame::new();
        frame.alloc_local(8);
        let before = frame.size;
        frame.reserve_outgoing_args(16);
        assert!(frame.size >= before + 16);
        assert_eq!(frame.size % 16, 0);
    }
}
| 670 |