| 1 | //! Machine IR — low-level representation between SSA IR and ARM64 assembly. |
| 2 | //! |
//! Uses virtual registers (VReg) that the register allocator later maps
//! to physical registers. Until allocation has run, every vreg is
//! treated as spilled.
| 5 | |
/// Virtual register identifier.
///
/// A dense index newtype: ids are handed out from 0 by
/// `MachineFunction::new_vreg`, so a `VRegId` can double as an index
/// into `MachineFunction::vregs`. `Ord`/`Hash` are derived so ids can
/// key maps and be sorted for deterministic iteration.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct VRegId(pub u32);
| 9 | |
/// Virtual register with type class.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct VReg {
    /// Dense identifier, unique within one `MachineFunction`.
    pub id: VRegId,
    /// Register class constraining which physical registers may hold it.
    pub class: RegClass,
}
| 16 | |
/// Register class — determines which physical registers can hold this value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RegClass {
    /// General purpose (X0-X28, excluding x18/x29/x30).
    /// x29/x30 are FP/LR (see `PhysReg::FP` / `PhysReg::LR`); x18 is
    /// excluded as platform-reserved.
    Gp64,
    /// 32-bit general purpose (W0-W28).
    Gp32,
    /// FP/SIMD double (D0-D31).
    Fp64,
    /// FP/SIMD single (S0-S31).
    Fp32,
    /// 128-bit NEON vector (Q0-Q31). Covers 4×f32, 2×f64, 4×i32,
    /// 2×i64, etc. — every shape in `IrType::Vector`. Codegen
    /// shares the same physical bank as Fp32/Fp64 (the V registers
    /// are the 128-bit form of D/S), so the regalloc assigns them
    /// from the same pool but at 128-bit width.
    V128,
}
| 35 | |
/// ARM64 opcodes that we emit.
///
/// One variant per A64 instruction (or a short fixed sequence such as
/// ADRP+LDR). Operand layout is described per variant; the operands
/// themselves are carried in `MachineInst::operands`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ArmOpcode {
    // ---- Integer arithmetic ----
    AddReg,  // ADD Xd, Xn, Xm
    AddsReg, // ADDS Xd, Xn, Xm (sets flags)
    AdcReg,  // ADC Xd, Xn, Xm (add with carry)
    AddImm,  // ADD Xd, Xn, #imm
    SubReg,  // SUB Xd, Xn, Xm
    SubsReg, // SUBS Xd, Xn, Xm (sets flags)
    SbcReg,  // SBC Xd, Xn, Xm (subtract with borrow)
    SubImm,  // SUB Xd, Xn, #imm
    Mul,     // MUL Xd, Xn, Xm
    Sdiv,    // SDIV Xd, Xn, Xm (signed divide)
    Madd,    // MADD Xd, Xn, Xm, Xa (Xa + Xn*Xm; produced by mul-add peephole)
    Msub,    // MSUB Xd, Xn, Xm, Xa (Xa - Xn*Xm; for imod and the mul-sub peephole)
    Neg,     // NEG Xd, Xm (alias: SUB Xd, XZR, Xm)

    // ---- Logic ----
    AndReg, // AND Xd, Xn, Xm
    OrrReg, // ORR Xd, Xn, Xm
    EorReg, // EOR Xd, Xn, Xm (exclusive or)
    OrnReg, // ORN Xd, Xn, Xm (or-not; MVN is ORN Xd, XZR, Xm)

    // ---- Shifts ----
    LslReg, // LSL Xd, Xn, Xm (logical shift left)
    LsrReg, // LSR Xd, Xn, Xm (logical shift right)
    AsrReg, // ASR Xd, Xn, Xm (arithmetic shift right)

    // ---- Bit manipulation ----
    Mvn,  // MVN Xd, Xm (bitwise NOT, alias: ORN Xd, XZR, Xm)
    Clz,  // CLZ Xd, Xn (count leading zeros)
    Rbit, // RBIT Xd, Xn (reverse bits)

    // ---- Comparison & select ----
    CmpReg,  // CMP Xn, Xm (alias: SUBS XZR, Xn, Xm)
    CmpImm,  // CMP Xn, #imm
    Cset,    // CSET Xd, cond
    CselReg, // CSEL Xd, Xn, Xm, cond
    FCmpReg, // FCMP Dn, Dm
    // CSET Xd, cond (after FCMP).
    // NOTE(review): encodes like Cset; presumably kept distinct so emit
    // can tell integer- from float-flag consumers — confirm in emitter.
    FCset,
    FcselReg, // FCSEL Dd, Dn, Dm, cond

    // ---- Float arithmetic ----
    // S suffix = single precision (Sn regs), D suffix = double (Dn regs).
    FaddS,
    FaddD,
    FsubS,
    FsubD,
    FmulS,
    FmulD,
    FdivS,
    FdivD,
    FnegS,
    FnegD,
    FabsS,
    FabsD,
    FsqrtS,
    FsqrtD,
    // Fused multiply-add/subtract (FMADD/FMSUB/FNMSUB).
    // 3-source: dest = Sa ± Sn*Sm.
    // NOTE(review): the FmsubS/FmsubD/FnmsubS/FnmsubD variants of this
    // family live further down (after Mov16B); variant order is kept
    // as-is here rather than regrouping them.
    FmaddS,
    FmaddD, // FMADD: dest = Sa + Sn*Sm

    // ---- NEON SIMD vector arithmetic (Sprint 12 Stage 2) ----
    //
    // Each opcode encodes the lane shape so emit/encoding stays
    // table-driven. Naming convention: `<Op><LaneCount><LaneType>`.
    // Examples: `FaddV4S` is "fadd Vd.4s, Vn.4s, Vm.4s", `FmlaV2D`
    // is "fmla Vd.2d, Vn.2d, Vm.2d".
    //
    // Operands across this family: dest VReg of class V128 plus the
    // expected source operands for that op. Lane shape is implicit
    // in the opcode; emit just dispatches.
    AddV4S, // ADD Vd.4s, Vn.4s, Vm.4s (integer)
    AddV2D, // ADD Vd.2d, Vn.2d, Vm.2d
    SubV4S,
    SubV2D,
    MulV4S, // MUL Vd.4s, Vn.4s, Vm.4s (integer; 2D not in NEON)
    NegV4S,
    NegV2D,
    FaddV4S, // FADD Vd.4s, Vn.4s, Vm.4s
    FaddV2D, // FADD Vd.2d, Vn.2d, Vm.2d
    FsubV4S,
    FsubV2D,
    FmulV4S,
    FmulV2D,
    FdivV4S,
    FdivV2D,
    FnegV4S,
    FnegV2D,
    FabsV4S,
    FabsV2D,
    FsqrtV4S,
    FsqrtV2D,
    FmlaV4S, // FMLA Vd.4s, Vn.4s, Vm.4s (Vd += Vn*Vm)
    FmlaV2D,
    /// BSL Vd.16B, Vn.16B, Vm.16B — bit select. Per-bit:
    /// `Vd[i] = Vd[i] ? Vn[i] : Vm[i]`. Vd is destructive
    /// (input mask + output). Used to lower VSelect.
    BslV16B,
    /// Vector compare (per-lane all-ones / all-zeros result).
    FcmgtV4S,
    FcmgtV2D,
    FcmgeV4S,
    FcmgeV2D,
    FcmeqV4S,
    FcmeqV2D,
    CmgtV4S, // integer per-lane compares (signed >, >=, ==)
    CmgeV4S,
    CmeqV4S,
    FminV4S, // per-lane float min/max
    FminV2D,
    FmaxV4S,
    FmaxV2D,
    SminV4S, // SMIN (signed integer)
    SmaxV4S,
    UminV4S, // UMIN/UMAX (unsigned integer)
    UmaxV4S,

    // Cross-lane reductions
    FaddpV2S, // FADDP Sd, Vn.2s (pair-add → scalar; 2-lane f32)
    /// `FADDP.4S Vd, Vn, Vm` — 3-operand pairwise add over four
    /// f32 lanes. For cross-lane f32 sum reduction we use this with
    /// `Vn = Vm = v_src` then follow with FaddpV2S to fold the
    /// remaining two lanes (NEON has no `faddv.4s`).
    FaddpV4S,
    FaddpV2D, // FADDP Dd, Vn.2d (pair-add → scalar; 2-lane f64)
    // NOTE(review): `Faddv4S`/`Sminv4S`/`Smaxv4S`/`Uminv4S`/`Umaxv4S`
    // naming is inconsistent with `FmaxvV4S`/`FminvV4S` below.
    Faddv4S, // FADDV Sd, Vn.4s (across 4 f32 lanes → scalar)
    Sminv4S, // SMINV Sd, Vn.4s
    Smaxv4S,
    /// `FMAXV.4S Sd, Vn` — across-lane f32 max reduction → scalar.
    FmaxvV4S,
    /// `FMINV.4S Sd, Vn` — across-lane f32 min reduction → scalar.
    FminvV4S,
    /// `FMAXP.2D Dd, Vn` — pairwise f64 max reduction (2 lanes → scalar).
    /// NEON has no `fmaxv.2d`; for two f64 lanes the pairwise form is
    /// the across-lane reduction.
    FmaxpV2DScalar,
    /// `FMINP.2D Dd, Vn` — pairwise f64 min reduction (2 lanes → scalar).
    FminpV2DScalar,
    /// `ADDP.2D Vd, Vn, Vm` — pairwise integer add over two i64 lanes.
    /// Used for i64 cross-lane reduction: `addp.2d v_dst, v_src, v_src`
    /// puts the sum of the two lanes in v_dst[0].
    AddpV2D,
    Uminv4S, // UMINV/UMAXV Sd, Vn.4s (unsigned reductions)
    Umaxv4S,
    Addv4S, // integer cross-lane add over 4×i32

    // Lane move / broadcast
    DupGen4S, // DUP Vd.4s, Wn (broadcast scalar to 4 lanes)
    DupGen2D, // DUP Vd.2d, Xn
    DupEl4S,  // DUP Vd.4s, Vn.s[0] (broadcast lane 0 to 4 lanes)
    DupEl2D,
    Ins4S, // INS Vd.s[lane], Wn (insert scalar into one lane)
    Ins2D,
    Umov4S, // UMOV Wd, Vn.s[lane] (extract lane to scalar)
    Umov2D,
    FmovEl4S, // FMOV Sd, Vn.s[lane] (extract f32 lane)
    FmovEl2D,

    // Vector load/store (128-bit Q register)
    LdrQ, // LDR Qt, [Xn, #imm]
    StrQ, // STR Qt, [Xn, #imm]
    /// `mov.16b vN, vM` — 128-bit register-to-register copy.
    /// Used by regalloc when moving a V128 vreg between physical
    /// regs; FmovReg only handles the low 64 bits and would corrupt
    /// the upper lanes of a V128.
    Mov16B,
    // Remaining scalar fused multiply variants (see FmaddS/FmaddD above).
    FmsubS,
    FmsubD, // FMSUB: dest = Sa - Sn*Sm
    FnmsubS,
    FnmsubD, // FNMSUB: dest = Sn*Sm - Sa

    // ---- Conversions ----
    // Suffix pattern: <dst reg kind><src reg kind>,
    // S/D = float single/double, W/X = 32/64-bit integer.
    ScvtfSW,
    ScvtfDW, // signed int32 → float
    ScvtfSX,
    ScvtfDX, // signed int64 → float
    FcvtzsWS,
    FcvtzsWD, // float → int32 (round toward zero)
    FcvtzsXS,
    FcvtzsXD, // float → int64 (round toward zero)
    FcvtSD,
    FcvtDS, // float↔double

    // ---- Move ----
    Movz,    // MOVZ Xd, #imm16, LSL #shift
    Movk,    // MOVK Xd, #imm16, LSL #shift
    Movn,    // MOVN Xd, #imm16, LSL #shift
    MovReg,  // MOV Xd, Xm (alias: ORR Xd, XZR, Xm)
    FmovReg, // FMOV Dd, Dm

    // ---- Memory ----
    StrImm,   // STR Xt, [Xn, #imm]
    LdrImm,   // LDR Xt, [Xn, #imm]
    StrhImm,  // STRH Wt, [Xn, #imm] (store 16-bit half)
    LdrshImm, // LDRSH Wt, [Xn, #imm] (load 16-bit half, sign-extended)
    StrbImm,  // STRB Wt, [Xn, #imm] (store 8-bit byte)
    LdrsbImm, // LDRSB Wt, [Xn, #imm] (load 8-bit byte, sign-extended)
    StrFpImm, // STR Dt, [Xn, #imm] (float store)
    LdrFpImm, // LDR Dt, [Xn, #imm] (float load)
    // Register-offset loads/stores: address = base + index << shift.
    // Operands: [dest, base, idx, Imm(shift)]. Shift ∈ {0,1,2,3}.
    // Sprint 05: emitted by `scaled_addressing_fusion` from a
    // Movz+Mul+AddReg+Ldr/Str sequence when elem_size ∈ {1,2,4,8}.
    LdrReg,    // LDR Xt|Wt, [Xn, Xm, lsl #shift]
    StrReg,    // STR Xt|Wt, [Xn, Xm, lsl #shift]
    LdrFpReg,  // LDR Dt|St, [Xn, Xm, lsl #shift]
    StrFpReg,  // STR Dt|St, [Xn, Xm, lsl #shift]
    StpPre,    // STP Xt1, Xt2, [Xn, #imm]! (pre-index)
    LdpPost,   // LDP Xt1, Xt2, [Xn], #imm (post-index)
    StpOffset, // STP Xt1, Xt2, [Xn, #imm] (signed offset, no writeback)
    LdpOffset, // LDP Xt1, Xt2, [Xn, #imm] (signed offset, no writeback)
    AdrpLdr,   // ADRP + LDR sequence (load value from PC-relative address)
    AdrpAdd,   // ADRP + ADD sequence (compute PC-relative address)

    // ---- Branch ----
    B,     // B label
    BCond, // B.cond label
    // Compare-and-branch (single-instruction zero check). Operands:
    // [VReg|PhysReg of register to test, BlockRef target]
    // Width inferred from the test register's class (Gp32 → cbz w; Gp64 → cbz x).
    // ±1MB range (19-bit signed × 4), same as BCond — relaxed identically.
    Cbz,
    Cbnz,
    // Test-bit-and-branch. Operands:
    // [VReg|PhysReg of test reg, Imm(bit_index 0..63), BlockRef target]
    // ±32KB range (14-bit signed × 4), tighter than BCond — needs its own relax bound.
    Tbz,
    Tbnz,
    Bl,  // BL label (call)
    Blr, // BLR reg (indirect call)
    Ret, // RET

    // ---- Extend ----
    Sxtw, // SXTW Xd, Wn (sign-extend 32→64)
    Sxth, // SXTH Wd|Xd, Wn (sign-extend 16→32 or 16→64)
    Sxtb, // SXTB Wd|Xd, Wn (sign-extend 8→32 or 8→64)

    // ---- Special ----
    Nop,
    Brk, // BRK #imm16 (debug trap)
}
| 279 | |
/// ARM64 condition codes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ArmCond {
    /// Equal.
    Eq,
    /// Not equal.
    Ne,
    /// Unsigned >= ("higher or same").
    Hs,
    /// Unsigned < ("lower").
    Lo,
    /// Negative ("minus").
    Mi,
    /// Positive or zero ("plus").
    Pl,
    /// Unsigned >.
    Hi,
    /// Unsigned <=.
    Ls,
    /// Signed >=.
    Ge,
    /// Signed <.
    Lt,
    /// Signed >.
    Gt,
    /// Signed <=.
    Le,
}

impl ArmCond {
    /// The condition that takes the opposite branch — used by the
    /// branch-relaxation pass when expanding a far `B.cond` into a
    /// short `B.{!cond}` over an unconditional `B`. Inversion pairs
    /// up EQ/NE, HS/LO, MI/PL, HI/LS, GE/LT and GT/LE, so the mapping
    /// is an involution: `c.inverse().inverse() == c` for every
    /// variant.
    pub fn inverse(self) -> ArmCond {
        use ArmCond::*;
        match self {
            Eq => Ne,
            Ne => Eq,
            Hs => Lo,
            Lo => Hs,
            Mi => Pl,
            Pl => Mi,
            Hi => Ls,
            Ls => Hi,
            Ge => Lt,
            Lt => Ge,
            Gt => Le,
            Le => Gt,
        }
    }
}
| 320 | |
/// A machine operand.
#[derive(Debug, Clone, PartialEq)]
pub enum MachineOperand {
    /// Virtual register.
    VReg(VRegId),
    /// Physical register (post-allocation or fixed registers like SP, FP, LR).
    PhysReg(PhysReg),
    /// Immediate value.
    Imm(i64),
    /// Stack frame slot (offset from FP; negative for locals).
    FrameSlot(i32),
    /// Condition code.
    Cond(ArmCond),
    /// Reference to a machine block (branch target).
    BlockRef(MBlockId),
    /// External symbol name (for BL to functions).
    Extern(String),
    /// Module-level global by name. Used by ADRP+ADD for SAVE'd
    /// locals and module variables, where the operand resolves to
    /// `_globalname@PAGE` / `_globalname@PAGEOFF` at emit time.
    GlobalLabel(String),
    /// Constant pool entry index (into `MachineFunction::const_pool`).
    ConstPool(u32),
    /// Shift amount for MOVZ/MOVK (0, 16, 32, 48).
    Shift(u8),
}
| 347 | |
/// Physical register reference.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PhysReg {
    /// 64-bit general purpose register (X0-X30).
    Gp(u8),
    /// 32-bit general purpose register (W0-W30).
    Gp32(u8),
    /// 64-bit FP/SIMD register (D0-D31).
    Fp(u8),
    /// 32-bit FP/SIMD register (S0-S31).
    Fp32(u8),
    /// Stack pointer.
    Sp,
    /// Zero register (64-bit context).
    Xzr,
    /// Zero register (32-bit context).
    Wzr,
}

impl PhysReg {
    /// Frame pointer, x29.
    pub const FP: PhysReg = PhysReg::Gp(29);
    /// Link register, x30.
    pub const LR: PhysReg = PhysReg::Gp(30);
}
| 371 | |
/// Machine block identifier.
///
/// Allocated densely by `MachineFunction::new_block` /
/// `next_block_id`; id 0 is the entry block.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct MBlockId(pub u32);
| 375 | |
/// A machine instruction.
#[derive(Debug, Clone)]
pub struct MachineInst {
    pub opcode: ArmOpcode,
    /// Operand list; interpretation is opcode-specific (see the
    /// per-variant comments on `ArmOpcode`).
    pub operands: Vec<MachineOperand>,
    /// Virtual register defined by this instruction (if any).
    pub def: Option<VRegId>,
}
| 384 | |
| 385 | /// A machine basic block. |
| 386 | #[derive(Debug, Clone)] |
| 387 | pub struct MachineBlock { |
| 388 | pub id: MBlockId, |
| 389 | pub label: String, |
| 390 | pub insts: Vec<MachineInst>, |
| 391 | } |
| 392 | |
| 393 | impl MachineBlock { |
| 394 | pub fn new(id: MBlockId, label: String) -> Self { |
| 395 | Self { |
| 396 | id, |
| 397 | label, |
| 398 | insts: Vec::new(), |
| 399 | } |
| 400 | } |
| 401 | } |
| 402 | |
/// Constant pool entry.
#[derive(Debug, Clone)]
pub enum ConstPoolEntry {
    /// 32-bit float literal.
    F32(f32),
    /// 64-bit float literal.
    F64(f64),
    /// 64-bit integer literal.
    I64(i64),
    /// Raw byte payload.
    Bytes(Vec<u8>),
}
| 411 | |
/// Stack frame layout.
///
/// Tracks local-variable slots (at negative offsets from FP), the
/// outgoing stack-argument area, and the total 16-byte-aligned frame
/// size.
#[derive(Debug, Clone)]
pub struct StackFrame {
    /// Slots for local variables (name → offset from FP).
    pub locals: Vec<FrameSlot>,
    /// Total frame size in bytes (16-byte aligned).
    pub size: u32,
    /// Running (positive) distance from FP down to the lowest
    /// allocated local; the next slot is placed below this.
    next_offset: i32,
    /// Maximum outgoing stack argument area reserved at the bottom of the frame.
    outgoing_arg_size: u32,
}

/// A stack frame slot.
#[derive(Debug, Clone)]
pub struct FrameSlot {
    pub offset: i32, // negative offset from FP
    pub size: u32,   // size in bytes
}

impl StackFrame {
    /// Create an empty frame holding only the saved FP/LR pair.
    ///
    /// Apple ARM64 frame layout:
    ///   FP points at saved FP/LR (top of frame);
    ///   locals sit at negative offsets from FP;
    ///   SP is at the bottom of the frame.
    ///
    ///   [FP+0]  = saved x29
    ///   [FP+8]  = saved x30
    ///   [FP-8]  = first local
    ///   [FP-16] = second local
    ///   ...
    ///   [SP]    = bottom of frame
    ///
    /// Prologue:  sub sp, sp, #FRAME_SIZE
    ///            stp x29, x30, [sp, #FRAME_SIZE - 16]
    ///            add x29, sp, #FRAME_SIZE - 16
    /// Epilogue:  ldp x29, x30, [sp, #FRAME_SIZE - 16]
    ///            add sp, sp, #FRAME_SIZE
    ///            ret
    pub fn new() -> Self {
        StackFrame {
            locals: Vec::new(),
            size: 16, // just the FP/LR save area
            next_offset: 0,
            outgoing_arg_size: 0,
        }
    }

    /// Allocate a local variable slot. Returns a negative offset from FP.
    /// Locals grow downward from FP: first local at [FP-8], etc.
    pub fn alloc_local(&mut self, size: u32) -> i32 {
        let align = Self::slot_alignment(size);
        // Advance past the new slot, then round the cumulative depth up
        // so the slot's (negative) start offset is `align`-aligned.
        let depth = self.next_offset + size as i32;
        self.next_offset = (depth + align - 1) & !(align - 1);
        let offset = -self.next_offset; // negative from FP
        self.locals.push(FrameSlot { offset, size });
        self.recompute_size();
        offset
    }

    /// Alignment ladder 4 → 8 → 16 by slot size. The 16 case matters
    /// for 128-bit NEON vector spills — `LDR Q` / `STR Q` need
    /// 16-byte-aligned addresses; capping at 8 would silently produce
    /// addresses that may fault on slow paths.
    fn slot_alignment(size: u32) -> i32 {
        if size >= 16 {
            16
        } else if size >= 8 {
            8
        } else {
            4
        }
    }

    /// Frame size = 16 (FP+LR) + locals + outgoing args, 16-byte aligned.
    fn recompute_size(&mut self) {
        let unaligned = 16 + self.next_offset as u32 + self.outgoing_arg_size;
        self.size = (unaligned + 15) & !15;
    }

    /// Reserve the maximum outgoing stack argument area this function needs.
    pub fn reserve_outgoing_args(&mut self, size: u32) {
        if size > self.outgoing_arg_size {
            self.outgoing_arg_size = size;
            self.recompute_size();
        }
    }
}

impl Default for StackFrame {
    fn default() -> Self {
        Self::new()
    }
}
| 505 | |
/// A machine function — the codegen output for one IR function.
#[derive(Debug, Clone)]
pub struct MachineFunction {
    pub name: String,
    /// Basic blocks; `blocks[0]` is the entry block (see `new`).
    pub blocks: Vec<MachineBlock>,
    pub frame: StackFrame,
    /// All virtual registers, indexed by `VRegId` (ids are dense).
    pub vregs: Vec<VReg>,
    pub const_pool: Vec<ConstPoolEntry>,
    // NOTE(review): presumably marks functions not exported from the
    // module (symbol visibility at emit time) — confirm in emitter.
    pub internal_only: bool,
    /// Next unallocated vreg id.
    next_vreg: u32,
    /// Next unallocated block id (0 is taken by the entry block).
    next_block: u32,
}
| 518 | |
| 519 | impl MachineFunction { |
| 520 | pub fn new(name: String) -> Self { |
| 521 | let entry = MachineBlock::new(MBlockId(0), format!("_{}", name)); |
| 522 | Self { |
| 523 | name, |
| 524 | blocks: vec![entry], |
| 525 | frame: StackFrame::new(), |
| 526 | vregs: Vec::new(), |
| 527 | const_pool: Vec::new(), |
| 528 | internal_only: false, |
| 529 | next_vreg: 0, |
| 530 | next_block: 1, |
| 531 | } |
| 532 | } |
| 533 | |
| 534 | /// Allocate a new virtual register. |
| 535 | pub fn new_vreg(&mut self, class: RegClass) -> VRegId { |
| 536 | let id = VRegId(self.next_vreg); |
| 537 | self.next_vreg += 1; |
| 538 | self.vregs.push(VReg { id, class }); |
| 539 | id |
| 540 | } |
| 541 | |
| 542 | /// Create a new machine block. |
| 543 | pub fn new_block(&mut self, label: &str) -> MBlockId { |
| 544 | let id = MBlockId(self.next_block); |
| 545 | self.next_block += 1; |
| 546 | self.blocks.push(MachineBlock::new(id, label.into())); |
| 547 | id |
| 548 | } |
| 549 | |
| 550 | /// Allocate a fresh block-id without inserting a block. The |
| 551 | /// caller is responsible for placing the block at the right |
| 552 | /// position in `self.blocks`. Used by passes that need physical |
| 553 | /// block adjacency (e.g. branch relaxation, which inserts a |
| 554 | /// skip block immediately after the source block). |
| 555 | pub fn next_block_id(&mut self) -> u32 { |
| 556 | let id = self.next_block; |
| 557 | self.next_block += 1; |
| 558 | id |
| 559 | } |
| 560 | |
| 561 | /// Get a block by ID. |
| 562 | pub fn block(&self, id: MBlockId) -> &MachineBlock { |
| 563 | self.blocks |
| 564 | .iter() |
| 565 | .find(|b| b.id == id) |
| 566 | .expect("machine block not found") |
| 567 | } |
| 568 | |
| 569 | /// Get a mutable block by ID. |
| 570 | pub fn block_mut(&mut self, id: MBlockId) -> &mut MachineBlock { |
| 571 | self.blocks |
| 572 | .iter_mut() |
| 573 | .find(|b| b.id == id) |
| 574 | .expect("machine block not found") |
| 575 | } |
| 576 | |
| 577 | /// Add a constant pool entry, return its index. |
| 578 | pub fn add_const(&mut self, entry: ConstPoolEntry) -> u32 { |
| 579 | let idx = self.const_pool.len() as u32; |
| 580 | self.const_pool.push(entry); |
| 581 | idx |
| 582 | } |
| 583 | |
| 584 | /// Allocate a local stack slot. |
| 585 | pub fn alloc_local(&mut self, size: u32) -> i32 { |
| 586 | self.frame.alloc_local(size) |
| 587 | } |
| 588 | |
| 589 | /// Reserve outgoing stack argument space for calls made by this function. |
| 590 | pub fn reserve_outgoing_args(&mut self, size: u32) { |
| 591 | self.frame.reserve_outgoing_args(size) |
| 592 | } |
| 593 | } |
| 594 | |
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn stack_frame_alignment() {
        // The frame must stay 16-byte aligned after every allocation.
        let mut frame = StackFrame::new();
        for size in [4u32, 8, 1] {
            frame.alloc_local(size); // i32, i64, bool
            assert_eq!(
                frame.size % 16,
                0,
                "frame size {} not 16-byte aligned",
                frame.size
            );
        }
    }

    #[test]
    fn stack_slots_dont_overlap() {
        let mut frame = StackFrame::new();
        let first = frame.alloc_local(4);
        let second = frame.alloc_local(4);
        let third = frame.alloc_local(8);
        assert_ne!(first, second);
        assert_ne!(second, third);
        // Every local lives below FP.
        assert!(first < 0 && second < 0 && third < 0);
        // No overlap: each slot's range is [offset, offset+size).
        assert!(second + 4 <= first || first + 4 <= second);
    }

    #[test]
    fn vreg_allocation() {
        let mut mf = MachineFunction::new("test".into());
        assert_eq!(mf.new_vreg(RegClass::Gp64), VRegId(0));
        assert_eq!(mf.new_vreg(RegClass::Fp64), VRegId(1));
        assert_eq!(mf.vregs.len(), 2);
        assert_eq!(mf.vregs[0].class, RegClass::Gp64);
        assert_eq!(mf.vregs[1].class, RegClass::Fp64);
    }

    #[test]
    fn const_pool() {
        let mut mf = MachineFunction::new("test".into());
        assert_eq!(mf.add_const(ConstPoolEntry::F64(3.14)), 0);
        assert_eq!(mf.add_const(ConstPoolEntry::F32(2.0)), 1);
    }

    #[test]
    fn frame_size_starts_at_16() {
        // A fresh frame holds only the saved FP/LR pair.
        assert_eq!(StackFrame::new().size, 16);
    }

    #[test]
    fn reserve_outgoing_args_grows_frame() {
        let mut frame = StackFrame::new();
        frame.alloc_local(8);
        let before = frame.size;
        frame.reserve_outgoing_args(16);
        assert!(frame.size >= before + 16);
        assert_eq!(frame.size % 16, 0);
    }
}
| 670 |