//! Instruction selection — translate SSA IR to Machine IR.
//!
//! Maps each IR instruction to one or more ARM64 machine instructions.
//! Uses virtual registers throughout; physical register assignment
//! happens in the register allocator (Sprint 21).
//!
//! Strategy: naive spill-everything. Every vreg lives on the stack.
//! Load before use, store after def. Correct but slow — optimized later.

use super::mir::*;
use crate::ir::inst::*;
use crate::ir::types::*;
use std::collections::{HashMap, HashSet};

/// Select machine instructions for an entire IR module.
pub fn select_module(module: &Module) -> Vec<MachineFunction> {
    // Build function name table for resolving Internal call refs.
    let func_names: Vec<String> = module.functions.iter().map(|f| f.name.clone()).collect();
    module
        .functions
        .iter()
        .map(|f| select_function_with_names(f, &func_names))
        .collect()
}

fn select_function_with_names(func: &Function, func_names: &[String]) -> MachineFunction {
    let mut mf = select_function(func);
    // Resolve any Internal call references to actual function names.
    for block in &mut mf.blocks {
        for inst in &mut block.insts {
            if let super::mir::ArmOpcode::Bl = inst.opcode {
                if let Some(super::mir::MachineOperand::Extern(ref mut name)) =
                    inst.operands.first_mut()
                {
                    // Check if this is a placeholder "_func_N" name from isel.
                    if name.starts_with("_func_") {
                        if let Ok(idx) = name[6..].parse::<usize>() {
                            if idx < func_names.len() {
                                *name = func_names[idx].clone();
                            }
                        }
                    }
                }
            }
        }
    }
    mf
}

use super::abi::{classify_abi_arg, AbiArgLoc, AbiArgState};

/// Select machine instructions for one IR function.
pub fn select_function(func: &Function) -> MachineFunction {
    let mut mf = MachineFunction::new(func.name.clone());
    mf.internal_only = func.internal_only;
    let mut ctx = ISelCtx::new();

    // Phase 1: allocate stack slots for all IR alloca instructions.
    for block in &func.blocks {
        for inst in &block.insts {
            if let InstKind::Alloca(ty) = &inst.kind {
                let size = alloca_size(ty);
                let offset = mf.alloc_local(size);
                ctx.alloca_offsets.insert(inst.id, offset);
            }
        }
    }

    // Phase 2: create machine blocks corresponding to IR blocks.
    // Entry block already exists as MBlockId(0).
    //
    // Block labels are prefixed with the function name so two
    // functions in the same .s file don't collide on common names
    // like `do_check_1`. The `L` prefix turns them into local
    // symbols on Apple's assembler.
    ctx.block_map.insert(func.entry, MBlockId(0));
    for block in &func.blocks {
        if block.id != func.entry {
            let label = format!("L{}_{}", mf.name, block.name);
            let mb_id = mf.new_block(&label);
            ctx.block_map.insert(block.id, mb_id);
        }
    }

    enum IncomingParam {
        Narrow(VRegId, RegClass, AbiArgLoc, IrType),
        Wide(i32, AbiArgLoc),
    }

    // Phase 2.5: handle incoming parameters.
    // Create a vreg or a wide stack slot for each param.
    // The physical register save happens after the prologue.
    let mut param_info: Vec<IncomingParam> = Vec::new();
    let mut abi_state = AbiArgState::default();
    for param in &func.params {
        let loc = classify_abi_arg(&param.ty, &mut abi_state);
        if matches!(param.ty, IrType::Int(IntWidth::I128)) {
            let offset = mf.alloc_local(16);
            ctx.wide_value_slots.insert(param.id, offset);
            param_info.push(IncomingParam::Wide(offset, loc));
            continue;
        }
        let class = type_to_reg_class(&param.ty);
        let vreg = mf.new_vreg(class);
        ctx.value_map.insert(param.id, vreg);
        param_info.push(IncomingParam::Narrow(vreg, class, loc, param.ty.clone()));
    }

    // Phase 3: emit prologue in entry block.
    emit_prologue(&mut mf, MBlockId(0));

    // Phase 3.5: move incoming argument registers into param vregs.
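    //
    // Minimal illustration (not literal emitted text; vreg names are
    // placeholders in this sketch): for a signature like `(i32, f64)`,
    // the ABI classifier hands back Gp32(0) and Fp(0), so the entry
    // block gains roughly
    //     mov  w<v0>, w0    ; integer param -> Gp32 vreg
    //     fmov d<v1>, d0    ; float param   -> Fp64 vreg
    // ahead of any user instruction.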
    // Dispatch by register class: GP args from x0-x7, FP args from d0-d7.
    for info in &param_info {
        match info {
            IncomingParam::Wide(offset, AbiArgLoc::GpPair(reg)) => {
                emit_store_phys_i128_pair(
                    &mut mf,
                    MBlockId(0),
                    MachineOperand::PhysReg(PhysReg::FP),
                    *offset as i64,
                    PhysReg::Gp(*reg),
                    PhysReg::Gp(*reg + 1),
                );
            }
            IncomingParam::Wide(offset, AbiArgLoc::Stack(stack_offset)) => {
                emit_load_phys_i128_pair(
                    &mut mf,
                    MBlockId(0),
                    MachineOperand::PhysReg(PhysReg::FP),
                    16 + *stack_offset,
                    PhysReg::Gp(16),
                    PhysReg::Gp(17),
                );
                emit_store_phys_i128_pair(
                    &mut mf,
                    MBlockId(0),
                    MachineOperand::PhysReg(PhysReg::FP),
                    *offset as i64,
                    PhysReg::Gp(16),
                    PhysReg::Gp(17),
                );
            }
            IncomingParam::Narrow(vreg, RegClass::Fp64, AbiArgLoc::Fp(reg), _) => {
                mf.block_mut(MBlockId(0)).insts.push(MachineInst {
                    opcode: ArmOpcode::FmovReg,
                    operands: vec![
                        MachineOperand::VReg(*vreg),
                        MachineOperand::PhysReg(PhysReg::Fp(*reg)),
                    ],
                    def: Some(*vreg),
                });
            }
            IncomingParam::Narrow(vreg, RegClass::Fp32, AbiArgLoc::Fp32(reg), _) => {
                mf.block_mut(MBlockId(0)).insts.push(MachineInst {
                    opcode: ArmOpcode::FmovReg,
                    operands: vec![
                        MachineOperand::VReg(*vreg),
                        MachineOperand::PhysReg(PhysReg::Fp32(*reg)),
                    ],
                    def: Some(*vreg),
                });
            }
            IncomingParam::Narrow(vreg, RegClass::Gp32, AbiArgLoc::Gp32(reg), _) => {
                mf.block_mut(MBlockId(0)).insts.push(MachineInst {
                    opcode: ArmOpcode::MovReg,
                    operands: vec![
                        MachineOperand::VReg(*vreg),
                        MachineOperand::PhysReg(PhysReg::Gp32(*reg)),
                    ],
                    def: Some(*vreg),
                });
            }
            IncomingParam::Narrow(vreg, _, AbiArgLoc::Gp(reg), _) => {
                mf.block_mut(MBlockId(0)).insts.push(MachineInst {
                    opcode: ArmOpcode::MovReg,
                    operands: vec![
                        MachineOperand::VReg(*vreg),
                        MachineOperand::PhysReg(PhysReg::Gp(*reg)),
                    ],
                    def: Some(*vreg),
                });
            }
            IncomingParam::Narrow(vreg, class, AbiArgLoc::Stack(stack_offset), ty) => {
                emit_load_stack_arg_into_vreg(
                    &mut mf,
                    MBlockId(0),
                    *vreg,
                    *class,
                    ty,
                    16 + *stack_offset,
                );
            }
            IncomingParam::Wide(_, other) => {
                panic!(
                    "isel: unexpected ABI loc {:?} for incoming i128 param",
                    other
                );
            }
            IncomingParam::Narrow(_, class, other, _) => {
                panic!(
                    "isel: unexpected ABI loc {:?} for incoming {:?} param",
                    other, class
                );
            }
        }
    }

    // Phase 4a: allocate vregs for EVERY block parameter AND every
    // instruction result *before* walking any instructions. We need
    // this upfront because:
    //
    //   - A branch terminator needs to know the target block's
    //     param vregs to emit "move branch arg → target param"
    //     copies, and the target block may not have been walked yet.
    //
    //   - An instruction in block A may reference an SSA value
    //     defined in block B that appears later in `func.blocks`
    //     vec order (perfectly legal under SSA dominance — block B
    //     can dominate block A even if it comes later in the vec).
    //     Without upfront allocation, the lookup fails.
    //
    // Allocation here doesn't emit machine instructions; it just
    // reserves vreg IDs for every IR ValueId so Phase 4b can use
    // `lookup_vreg` without ordering concerns.
    for block in &func.blocks {
        for bp in &block.params {
            if matches!(bp.ty, IrType::Int(IntWidth::I128)) {
                let offset = mf.alloc_local(16);
                ctx.wide_value_slots.insert(bp.id, offset);
                continue;
            }
            let class = type_to_reg_class(&bp.ty);
            let vreg = mf.new_vreg(class);
            ctx.value_map.insert(bp.id, vreg);
        }
        for inst in &block.insts {
            // Allocas already have their backing stack slots from
            // Phase 1, but the SSA value they produce is still a real
            // pointer that later blocks may pass to calls or branch
            // params before the defining block is selected.
// // Reserve the vreg here so forward-dominating alloca uses // are safe even when block vec order puts the use before // the definition. // Void-typed insts (Store, RuntimeCall returning void, // etc.) don't produce a usable value. if matches!(inst.ty, IrType::Void) { continue; } if matches!(inst.ty, IrType::Int(IntWidth::I128)) { let offset = mf.alloc_local(16); ctx.wide_value_slots.insert(inst.id, offset); continue; } let class = type_to_reg_class(&inst.ty); let vreg = mf.new_vreg(class); ctx.value_map.insert(inst.id, vreg); } } // Snapshot just each IR block's params into ctx so // `select_terminator` can look them up while we hold a separate // &mut MachineFunction borrow. We don't need a full BasicBlock // clone — only the param list — so this avoids cloning every // instruction in the function for each terminator we visit. for block in &func.blocks { ctx.block_params.insert(block.id, block.params.clone()); } // Phase 4a.5: identify ICmp/FCmp → Select fusion candidates. // // An ICmp whose boolean result is used only by a single Select in // the same block (with no intervening flag-clobbering instruction) // can be fused: we suppress the CSET and pass the CMP flags // directly into the CSEL. This turns 4 instructions into 2: // // CMP a, b; CSET cond, LE; CMP cond, #0; CSEL dest, tv, fv, NE // → CMP a, b; CSEL dest, tv, fv, LE compute_csel_fusion(func, &mut ctx); // Phase 4b: select instructions and terminators for each block. for block in &func.blocks { let mb_id = ctx.block_map[&block.id]; for inst in &block.insts { select_inst(&mut mf, &mut ctx, mb_id, inst, func); } if let Some(term) = &block.terminator { select_terminator(&mut mf, &mut ctx, mb_id, term, block, func); } } mf } fn select_call_inst( mf: &mut MachineFunction, ctx: &mut ISelCtx, mb: MBlockId, inst: &Inst, func: &Function, ) { let (label, args, runtime_func, indirect_target) = match &inst.kind { InstKind::Call(FuncRef::External(name), args) => { (name.clone(), args.as_slice(), None, None) } InstKind::Call(FuncRef::Internal(idx), args) => { (format!("_func_{}", idx), args.as_slice(), None, None) } InstKind::Call(FuncRef::Indirect(target), args) => { (String::new(), args.as_slice(), None, Some(*target)) } InstKind::RuntimeCall(rf, args) => (String::new(), args.as_slice(), Some(rf), None), _ => unreachable!(), }; let mut abi_state = AbiArgState::default(); let mut arg_locs = Vec::with_capacity(args.len()); for &arg_val in args { let arg_ty = func .value_type(arg_val) .unwrap_or_else(|| panic!("isel: missing type for call arg %{}", arg_val.0)); arg_locs.push((arg_val, classify_abi_arg(&arg_ty, &mut abi_state), arg_ty)); } let label = runtime_func .map(|rf| runtime_func_symbol(rf, &arg_locs)) .unwrap_or(label); if abi_state.stack_offset > 0 { mf.reserve_outgoing_args(abi_state.stack_offset as u32); } let mut pending_reg_arg_moves: Vec<(ArmOpcode, PhysReg, VRegId)> = Vec::new(); for (arg_val, loc, arg_ty) in arg_locs { if matches!(arg_ty, IrType::Int(IntWidth::I128)) { let arg_slot = ctx.lookup_wide_slot(arg_val); match loc { AbiArgLoc::GpPair(reg) => { emit_load_phys_i128_pair( mf, mb, MachineOperand::PhysReg(PhysReg::FP), arg_slot as i64, PhysReg::Gp(reg), PhysReg::Gp(reg + 1), ); } AbiArgLoc::Stack(stack_offset) => { emit_load_phys_i128_pair( mf, mb, MachineOperand::PhysReg(PhysReg::FP), arg_slot as i64, PhysReg::Gp(16), PhysReg::Gp(17), ); emit_store_phys_i128_pair( mf, mb, MachineOperand::PhysReg(PhysReg::Sp), stack_offset, PhysReg::Gp(16), PhysReg::Gp(17), ); } other => { panic!("isel: unexpected ABI loc {:?} for 
outgoing i128 arg", other); } } continue; } let arg_vreg = ctx.lookup_vreg(arg_val); let arg_class = mf.vregs.iter().find(|v| v.id == arg_vreg).map(|v| v.class); match (arg_class, loc) { (Some(RegClass::Fp64), AbiArgLoc::Fp(reg)) => { pending_reg_arg_moves.push((ArmOpcode::FmovReg, PhysReg::Fp(reg), arg_vreg)); } (Some(RegClass::Fp32), AbiArgLoc::Fp32(reg)) => { pending_reg_arg_moves.push((ArmOpcode::FmovReg, PhysReg::Fp32(reg), arg_vreg)); } (Some(RegClass::Gp32), AbiArgLoc::Gp32(reg)) => { pending_reg_arg_moves.push((ArmOpcode::MovReg, PhysReg::Gp32(reg), arg_vreg)); } (Some(RegClass::Gp64), AbiArgLoc::Gp(reg)) => { pending_reg_arg_moves.push((ArmOpcode::MovReg, PhysReg::Gp(reg), arg_vreg)); } (Some(class), AbiArgLoc::Stack(stack_offset)) => { emit_store_stack_arg_from_vreg(mf, mb, arg_vreg, class, &arg_ty, stack_offset); } (Some(class), other) => { panic!( "isel: unexpected ABI loc {:?} for outgoing {:?} arg", other, class ); } (None, _) => { panic!("isel: call arg vreg class missing for %{}", arg_val.0); } } } for (opcode, dst, src) in pending_reg_arg_moves { mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![MachineOperand::PhysReg(dst), MachineOperand::VReg(src)], def: None, }); } if let Some(target) = indirect_target { mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Blr, operands: vec![MachineOperand::VReg(ctx.lookup_vreg(target))], def: None, }); } else { mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Bl, operands: vec![MachineOperand::Extern(label)], def: None, }); } if matches!(inst.ty, IrType::Int(IntWidth::I128)) { let dest_slot = ctx.lookup_wide_slot(inst.id); emit_store_phys_i128_pair( mf, mb, MachineOperand::PhysReg(PhysReg::FP), dest_slot as i64, PhysReg::Gp(0), PhysReg::Gp(1), ); } else if inst.ty != IrType::Void { let class = type_to_reg_class(&inst.ty); let dest = ctx.get_vreg(mf, inst.id, class); let (src_reg, opcode) = match class { RegClass::Fp64 => (PhysReg::Fp(0), ArmOpcode::FmovReg), RegClass::Fp32 => (PhysReg::Fp32(0), ArmOpcode::FmovReg), RegClass::V128 => (PhysReg::Fp(0), ArmOpcode::FmovReg), RegClass::Gp32 => (PhysReg::Gp32(0), ArmOpcode::MovReg), RegClass::Gp64 => (PhysReg::Gp(0), ArmOpcode::MovReg), }; mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![MachineOperand::VReg(dest), MachineOperand::PhysReg(src_reg)], def: Some(dest), }); } else { ctx.get_vreg(mf, inst.id, RegClass::Gp64); } } /// Instruction selection context. struct ISelCtx { /// IR ValueId → MIR VRegId. value_map: HashMap, /// IR wide scalar ValueId → stack slot offset used as its backing store. wide_value_slots: HashMap, /// IR BlockId → MIR MBlockId. block_map: HashMap, /// IR alloca ValueId → stack frame offset. alloca_offsets: HashMap, /// IR BlockId → its block params. Snapshotted before phase 4b /// so terminator selection can read each target's params /// without re-borrowing the function while &mut MachineFunction /// is held. Cloning just the param vec is dramatically cheaper /// than cloning the whole BasicBlock — instructions can be in /// the thousands, params are typically 0-3. block_params: HashMap>, /// ICmp/FCmp ValueIds that are exclusively consumed by a Select in /// the same block with no intervening flag-clobbering instruction. /// For these, we suppress CSET during ICmp lowering and use the /// flags directly from the CMP in the CSEL. select_fused: HashSet, /// For each fused ICmp/FCmp, the ARM condition code to use in the /// CSEL (determined at the time we suppress the CSET). 
    fused_arm_cond: HashMap<ValueId, ArmCond>,
}

impl ISelCtx {
    fn new() -> Self {
        Self {
            value_map: HashMap::new(),
            wide_value_slots: HashMap::new(),
            block_map: HashMap::new(),
            alloca_offsets: HashMap::new(),
            block_params: HashMap::new(),
            select_fused: HashSet::new(),
            fused_arm_cond: HashMap::new(),
        }
    }

    /// Get the vreg for an IR value, or create one if needed.
    /// In debug builds, asserts that an existing mapping has the
    /// same register class as requested — a class mismatch means
    /// Phase 4a (vreg pre-allocation) and Phase 4b (instruction
    /// selection) disagree about a value's type, which would
    /// silently corrupt code.
    fn get_vreg(&mut self, mf: &mut MachineFunction, val: ValueId, class: RegClass) -> VRegId {
        if let Some(&vreg) = self.value_map.get(&val) {
            debug_assert!(
                mf.vregs.iter().find(|v| v.id == vreg).map(|v| v.class) == Some(class),
                "isel: vreg class mismatch for IR value %{} (existing class \
                 differs from requested {:?}) — phase 4a/4b disagreement",
                val.0,
                class,
            );
            return vreg;
        }
        let vreg = mf.new_vreg(class);
        self.value_map.insert(val, vreg);
        vreg
    }

    /// Get the vreg for an IR value, assuming it was already mapped.
    fn lookup_vreg(&self, val: ValueId) -> VRegId {
        *self.value_map.get(&val).unwrap_or_else(|| {
            panic!(
                "isel: unmapped IR value %{} — phase 4a should have allocated \
                 a vreg for every IR value before phase 4b runs. {} values are \
                 currently mapped. This usually means a forward reference, \
                 a missing block param, or a value defined in an unreachable \
                 block.",
                val.0,
                self.value_map.len(),
            )
        })
    }

    /// Get machine block for an IR block.
    fn lookup_block(&self, block: BlockId) -> MBlockId {
        *self.block_map.get(&block).unwrap_or(&MBlockId(0))
    }

    fn lookup_wide_slot(&self, val: ValueId) -> i32 {
        *self.wide_value_slots.get(&val).unwrap_or_else(|| {
            panic!(
                "isel: unmapped wide i128 value %{} — phase 4a should have allocated \
                 a backing slot for every supported i128 SSA value before phase 4b runs",
                val.0
            )
        })
    }
}

/// Select machine instructions for a single IR instruction.
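///
/// Illustrative example (operand names are placeholders, not real output):
/// an IR `%3 = iadd %1, %2` over i32 operands takes the generic
/// `emit_binop` path and becomes a single `add w<dest>, w<a>, w<b>` on
/// fresh Gp32 vregs; the register allocator later substitutes physical
/// W registers.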
fn select_inst( mf: &mut MachineFunction, ctx: &mut ISelCtx, mb: MBlockId, inst: &Inst, func: &Function, ) { if matches!(inst.ty, IrType::Int(IntWidth::I128)) { match &inst.kind { InstKind::ConstInt(val, IntWidth::I128) => { let dest_slot = ctx.lookup_wide_slot(inst.id); emit_const_i128_to_phys_pair(mf, mb, *val, PhysReg::Gp(16), PhysReg::Gp(17)); emit_store_phys_i128_pair( mf, mb, MachineOperand::PhysReg(PhysReg::FP), dest_slot as i64, PhysReg::Gp(16), PhysReg::Gp(17), ); return; } InstKind::Undef(_) => { let dest_slot = ctx.lookup_wide_slot(inst.id); emit_const_i128_to_phys_pair(mf, mb, 0, PhysReg::Gp(16), PhysReg::Gp(17)); emit_store_phys_i128_pair( mf, mb, MachineOperand::PhysReg(PhysReg::FP), dest_slot as i64, PhysReg::Gp(16), PhysReg::Gp(17), ); return; } InstKind::IAdd(a, b) => { emit_i128_binop_via_slots(mf, ctx, mb, I128BinOp::Add, inst.id, *a, *b); return; } InstKind::ISub(a, b) => { emit_i128_binop_via_slots(mf, ctx, mb, I128BinOp::Sub, inst.id, *a, *b); return; } InstKind::INeg(a) => { let dest_slot = ctx.lookup_wide_slot(inst.id); let src_slot = ctx.lookup_wide_slot(*a); emit_load_phys_i128_pair( mf, mb, MachineOperand::PhysReg(PhysReg::FP), src_slot as i64, PhysReg::Gp(16), PhysReg::Gp(17), ); emit_i128_neg(mf, mb, PhysReg::Gp(16), PhysReg::Gp(17)); emit_store_phys_i128_pair( mf, mb, MachineOperand::PhysReg(PhysReg::FP), dest_slot as i64, PhysReg::Gp(16), PhysReg::Gp(17), ); return; } InstKind::Load(addr) => { let dest_slot = ctx.lookup_wide_slot(inst.id); if let Some(&offset) = ctx.alloca_offsets.get(addr) { emit_load_phys_i128_pair( mf, mb, MachineOperand::PhysReg(PhysReg::FP), offset as i64, PhysReg::Gp(16), PhysReg::Gp(17), ); } else { let base = ctx.lookup_vreg(*addr); emit_load_phys_i128_pair( mf, mb, MachineOperand::VReg(base), 0, PhysReg::Gp(16), PhysReg::Gp(17), ); } emit_store_phys_i128_pair( mf, mb, MachineOperand::PhysReg(PhysReg::FP), dest_slot as i64, PhysReg::Gp(16), PhysReg::Gp(17), ); return; } InstKind::Select(cond, tv, fv) => { let arm_cond = if let Some(&fused_cond) = ctx.fused_arm_cond.get(cond) { fused_cond } else { let cond_reg = ctx.lookup_vreg(*cond); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::CmpImm, operands: vec![MachineOperand::VReg(cond_reg), MachineOperand::Imm(0)], def: None, }); ArmCond::Ne }; let dest_slot = ctx.lookup_wide_slot(inst.id); let true_slot = ctx.lookup_wide_slot(*tv); let false_slot = ctx.lookup_wide_slot(*fv); emit_load_phys_i128_pair( mf, mb, MachineOperand::PhysReg(PhysReg::FP), true_slot as i64, PhysReg::Gp(16), PhysReg::Gp(17), ); emit_load_phys_i128_pair( mf, mb, MachineOperand::PhysReg(PhysReg::FP), false_slot as i64, PhysReg::Gp(8), PhysReg::Gp(9), ); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::CselReg, operands: vec![ MachineOperand::PhysReg(PhysReg::Gp(16)), MachineOperand::PhysReg(PhysReg::Gp(16)), MachineOperand::PhysReg(PhysReg::Gp(8)), MachineOperand::Cond(arm_cond), ], def: None, }); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::CselReg, operands: vec![ MachineOperand::PhysReg(PhysReg::Gp(17)), MachineOperand::PhysReg(PhysReg::Gp(17)), MachineOperand::PhysReg(PhysReg::Gp(9)), MachineOperand::Cond(arm_cond), ], def: None, }); emit_store_phys_i128_pair( mf, mb, MachineOperand::PhysReg(PhysReg::FP), dest_slot as i64, PhysReg::Gp(16), PhysReg::Gp(17), ); return; } InstKind::Call(..) 
=> { select_call_inst(mf, ctx, mb, inst, func); return; } _ => { panic!( "isel: unsupported i128 instruction reached backend despite gating: {:?}", inst.kind ); } } } match &inst.kind { // ---- Constants ---- InstKind::ConstInt(val, width) => { let class = int_width_class(width); let dest = ctx.get_vreg(mf, inst.id, class); emit_const_int(mf, mb, dest, *val, *width); } InstKind::ConstFloat(val, width) => { let class = float_width_class(width); let dest = ctx.get_vreg(mf, inst.id, class); let cp_idx = match width { FloatWidth::F32 => mf.add_const(ConstPoolEntry::F32(*val as f32)), FloatWidth::F64 => mf.add_const(ConstPoolEntry::F64(*val)), }; // ADRP + LDR from constant pool. mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::AdrpLdr, operands: vec![ MachineOperand::VReg(dest), MachineOperand::ConstPool(cp_idx), ], def: Some(dest), }); } InstKind::ConstBool(val) => { let dest = ctx.get_vreg(mf, inst.id, RegClass::Gp32); emit_const_int(mf, mb, dest, if *val { 1 } else { 0 }, IntWidth::I32); } InstKind::ConstString(bytes) => { let dest = ctx.get_vreg(mf, inst.id, RegClass::Gp64); let cp_idx = mf.add_const(ConstPoolEntry::Bytes(bytes.clone())); // Use ADRP+ADD to compute the address (not ADRP+LDR which loads the value). mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::AdrpAdd, operands: vec![ MachineOperand::VReg(dest), MachineOperand::ConstPool(cp_idx), ], def: Some(dest), }); } InstKind::Undef(_) => { // Emit a deterministic zero instead of leaving the vreg // undefined. A truly undefined vreg lets the register // allocator hand us whatever physical register is free, // and that register's stale contents leak into reads — // which makes optimization-level diffs nondeterministic // and turns "undef ⇒ anything" into "undef ⇒ whatever // happened to be in x14 at this point in the program." // // mem2reg synthesizes Undef as the initial value of a // promoted slot before any store. The Fortran semantics // for reading uninitialized storage are undefined, but // a hard zero is at least reproducible across opt // levels and friendly to debuggers. let class = type_to_reg_class(&inst.ty); let dest = ctx.get_vreg(mf, inst.id, class); match class { RegClass::Gp32 => { mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::MovReg, operands: vec![ MachineOperand::VReg(dest), MachineOperand::PhysReg(PhysReg::Wzr), ], def: Some(dest), }); } RegClass::Gp64 => { mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::MovReg, operands: vec![ MachineOperand::VReg(dest), MachineOperand::PhysReg(PhysReg::Xzr), ], def: Some(dest), }); } RegClass::Fp32 => { let cp_idx = mf.add_const(ConstPoolEntry::F32(0.0)); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::AdrpLdr, operands: vec![ MachineOperand::VReg(dest), MachineOperand::ConstPool(cp_idx), ], def: Some(dest), }); } RegClass::Fp64 => { let cp_idx = mf.add_const(ConstPoolEntry::F64(0.0)); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::AdrpLdr, operands: vec![ MachineOperand::VReg(dest), MachineOperand::ConstPool(cp_idx), ], def: Some(dest), }); } RegClass::V128 => { // Sprint 12 Stage 1 reserves the type/instr; no // path produces a V128 Undef yet. Bail rather // than emit a half-baked NEON zero — when the // vectorizer arrives it will have its own // VBroadcast(const 0) lowering. 
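                        // (If this ever needs a real lowering, the likely
                        // shape — an unverified sketch, not current output —
                        // is a single `movi v<d>.16b, #0`, which zeroes all
                        // 128 bits regardless of lane interpretation.)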
unreachable!("V128 Undef emission not implemented (Sprint 12 Stage 4 work)"); } } } // ---- Integer arithmetic ---- InstKind::IAdd(a, b) => emit_binop(mf, ctx, mb, inst, ArmOpcode::AddReg, *a, *b), InstKind::ISub(a, b) => emit_binop(mf, ctx, mb, inst, ArmOpcode::SubReg, *a, *b), InstKind::IMul(a, b) => emit_binop(mf, ctx, mb, inst, ArmOpcode::Mul, *a, *b), InstKind::IDiv(a, b) => emit_binop(mf, ctx, mb, inst, ArmOpcode::Sdiv, *a, *b), InstKind::IMod(a, b) => { // imod = a - (a / b) * b → SDIV + MSUB let class = type_to_reg_class(&inst.ty); let dest = ctx.get_vreg(mf, inst.id, class); let va = ctx.lookup_vreg(*a); let vb = ctx.lookup_vreg(*b); let tmp = mf.new_vreg(class); // tmp = sdiv a, b mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Sdiv, operands: vec![ MachineOperand::VReg(tmp), MachineOperand::VReg(va), MachineOperand::VReg(vb), ], def: Some(tmp), }); // dest = msub tmp, vb, va → va - tmp * vb = a - (a/b)*b mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Msub, operands: vec![ MachineOperand::VReg(dest), MachineOperand::VReg(tmp), MachineOperand::VReg(vb), MachineOperand::VReg(va), ], def: Some(dest), }); } InstKind::INeg(a) => { let class = type_to_reg_class(&inst.ty); let dest = ctx.get_vreg(mf, inst.id, class); let va = ctx.lookup_vreg(*a); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Neg, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(va)], def: Some(dest), }); } // ---- Float arithmetic ---- InstKind::FAdd(a, b) => emit_float_binop( mf, ctx, mb, inst, &inst.ty, *a, *b, ArmOpcode::FaddS, ArmOpcode::FaddD, ), InstKind::FSub(a, b) => emit_float_binop( mf, ctx, mb, inst, &inst.ty, *a, *b, ArmOpcode::FsubS, ArmOpcode::FsubD, ), InstKind::FMul(a, b) => emit_float_binop( mf, ctx, mb, inst, &inst.ty, *a, *b, ArmOpcode::FmulS, ArmOpcode::FmulD, ), InstKind::FDiv(a, b) => emit_float_binop( mf, ctx, mb, inst, &inst.ty, *a, *b, ArmOpcode::FdivS, ArmOpcode::FdivD, ), InstKind::FNeg(a) => { let (class, opcode) = match &inst.ty { IrType::Float(FloatWidth::F32) => (RegClass::Fp32, ArmOpcode::FnegS), _ => (RegClass::Fp64, ArmOpcode::FnegD), }; let dest = ctx.get_vreg(mf, inst.id, class); let va = ctx.lookup_vreg(*a); mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(va)], def: Some(dest), }); } InstKind::FPow(a, b) => { let class = type_to_reg_class(&inst.ty); let dest = ctx.get_vreg(mf, inst.id, class); let va = ctx.lookup_vreg(*a); let vb = ctx.lookup_vreg(*b); let (func_name, arg0, arg1, ret) = match &inst.ty { IrType::Float(FloatWidth::F32) => { ("powf", PhysReg::Fp32(0), PhysReg::Fp32(1), PhysReg::Fp32(0)) } _ => ("pow", PhysReg::Fp(0), PhysReg::Fp(1), PhysReg::Fp(0)), }; mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::FmovReg, operands: vec![MachineOperand::PhysReg(arg0), MachineOperand::VReg(va)], def: None, }); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::FmovReg, operands: vec![MachineOperand::PhysReg(arg1), MachineOperand::VReg(vb)], def: None, }); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Bl, operands: vec![MachineOperand::Extern(func_name.into())], def: None, }); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::FmovReg, operands: vec![MachineOperand::VReg(dest), MachineOperand::PhysReg(ret)], def: Some(dest), }); } // ---- Comparisons ---- InstKind::ICmp(op, a, b) => { if matches!(func.value_type(*a), Some(IrType::Int(IntWidth::I128))) || matches!(func.value_type(*b), Some(IrType::Int(IntWidth::I128))) 
{ let dest = ctx.get_vreg(mf, inst.id, RegClass::Gp32); let lhs_slot = ctx.lookup_wide_slot(*a); let rhs_slot = ctx.lookup_wide_slot(*b); emit_load_phys_i128_pair( mf, mb, MachineOperand::PhysReg(PhysReg::FP), lhs_slot as i64, PhysReg::Gp(16), PhysReg::Gp(17), ); emit_load_phys_i128_pair( mf, mb, MachineOperand::PhysReg(PhysReg::FP), rhs_slot as i64, PhysReg::Gp(8), PhysReg::Gp(9), ); match op { CmpOp::Eq | CmpOp::Ne => { mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::CmpReg, operands: vec![ MachineOperand::PhysReg(PhysReg::Gp(16)), MachineOperand::PhysReg(PhysReg::Gp(8)), ], def: None, }); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Cset, operands: vec![ MachineOperand::PhysReg(PhysReg::Gp32(10)), MachineOperand::Cond(cmp_to_arm_cond(*op)), ], def: None, }); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::CmpReg, operands: vec![ MachineOperand::PhysReg(PhysReg::Gp(17)), MachineOperand::PhysReg(PhysReg::Gp(9)), ], def: None, }); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Cset, operands: vec![ MachineOperand::PhysReg(PhysReg::Gp32(11)), MachineOperand::Cond(cmp_to_arm_cond(*op)), ], def: None, }); let combine = match op { CmpOp::Eq => ArmOpcode::AndReg, CmpOp::Ne => ArmOpcode::OrrReg, _ => unreachable!(), }; mf.block_mut(mb).insts.push(MachineInst { opcode: combine, operands: vec![ MachineOperand::PhysReg(PhysReg::Gp32(10)), MachineOperand::PhysReg(PhysReg::Gp32(10)), MachineOperand::PhysReg(PhysReg::Gp32(11)), ], def: None, }); } CmpOp::Lt | CmpOp::Le | CmpOp::Gt | CmpOp::Ge => { let (hi_cond, lo_cond) = i128_ordered_conds(*op); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::CmpReg, operands: vec![ MachineOperand::PhysReg(PhysReg::Gp(17)), MachineOperand::PhysReg(PhysReg::Gp(9)), ], def: None, }); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Cset, operands: vec![ MachineOperand::PhysReg(PhysReg::Gp32(10)), MachineOperand::Cond(hi_cond), ], def: None, }); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Cset, operands: vec![ MachineOperand::PhysReg(PhysReg::Gp32(11)), MachineOperand::Cond(ArmCond::Eq), ], def: None, }); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::CmpReg, operands: vec![ MachineOperand::PhysReg(PhysReg::Gp(16)), MachineOperand::PhysReg(PhysReg::Gp(8)), ], def: None, }); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Cset, operands: vec![ MachineOperand::PhysReg(PhysReg::Gp32(8)), MachineOperand::Cond(lo_cond), ], def: None, }); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::AndReg, operands: vec![ MachineOperand::PhysReg(PhysReg::Gp32(11)), MachineOperand::PhysReg(PhysReg::Gp32(11)), MachineOperand::PhysReg(PhysReg::Gp32(8)), ], def: None, }); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::OrrReg, operands: vec![ MachineOperand::PhysReg(PhysReg::Gp32(10)), MachineOperand::PhysReg(PhysReg::Gp32(10)), MachineOperand::PhysReg(PhysReg::Gp32(11)), ], def: None, }); } } mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::MovReg, operands: vec![ MachineOperand::VReg(dest), MachineOperand::PhysReg(PhysReg::Gp32(10)), ], def: Some(dest), }); return; } let dest = ctx.get_vreg(mf, inst.id, RegClass::Gp32); let va = icmp_operand_vreg(mf, ctx, mb, func, *a, *b); let vb = icmp_operand_vreg(mf, ctx, mb, func, *b, *a); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::CmpReg, operands: vec![MachineOperand::VReg(va), MachineOperand::VReg(vb)], def: None, }); // If this ICmp feeds exclusively into a Select 
(detected in the // pre-pass), suppress CSET. The Select will use the flags directly. if !ctx.select_fused.contains(&inst.id) { mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Cset, operands: vec![ MachineOperand::VReg(dest), MachineOperand::Cond(cmp_to_arm_cond(*op)), ], def: Some(dest), }); } } InstKind::FCmp(op, a, b) => { let dest = ctx.get_vreg(mf, inst.id, RegClass::Gp32); let va = ctx.lookup_vreg(*a); let vb = ctx.lookup_vreg(*b); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::FCmpReg, operands: vec![MachineOperand::VReg(va), MachineOperand::VReg(vb)], def: None, }); if !ctx.select_fused.contains(&inst.id) { mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::FCset, operands: vec![ MachineOperand::VReg(dest), MachineOperand::Cond(fcmp_to_arm_cond(*op)), ], def: Some(dest), }); } } // ---- Logic ---- InstKind::And(a, b) => emit_binop(mf, ctx, mb, inst, ArmOpcode::AndReg, *a, *b), InstKind::Or(a, b) => emit_binop(mf, ctx, mb, inst, ArmOpcode::OrrReg, *a, *b), InstKind::Not(a) => { // Logical NOT: CMP src, #0; CSET dest, EQ // If src == 0 (false), EQ is true → dest = 1 (true). // If src != 0 (true), EQ is false → dest = 0 (false). // This correctly handles any truthy value, not just 0/1. let dest = ctx.get_vreg(mf, inst.id, RegClass::Gp32); let va = ctx.lookup_vreg(*a); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::CmpImm, operands: vec![MachineOperand::VReg(va), MachineOperand::Imm(0)], def: None, }); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Cset, operands: vec![ MachineOperand::VReg(dest), MachineOperand::Cond(ArmCond::Eq), ], def: Some(dest), }); } // ---- Select (CSEL) ---- // // Fast path: if the condition was produced by an ICmp/FCmp in the // same block with no other users, the pre-pass marked it as fused. // We already emitted `CMP a, b` (no CSET), so the flags are live. // Use them directly: `CSEL dest, tv, fv, `. // // Slow path (unfused): the condition is an arbitrary boolean in a // register. Materialize with `CMP cond, #0; CSEL dest, tv, fv, NE`. InstKind::Select(cond, tv, fv) => { let class = type_to_reg_class(&inst.ty); let dest = ctx.get_vreg(mf, inst.id, class); let true_reg = coerce_select_operand_vreg(mf, ctx, mb, func, *tv, &inst.ty); let false_reg = coerce_select_operand_vreg(mf, ctx, mb, func, *fv, &inst.ty); let arm_cond = if let Some(&fused_cond) = ctx.fused_arm_cond.get(cond) { // Flags already set by the fused CMP — no extra compare needed. fused_cond } else { // Unfused: compare the boolean register against 0. 
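                // (Illustrative: this path materializes `cmp w<cond>, #0`
                //  so the CSEL below can key off NE, i.e. "condition holds".)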
let cond_reg = ctx.lookup_vreg(*cond); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::CmpImm, operands: vec![MachineOperand::VReg(cond_reg), MachineOperand::Imm(0)], def: None, }); ArmCond::Ne }; let opcode = if class == RegClass::Fp32 || class == RegClass::Fp64 { ArmOpcode::FcselReg } else { ArmOpcode::CselReg }; mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![ MachineOperand::VReg(dest), MachineOperand::VReg(true_reg), MachineOperand::VReg(false_reg), MachineOperand::Cond(arm_cond), ], def: Some(dest), }); } // ---- Float: fabs, fsqrt ---- InstKind::FAbs(a) => { let src = ctx.lookup_vreg(*a); let class = type_to_reg_class(&inst.ty); let dest = ctx.get_vreg(mf, inst.id, class); let opcode = if class == RegClass::Fp64 { ArmOpcode::FabsD } else { ArmOpcode::FabsS }; mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)], def: Some(dest), }); } InstKind::FSqrt(a) => { let src = ctx.lookup_vreg(*a); let class = type_to_reg_class(&inst.ty); let dest = ctx.get_vreg(mf, inst.id, class); let opcode = if class == RegClass::Fp64 { ArmOpcode::FsqrtD } else { ArmOpcode::FsqrtS }; mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)], def: Some(dest), }); } // ---- Bitwise ---- InstKind::BitAnd(a, b) => emit_binop(mf, ctx, mb, inst, ArmOpcode::AndReg, *a, *b), InstKind::BitOr(a, b) => emit_binop(mf, ctx, mb, inst, ArmOpcode::OrrReg, *a, *b), InstKind::BitXor(a, b) => emit_binop(mf, ctx, mb, inst, ArmOpcode::EorReg, *a, *b), InstKind::BitNot(a) => { let src = ctx.lookup_vreg(*a); let class = type_to_reg_class(&inst.ty); let dest = ctx.get_vreg(mf, inst.id, class); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Mvn, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)], def: Some(dest), }); } InstKind::Shl(a, b) => emit_binop(mf, ctx, mb, inst, ArmOpcode::LslReg, *a, *b), InstKind::LShr(a, b) => emit_binop(mf, ctx, mb, inst, ArmOpcode::LsrReg, *a, *b), InstKind::AShr(a, b) => emit_binop(mf, ctx, mb, inst, ArmOpcode::AsrReg, *a, *b), InstKind::CountLeadingZeros(a) => { let src = ctx.lookup_vreg(*a); let class = type_to_reg_class(&inst.ty); let dest = ctx.get_vreg(mf, inst.id, class); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Clz, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)], def: Some(dest), }); } InstKind::CountTrailingZeros(a) => { // CTZ = CLZ(RBIT(x)) let src = ctx.lookup_vreg(*a); let class = type_to_reg_class(&inst.ty); let tmp = mf.new_vreg(class); let dest = ctx.get_vreg(mf, inst.id, class); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Rbit, operands: vec![MachineOperand::VReg(tmp), MachineOperand::VReg(src)], def: Some(tmp), }); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Clz, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(tmp)], def: Some(dest), }); } InstKind::PopCount(a) => { // ARM64 popcount: FMOV Vd.8B, Xn; CNT Vd.8B, Vd.8B; ADDV Bd, Vd.8B; FMOV Wd, Sd // For simplicity, emit a runtime call. let src = ctx.lookup_vreg(*a); let class = type_to_reg_class(&inst.ty); let dest = ctx.get_vreg(mf, inst.id, class); // Placeholder: use CLZ-based Hamming weight or runtime call. // For now, move src to dest (will be replaced with proper popcount later). 
mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::MovReg, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)], def: Some(dest), }); } // ---- Conversions ---- InstKind::IntToFloat(a, fw) => { let src = ctx.lookup_vreg(*a); let src_class = mf.vregs.iter().find(|v| v.id == src).map(|v| v.class); let is_64bit_src = matches!(src_class, Some(RegClass::Gp64)); let (class, opcode) = match (fw, is_64bit_src) { (FloatWidth::F32, false) => (RegClass::Fp32, ArmOpcode::ScvtfSW), (FloatWidth::F32, true) => (RegClass::Fp32, ArmOpcode::ScvtfSX), (FloatWidth::F64, false) => (RegClass::Fp64, ArmOpcode::ScvtfDW), (FloatWidth::F64, true) => (RegClass::Fp64, ArmOpcode::ScvtfDX), }; let dest = ctx.get_vreg(mf, inst.id, class); mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)], def: Some(dest), }); } InstKind::FloatToInt(a, iw) => { let src = ctx.lookup_vreg(*a); let src_class = mf.vregs.iter().find(|v| v.id == src).map(|v| v.class); let is_f64_src = matches!(src_class, Some(RegClass::Fp64)); let is_64bit_dest = matches!(iw, IntWidth::I64); let class = int_width_class(iw); let opcode = match (is_64bit_dest, is_f64_src) { (false, false) => ArmOpcode::FcvtzsWS, (false, true) => ArmOpcode::FcvtzsWD, (true, false) => ArmOpcode::FcvtzsXS, (true, true) => ArmOpcode::FcvtzsXD, }; let dest = ctx.get_vreg(mf, inst.id, class); mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)], def: Some(dest), }); } InstKind::FloatExtend(a, _) => { let src = ctx.lookup_vreg(*a); let dest = ctx.get_vreg(mf, inst.id, RegClass::Fp64); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::FcvtDS, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)], def: Some(dest), }); } InstKind::FloatTrunc(a, _) => { let src = ctx.lookup_vreg(*a); let dest = ctx.get_vreg(mf, inst.id, RegClass::Fp32); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::FcvtSD, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)], def: Some(dest), }); } // ---- Memory ---- InstKind::GlobalAddr(name) => { // Materialize the address of a module-level global into // a Gp64 vreg via ADRP+ADD against `_globalname`. Loads // and stores then operate on this pointer the same way // they operate on an alloca address. let dest = ctx.get_vreg(mf, inst.id, RegClass::Gp64); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::AdrpAdd, operands: vec![ MachineOperand::VReg(dest), MachineOperand::GlobalLabel(name.clone()), ], def: Some(dest), }); } InstKind::Alloca(_) => { // Alloca is handled in Phase 1 (stack slot allocation). // The "address" is a frame slot offset. Map the ValueId to the offset. if let Some(&offset) = ctx.alloca_offsets.get(&inst.id) { let dest = ctx.get_vreg(mf, inst.id, RegClass::Gp64); // Materialize address: SUB dest, FP, #abs(offset) // Offsets are negative from FP, so we subtract the absolute value. let abs_offset = (-offset) as i64; mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::SubImm, operands: vec![ MachineOperand::VReg(dest), MachineOperand::PhysReg(PhysReg::FP), MachineOperand::Imm(abs_offset), ], def: Some(dest), }); } } InstKind::Load(addr) => { // Audit CRITICAL-2: dispatch on the IR result type so the // load opcode width matches the value, not the pointer. // Previously every integer load used `ldr w_, [_]` regardless // of width, silently reading 4 bytes for an i8 load. 
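            // Concrete instance (illustrative): an i8 load now selects a
            // byte-width opcode (ldrb/ldrsb) through `load_opcode_for`,
            // while an i64 load selects `ldr x_, [...]` — the width follows
            // the result type, not the pointer.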
let class = type_to_reg_class(&inst.ty); let dest = ctx.get_vreg(mf, inst.id, class); let opcode = load_opcode_for(&inst.ty, class); let (base_op, offset_op) = narrow_load_store_addr(ctx, *addr); mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![MachineOperand::VReg(dest), base_op, offset_op], def: Some(dest), }); } InstKind::Store(val, addr) => { if matches!(func.value_type(*val), Some(IrType::Int(IntWidth::I128))) { let src_slot = ctx.lookup_wide_slot(*val); emit_load_phys_i128_pair( mf, mb, MachineOperand::PhysReg(PhysReg::FP), src_slot as i64, PhysReg::Gp(16), PhysReg::Gp(17), ); if let Some(&offset) = ctx.alloca_offsets.get(addr) { emit_store_phys_i128_pair( mf, mb, MachineOperand::PhysReg(PhysReg::FP), offset as i64, PhysReg::Gp(16), PhysReg::Gp(17), ); } else { let base = ctx.lookup_vreg(*addr); emit_store_phys_i128_pair( mf, mb, MachineOperand::VReg(base), 0, PhysReg::Gp(16), PhysReg::Gp(17), ); } return; } let val_vreg = ctx.lookup_vreg(*val); // Audit CRITICAL-2: dispatch on the *value*'s declared IR // type, not the pointer's pointee — byte-level GEPs into // derived types and array constructors reuse `Ptr` as a // generic offset cursor, so dispatching by the pointee // would silently truncate non-byte stores. let val_ty = func.value_type(*val); let val_class = mf.vregs .iter() .find(|v| v.id == val_vreg) .map(|v| v.class) .unwrap_or(RegClass::Gp64); let opcode = store_opcode_for(val_ty.as_ref(), val_class); let (base_op, offset_op) = narrow_load_store_addr(ctx, *addr); mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![MachineOperand::VReg(val_vreg), base_op, offset_op], def: None, }); } InstKind::GetElementPtr(base, indices) => { // GEP: base + index * elem_size let dest = ctx.get_vreg(mf, inst.id, RegClass::Gp64); let base_src = ctx.lookup_vreg(*base); let base_vreg = if mf.vregs.iter().find(|v| v.id == base_src).map(|v| v.class) != Some(RegClass::Gp64) { let widened = mf.new_vreg(RegClass::Gp64); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::MovReg, operands: vec![ MachineOperand::VReg(widened), MachineOperand::VReg(base_src), ], def: Some(widened), }); widened } else { base_src }; // Determine element size from the GEP result type (Ptr). // Bool occupies 1 byte both in SSA and in `alloca [Bool x N]` // storage; the prior 4-byte override here desynced GEP byte // strides from `alloca` byte strides, so `arr(i) = .true.` for // a stack `logical :: arr(N)` wrote 3 bytes past the slot. 
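            // Worked example (illustrative): a stack `logical :: arr(4)`
            // allocates 4 bytes; with elem_size = 1 the GEP offsets for
            // arr(1)..arr(4) are bytes 0..=3, matching the alloca layout
            // instead of striding 4 bytes per element past the slot.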
let elem_size = match &inst.ty { IrType::Ptr(inner) => match inner.as_ref() { IrType::Struct(_) => alloca_size(inner) as i64, _ => inner.size_bytes() as i64, }, _ => 4, // fallback }; if let Some(idx) = indices.first() { let idx_src = ctx.lookup_vreg(*idx); let idx_vreg = if mf.vregs.iter().find(|v| v.id == idx_src).map(|v| v.class) == Some(RegClass::Gp64) { idx_src } else { let widened = mf.new_vreg(RegClass::Gp64); let opcode = if matches!(func.value_type(*idx), Some(IrType::Bool)) { ArmOpcode::MovReg } else { ArmOpcode::Sxtw }; mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![ MachineOperand::VReg(widened), MachineOperand::VReg(idx_src), ], def: Some(widened), }); widened }; let tmp = mf.new_vreg(RegClass::Gp64); emit_const_int(mf, mb, tmp, elem_size as i128, IntWidth::I64); let scaled = mf.new_vreg(RegClass::Gp64); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Mul, operands: vec![ MachineOperand::VReg(scaled), MachineOperand::VReg(idx_vreg), MachineOperand::VReg(tmp), ], def: Some(scaled), }); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::AddReg, operands: vec![ MachineOperand::VReg(dest), MachineOperand::VReg(base_vreg), MachineOperand::VReg(scaled), ], def: Some(dest), }); } else { // No indices — just copy the base. mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::MovReg, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(base_vreg)], def: Some(dest), }); } } // ---- Calls ---- InstKind::Call(..) | InstKind::RuntimeCall(..) => { select_call_inst(mf, ctx, mb, inst, func); } // ---- Integer extend/truncate ---- InstKind::IntExtend(a, _target_width, signed) => { let src = ctx.lookup_vreg(*a); // Pick the opcode based on the SOURCE width, not the // target. ARM64 has distinct SXTB/SXTH/SXTW instructions // for 8/16/32-bit sources; using SXTW on anything other // than a 32-bit source (or with a non-X dest) yields // "invalid operand for instruction" at the assembler. let src_ty = func.value_type(*a); let src_width = match src_ty.as_ref() { Some(IrType::Int(IntWidth::I8)) => 8, Some(IrType::Int(IntWidth::I16)) => 16, Some(IrType::Int(IntWidth::I32)) | Some(IrType::Bool) => 32, Some(IrType::Int(IntWidth::I64)) => 64, _ => 32, // conservative default }; let dest_width = match &inst.ty { IrType::Int(IntWidth::I8) | IrType::Int(IntWidth::I16) | IrType::Int(IntWidth::I32) | IrType::Bool => 32, IrType::Int(IntWidth::I64) => 64, _ => 32, }; // Dest register class follows the declared target // bit-width, with one exception: SXTW requires an // X-register destination, so promote to Gp64 when // source is 32 AND target is 64. let dest_class = if dest_width == 64 { RegClass::Gp64 } else { RegClass::Gp32 }; let dest = ctx.get_vreg(mf, inst.id, dest_class); if !*signed { // Zero-extend: MOV (ARM64 implicitly zero-extends W→X). mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::MovReg, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)], def: Some(dest), }); } else if src_width >= dest_width { // Same-width or wider source (bogus from lowering's // perspective but observed in practice when a // function-result intrinsic mis-resolves). Emit MOV // rather than an illegal SXTW Wd, Wn. 
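                // (Illustrative: a same-width "extend" such as i32→i32 just
                //  becomes `mov w<dest>, w<src>` here; genuine widenings pick
                //  sxtb/sxth/sxtw in the match below.)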
mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::MovReg, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)], def: Some(dest), }); } else { let opcode = match src_width { 8 => ArmOpcode::Sxtb, 16 => ArmOpcode::Sxth, 32 => ArmOpcode::Sxtw, _ => ArmOpcode::MovReg, // unreachable given the bool above }; mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)], def: Some(dest), }); } } InstKind::IntTrunc(a, _) => { let src = ctx.lookup_vreg(*a); let class = type_to_reg_class(&inst.ty); let dest = ctx.get_vreg(mf, inst.id, class); // Truncate: just MOV — the 32-bit register naturally truncates. mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::MovReg, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)], def: Some(dest), }); } InstKind::PtrToInt(a) => { // Pointer is already an i64 in a GP register — just mov. let src = ctx.lookup_vreg(*a); let class = type_to_reg_class(&inst.ty); let dest = ctx.get_vreg(mf, inst.id, class); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::MovReg, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)], def: Some(dest), }); } InstKind::IntToPtr(a, _) => { // Integer already in a GP register — treat as pointer via mov. let src = ctx.lookup_vreg(*a); let dest = ctx.get_vreg(mf, inst.id, RegClass::Gp64); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::MovReg, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)], def: Some(dest), }); } // ---- SIMD vector ops (Sprint 12 Stage 2 isel hookup) ---- // // The vectorizer (Stage 4) is what will start producing // these. Each arm picks a NEON ArmOpcode based on the result // vector's lane shape. Mixed-shape ops (e.g. integer 8×i16 // narrow lanes) aren't selected here — Stage 4 will only // emit the four shapes covered by `VShape`. InstKind::VAdd(a, b) => emit_vbinop(mf, ctx, mb, inst, *a, *b, |s| match s { VShape::V4S => ArmOpcode::AddV4S, VShape::V2D => ArmOpcode::AddV2D, VShape::F4S => ArmOpcode::FaddV4S, VShape::F2D => ArmOpcode::FaddV2D, }), InstKind::VSub(a, b) => emit_vbinop(mf, ctx, mb, inst, *a, *b, |s| match s { VShape::V4S => ArmOpcode::SubV4S, VShape::V2D => ArmOpcode::SubV2D, VShape::F4S => ArmOpcode::FsubV4S, VShape::F2D => ArmOpcode::FsubV2D, }), InstKind::VMul(a, b) => emit_vbinop(mf, ctx, mb, inst, *a, *b, |s| match s { VShape::V4S => ArmOpcode::MulV4S, // NEON has no integer 2D mul — Stage 4 should not request // it; if it does we fall through to a placeholder. VShape::V2D => ArmOpcode::Nop, VShape::F4S => ArmOpcode::FmulV4S, VShape::F2D => ArmOpcode::FmulV2D, }), InstKind::VDiv(a, b) => emit_vbinop(mf, ctx, mb, inst, *a, *b, |s| match s { // No integer NEON divide — emit a placeholder; the // vectorizer should refuse to pick V128 lanes for VDiv // on integer types. Float forms exist. VShape::V4S | VShape::V2D => ArmOpcode::Nop, VShape::F4S => ArmOpcode::FdivV4S, VShape::F2D => ArmOpcode::FdivV2D, }), InstKind::VNeg(a) => emit_vunop(mf, ctx, mb, inst, *a, |s| match s { VShape::V4S => ArmOpcode::NegV4S, VShape::V2D => ArmOpcode::NegV2D, VShape::F4S => ArmOpcode::FnegV4S, VShape::F2D => ArmOpcode::FnegV2D, }), InstKind::VAbs(a) => emit_vunop(mf, ctx, mb, inst, *a, |s| match s { VShape::F4S => ArmOpcode::FabsV4S, VShape::F2D => ArmOpcode::FabsV2D, // NEON `abs` exists for integer too but the four-shape // alias isn't generated yet; placeholder. 
VShape::V4S | VShape::V2D => ArmOpcode::Nop, }), InstKind::VSqrt(a) => emit_vunop(mf, ctx, mb, inst, *a, |s| match s { VShape::F4S => ArmOpcode::FsqrtV4S, VShape::F2D => ArmOpcode::FsqrtV2D, // sqrt is float-only. VShape::V4S | VShape::V2D => ArmOpcode::Nop, }), InstKind::VFma(a, b, c) => { // FMLA is dest += a*b. Conventional 3-operand call // assumes dest is a fresh vreg — emit a copy-from-c // followed by FMLA. Stage 4 should fold the copy when it // tracks SSA destinations more carefully. let shape = match VShape::from_ir(&inst.ty) { Some(s) if s.is_float() => s, _ => { // unsupported shape — placeholder let dest = ctx.get_vreg(mf, inst.id, RegClass::V128); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Nop, operands: vec![], def: Some(dest), }); return; } }; let opcode = match shape { VShape::F4S => ArmOpcode::FmlaV4S, VShape::F2D => ArmOpcode::FmlaV2D, _ => unreachable!(), }; let va = ctx.lookup_vreg(*a); let vb = ctx.lookup_vreg(*b); let vc = ctx.lookup_vreg(*c); let dest = ctx.get_vreg(mf, inst.id, RegClass::V128); // dest = c (init accumulator). Must use Mov16B (mov.16b) // for V128 — fmov d, d truncates to 64 bits and silently // drops the upper lanes. mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Mov16B, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(vc)], def: Some(dest), }); // dest += a * b mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![ MachineOperand::VReg(dest), MachineOperand::VReg(va), MachineOperand::VReg(vb), ], def: Some(dest), }); } InstKind::VSelect(mask, t, f) => { // BSL is destructive: bsl Vd.16b, Vn.16b, Vm.16b → for // each bit, if Vd then Vn else Vm. So we copy the mask // into the dest first (mov.16b), then bsl with t/f. let vmask = ctx.lookup_vreg(*mask); let vt = ctx.lookup_vreg(*t); let vf = ctx.lookup_vreg(*f); let dest = ctx.get_vreg(mf, inst.id, RegClass::V128); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Mov16B, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(vmask)], def: Some(dest), }); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::BslV16B, operands: vec![ MachineOperand::VReg(dest), MachineOperand::VReg(vt), MachineOperand::VReg(vf), ], def: Some(dest), }); } InstKind::VLoad(addr) => { let dest = ctx.get_vreg(mf, inst.id, RegClass::V128); let base = ctx.lookup_vreg(*addr); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::LdrQ, operands: vec![ MachineOperand::VReg(dest), MachineOperand::VReg(base), MachineOperand::Imm(0), ], def: Some(dest), }); } InstKind::VStore(val, addr) => { let v = ctx.lookup_vreg(*val); let base = ctx.lookup_vreg(*addr); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::StrQ, operands: vec![ MachineOperand::VReg(v), MachineOperand::VReg(base), MachineOperand::Imm(0), ], def: None, }); } InstKind::VFCmp(op, a, b) => { // NEON fcmp produces an all-ones / all-zeros mask per lane. // Eq/Ge/Gt are direct; Ne/Le/Lt swap operands or invert. // For Lt: fcmgt swapped operands. For Le: fcmge swapped. // Ne is not a single-instruction in NEON; we don't handle // it yet (vectorizer doesn't emit Ne). 
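            // Example (illustrative, F4S shape): `vfcmp lt %a, %b` selects
            // `fcmgt v<m>.4s, v<b>.4s, v<a>.4s` — the Gt opcode with the
            // operands swapped, per the table below.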
let dest = ctx.get_vreg(mf, inst.id, RegClass::V128); let va = ctx.lookup_vreg(*a); let vb = ctx.lookup_vreg(*b); let shape = VShape::from_ir(&inst.ty); let (opcode, swap) = match (shape, op) { (Some(VShape::F4S), CmpOp::Gt) => (ArmOpcode::FcmgtV4S, false), (Some(VShape::F2D), CmpOp::Gt) => (ArmOpcode::FcmgtV2D, false), (Some(VShape::F4S), CmpOp::Ge) => (ArmOpcode::FcmgeV4S, false), (Some(VShape::F2D), CmpOp::Ge) => (ArmOpcode::FcmgeV2D, false), (Some(VShape::F4S), CmpOp::Eq) => (ArmOpcode::FcmeqV4S, false), (Some(VShape::F2D), CmpOp::Eq) => (ArmOpcode::FcmeqV2D, false), (Some(VShape::F4S), CmpOp::Lt) => (ArmOpcode::FcmgtV4S, true), (Some(VShape::F2D), CmpOp::Lt) => (ArmOpcode::FcmgtV2D, true), (Some(VShape::F4S), CmpOp::Le) => (ArmOpcode::FcmgeV4S, true), (Some(VShape::F2D), CmpOp::Le) => (ArmOpcode::FcmgeV2D, true), _ => (ArmOpcode::Nop, false), }; let (lhs, rhs) = if swap { (vb, va) } else { (va, vb) }; mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![ MachineOperand::VReg(dest), MachineOperand::VReg(lhs), MachineOperand::VReg(rhs), ], def: Some(dest), }); } InstKind::VICmp(op, a, b) => { let dest = ctx.get_vreg(mf, inst.id, RegClass::V128); let va = ctx.lookup_vreg(*a); let vb = ctx.lookup_vreg(*b); let shape = VShape::from_ir(&inst.ty); let (opcode, swap) = match (shape, op) { (Some(VShape::V4S), CmpOp::Gt) => (ArmOpcode::CmgtV4S, false), (Some(VShape::V4S), CmpOp::Ge) => (ArmOpcode::CmgeV4S, false), (Some(VShape::V4S), CmpOp::Eq) => (ArmOpcode::CmeqV4S, false), (Some(VShape::V4S), CmpOp::Lt) => (ArmOpcode::CmgtV4S, true), (Some(VShape::V4S), CmpOp::Le) => (ArmOpcode::CmgeV4S, true), _ => (ArmOpcode::Nop, false), }; let (lhs, rhs) = if swap { (vb, va) } else { (va, vb) }; mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![ MachineOperand::VReg(dest), MachineOperand::VReg(lhs), MachineOperand::VReg(rhs), ], def: Some(dest), }); } InstKind::VBroadcast(scalar) => { let s = ctx.lookup_vreg(*scalar); let dest = ctx.get_vreg(mf, inst.id, RegClass::V128); // Float scalars live in S/D registers — splatting from // those uses the lane-dup form (`dup.4s vN, vM.s[0]`). // Integer scalars live in W/X registers — splatting from // those uses the gp-dup form (`dup.4s vN, wM`). let opcode = match VShape::from_ir(&inst.ty) { Some(VShape::V4S) => ArmOpcode::DupGen4S, Some(VShape::V2D) => ArmOpcode::DupGen2D, Some(VShape::F4S) => ArmOpcode::DupEl4S, Some(VShape::F2D) => ArmOpcode::DupEl2D, None => ArmOpcode::Nop, }; mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(s)], def: Some(dest), }); } InstKind::VReduceSum(v) => { // Cross-lane sum. The reduction instruction writes its // 32/64-bit result into the FP register file (sN/dN view // of vN). For float results that's already what we want; // for int results we follow up with a `umov.s/.d` move // from the FP lane back into a GP register. // // F4S → faddv s_dest, v_src.4s // F2D → faddp d_dest, v_src.2d // int(I32) → addv s_tmp, v_src.4s; umov.s w_dest, v_tmp[0] // int(I64) → addv s_tmp, v_src.4s; umov.s w_dest, v_tmp[0] // (4-lane i32 sum widens into a single i32; the // caller is expected to sign-extend if it // wanted i64 semantics — matches scalar IAdd) let src = ctx.lookup_vreg(*v); match &inst.ty { IrType::Float(FloatWidth::F32) => { // NEON has no `faddv.4s`. 
Reduce 4 f32 lanes // with two pairwise adds: // 1) `faddp.4s v_tmp, v_src, v_src` // → [a+b, c+d, a+b, c+d] // 2) `faddp.2s s_dest, v_tmp` // → (a+b)+(c+d) — the full sum let tmp = mf.new_vreg(RegClass::V128); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::FaddpV4S, operands: vec![ MachineOperand::VReg(tmp), MachineOperand::VReg(src), MachineOperand::VReg(src), ], def: Some(tmp), }); let class = type_to_reg_class(&inst.ty); let dest = ctx.get_vreg(mf, inst.id, class); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::FaddpV2S, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(tmp)], def: Some(dest), }); } IrType::Float(FloatWidth::F64) => { let class = type_to_reg_class(&inst.ty); let dest = ctx.get_vreg(mf, inst.id, class); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::FaddpV2D, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)], def: Some(dest), }); } IrType::Int(IntWidth::I32) => { // 4×i32 → scalar via `addv.4s s_tmp, v_src` then // `umov.s w_dest, v_tmp[0]`. let tmp = mf.new_vreg(RegClass::V128); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Addv4S, operands: vec![MachineOperand::VReg(tmp), MachineOperand::VReg(src)], def: Some(tmp), }); let class = type_to_reg_class(&inst.ty); let dest = ctx.get_vreg(mf, inst.id, class); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Umov4S, operands: vec![ MachineOperand::VReg(dest), MachineOperand::VReg(tmp), MachineOperand::Imm(0), ], def: Some(dest), }); } IrType::Int(IntWidth::I64) => { // 2×i64 → scalar via pairwise add (`addp.2d // v_tmp, v_src, v_src`) then `umov.d x_dest, // v_tmp[0]`. NEON has no `addv.2d`, so the // pairwise form is the canonical i64 reduce. let tmp = mf.new_vreg(RegClass::V128); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::AddpV2D, operands: vec![ MachineOperand::VReg(tmp), MachineOperand::VReg(src), MachineOperand::VReg(src), ], def: Some(tmp), }); let class = type_to_reg_class(&inst.ty); let dest = ctx.get_vreg(mf, inst.id, class); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Umov2D, operands: vec![ MachineOperand::VReg(dest), MachineOperand::VReg(tmp), MachineOperand::Imm(0), ], def: Some(dest), }); } IrType::Int(_) => { let class = type_to_reg_class(&inst.ty); let dest = ctx.get_vreg(mf, inst.id, class); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Nop, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)], def: Some(dest), }); } _ => { let class = type_to_reg_class(&inst.ty); let dest = ctx.get_vreg(mf, inst.id, class); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Nop, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)], def: Some(dest), }); } } } InstKind::VExtract(v, lane) => { let src = ctx.lookup_vreg(*v); let class = type_to_reg_class(&inst.ty); let dest = ctx.get_vreg(mf, inst.id, class); let opcode = match &inst.ty { IrType::Int(IntWidth::I32) => ArmOpcode::Umov4S, IrType::Int(IntWidth::I64) => ArmOpcode::Umov2D, IrType::Float(FloatWidth::F32) => ArmOpcode::FmovEl4S, IrType::Float(FloatWidth::F64) => ArmOpcode::FmovEl2D, _ => ArmOpcode::Nop, }; mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![ MachineOperand::VReg(dest), MachineOperand::VReg(src), MachineOperand::Imm(*lane as i64), ], def: Some(dest), }); } InstKind::VMin(a, b) | InstKind::VMax(a, b) => { let va = ctx.lookup_vreg(*a); let vb = ctx.lookup_vreg(*b); let dest = ctx.get_vreg(mf, inst.id, RegClass::V128); let is_max = 
matches!(inst.kind, InstKind::VMax(..)); let opcode = match (VShape::from_ir(&inst.ty), is_max) { (Some(VShape::V4S), true) => ArmOpcode::SmaxV4S, (Some(VShape::V4S), false) => ArmOpcode::SminV4S, (Some(VShape::F4S), true) => ArmOpcode::FmaxV4S, (Some(VShape::F4S), false) => ArmOpcode::FminV4S, (Some(VShape::F2D), true) => ArmOpcode::FmaxV2D, (Some(VShape::F2D), false) => ArmOpcode::FminV2D, _ => ArmOpcode::Nop, }; mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![ MachineOperand::VReg(dest), MachineOperand::VReg(va), MachineOperand::VReg(vb), ], def: Some(dest), }); } InstKind::VReduceMin(v) | InstKind::VReduceMax(v) => { let src = ctx.lookup_vreg(*v); let is_max = matches!(inst.kind, InstKind::VReduceMax(..)); match &inst.ty { IrType::Int(IntWidth::I32) => { let tmp = mf.new_vreg(RegClass::V128); let opcode = if is_max { ArmOpcode::Smaxv4S } else { ArmOpcode::Sminv4S }; mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![MachineOperand::VReg(tmp), MachineOperand::VReg(src)], def: Some(tmp), }); let class = type_to_reg_class(&inst.ty); let dest = ctx.get_vreg(mf, inst.id, class); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Umov4S, operands: vec![ MachineOperand::VReg(dest), MachineOperand::VReg(tmp), MachineOperand::Imm(0), ], def: Some(dest), }); } IrType::Float(FloatWidth::F32) => { // fmaxv.4s / fminv.4s s_dest, v_src let class = type_to_reg_class(&inst.ty); let dest = ctx.get_vreg(mf, inst.id, class); let opcode = if is_max { ArmOpcode::FmaxvV4S } else { ArmOpcode::FminvV4S }; mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)], def: Some(dest), }); } IrType::Float(FloatWidth::F64) => { // NEON has no fmaxv.2d; the pairwise scalar form // (fmaxp.2d d_dest, v_src) is the across-lane // reduction for two f64 lanes. let class = type_to_reg_class(&inst.ty); let dest = ctx.get_vreg(mf, inst.id, class); let opcode = if is_max { ArmOpcode::FmaxpV2DScalar } else { ArmOpcode::FminpV2DScalar }; mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)], def: Some(dest), }); } _ => { let class = type_to_reg_class(&inst.ty); let dest = ctx.get_vreg(mf, inst.id, class); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Nop, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)], def: Some(dest), }); } } } // Remaining: ExtractField, InsertField, and other vector ops // (VInsert, VICmp, VFCmp, VBitcast) — placeholder. Land // per-op as the vectorizer grows in Stage 4. _ => { let class = type_to_reg_class(&inst.ty); let _dest = ctx.get_vreg(mf, inst.id, class); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Nop, operands: vec![], def: None, }); } } } /// Select machine instructions for a terminator. fn select_terminator( mf: &mut MachineFunction, ctx: &mut ISelCtx, mb: MBlockId, term: &Terminator, src_block: &BasicBlock, func: &Function, ) { let _ = src_block; // used implicitly via `term`'s args; kept for clarity match term { Terminator::Return(None) => { emit_epilogue(mf, mb); } Terminator::Return(Some(val)) => { if matches!(func.value_type(*val), Some(IrType::Int(IntWidth::I128))) { let src_slot = ctx.lookup_wide_slot(*val); emit_load_phys_i128_pair( mf, mb, MachineOperand::PhysReg(PhysReg::FP), src_slot as i64, PhysReg::Gp(0), PhysReg::Gp(1), ); emit_epilogue(mf, mb); return; } // Move result to X0 (integer) or D0 (float). 
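// A concrete sketch of the per-class dispatch below (AAPCS64 return regs):
//   Gp64 result → mov  x0, xN      Fp64 result → fmov d0, dN
//   Gp32 result → mov  w0, wN      Fp32 result → fmov s0, sN
// The vreg's register class, not the IR type, picks the target register;
// the type was already folded into the class when the vreg was allocated.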
let src = ctx.lookup_vreg(*val); let class = mf.vregs.iter().find(|v| v.id == src).map(|v| v.class); let (reg, opcode) = match class { Some(RegClass::Fp64) => (PhysReg::Fp(0), ArmOpcode::FmovReg), Some(RegClass::Fp32) => (PhysReg::Fp32(0), ArmOpcode::FmovReg), Some(RegClass::Gp32) => (PhysReg::Gp32(0), ArmOpcode::MovReg), _ => (PhysReg::Gp(0), ArmOpcode::MovReg), }; mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![MachineOperand::PhysReg(reg), MachineOperand::VReg(src)], def: None, }); emit_epilogue(mf, mb); } Terminator::Branch(dest, args) => { // Emit parallel copy from each branch arg into the // target block's corresponding param vreg BEFORE the // actual branch instruction. Without this, block // parameters introduced by mem2reg or the lowerer // would never receive their incoming values at edge // points, producing infinite loops or stale data. emit_branch_arg_copies(mf, ctx, mb, *dest, args); let target = ctx.lookup_block(*dest); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::B, operands: vec![MachineOperand::BlockRef(target)], def: None, }); } Terminator::CondBranch { cond, true_dest, true_args, false_dest, false_args, } => { let cond_vreg = ctx.lookup_vreg(*cond); let true_mb = ctx.lookup_block(*true_dest); let false_mb = ctx.lookup_block(*false_dest); // For a conditional branch, the parallel copies for // the two arms must happen only on the taken edge. We // emit the copies inside per-arm trampoline sequences: // // CMP cond, #0 // B.EQ false_copies_then_jump (conditional jump to // false-side copies) // // B true_dest // false_copies_then_jump: // // B false_dest // // To keep the machine CFG simple we instead emit the // false-side copies + jump as a new machine block. // But that's invasive. For the common case where // neither arm has copies, fall back to the original // shape. When either arm has copies, materialize a // shim block for that arm. mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::CmpImm, operands: vec![MachineOperand::VReg(cond_vreg), MachineOperand::Imm(0)], def: None, }); // True arm: if there are branch args to copy, create // a shim block that does the copies then jumps to the // true destination. Otherwise, branch directly. let true_target = if true_args.is_empty() { true_mb } else { // Prefix with the function name so labels stay // unique across functions in the same .s file. Two // functions could otherwise both emit `L3_true_shim`. let label = format!("L{}_{}_true_shim", mf.name, mb.0); let shim = mf.new_block(&label); emit_branch_arg_copies(mf, ctx, shim, *true_dest, true_args); mf.block_mut(shim).insts.push(MachineInst { opcode: ArmOpcode::B, operands: vec![MachineOperand::BlockRef(true_mb)], def: None, }); shim }; mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::BCond, operands: vec![ MachineOperand::Cond(ArmCond::Ne), MachineOperand::BlockRef(true_target), ], def: None, }); // False arm: same treatment. 
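// Putting the two arms together, a cond-branch where both sides carry
// args lowers to this shape (labels illustrative):
//   cmp  cond, #0
//   b.ne L<fn>_<n>_true_shim   ; shim: param copies, then b true_dest
//   b    L<fn>_<n>_false_shim  ; shim: param copies, then b false_dest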
let false_target = if false_args.is_empty() { false_mb } else { let label = format!("L{}_{}_false_shim", mf.name, mb.0); let shim = mf.new_block(&label); emit_branch_arg_copies(mf, ctx, shim, *false_dest, false_args); mf.block_mut(shim).insts.push(MachineInst { opcode: ArmOpcode::B, operands: vec![MachineOperand::BlockRef(false_mb)], def: None, }); shim }; mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::B, operands: vec![MachineOperand::BlockRef(false_target)], def: None, }); } Terminator::Switch { selector, cases, default, } => { let sel_vreg = ctx.lookup_vreg(*selector); let default_mb = ctx.lookup_block(*default); for (val, dest) in cases { let dest_mb = ctx.lookup_block(*dest); // CMP selector, #val; B.EQ case_block mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::CmpImm, operands: vec![MachineOperand::VReg(sel_vreg), MachineOperand::Imm(*val)], def: None, }); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::BCond, operands: vec![ MachineOperand::Cond(ArmCond::Eq), MachineOperand::BlockRef(dest_mb), ], def: None, }); } // Default: unconditional branch. mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::B, operands: vec![MachineOperand::BlockRef(default_mb)], def: None, }); } Terminator::Unreachable => { // Debug trap — should never execute. brk #1 triggers SIGTRAP. mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Brk, operands: vec![MachineOperand::Imm(1)], def: None, }); } } } /// Emit the parallel-copy that materializes branch arguments into /// the target block's parameter vregs. /// /// At an SSA block boundary the IR semantics say "all the new values /// arrive in the target's params simultaneously." On a register /// machine that means we have to perform multiple `mov` operations /// such that none of them clobbers a value still needed by another /// pending move. The classical solution: /// /// 1. Skip identity copies (`dst == src`). /// 2. Repeatedly find a pending copy whose `dst` is **not** also /// the `src` of some other pending copy. Such a copy is "safe" /// — emitting it can't trample anything still needed. /// 3. If every remaining copy is part of a cycle (no safe copy /// exists), break the cycle by moving the head of any pending /// copy through a freshly-allocated scratch vreg, then continue. /// /// Cycles arise when block params swap with each other across an /// edge. The lowerer doesn't currently produce that shape, but /// mem2reg may once we have more sophisticated reaching-definition /// flow, so handling it now keeps a future bug out of the IR. fn emit_branch_arg_copies( mf: &mut MachineFunction, ctx: &ISelCtx, mb: MBlockId, target_block: BlockId, args: &[ValueId], ) { if args.is_empty() { return; } // Look up the target block's param vregs in the same order // they appear in the IR (which is also the order they were // allocated in Phase 4a, so the i-th arg corresponds to the // i-th param). let target_params = ctx .block_params .get(&target_block) .expect("isel: branch target not in block_params snapshot"); if target_params.len() != args.len() { // Verifier should reject this — but if it leaks through // we want a clear panic, not silent corruption. panic!( "isel: branch arg count {} ≠ target block param count {}", args.len(), target_params.len() ); } // Build the pending copy lists. Narrow SSA values move through // vregs; wide i128 values stay stack-backed and must copy slot to // slot through a temporary register pair. 
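// Worked example (vreg numbers hypothetical): the edge `br header(v1, v0)`
// into params mapped to (v0, v1) yields pending_narrow = [(v0, v1), (v1, v0)].
// Neither copy is safe, since each destination is still the other's source,
// so the resolver below routes one through a scratch vreg:
//   mov vS, v1 ; mov v1, v0 ; mov v0, vS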
let mut pending_narrow: Vec<(VRegId, VRegId)> = Vec::with_capacity(args.len()); let mut pending_wide: Vec<(i32, i32)> = Vec::new(); for (arg, bp) in args.iter().zip(target_params.iter()) { if matches!(bp.ty, IrType::Int(IntWidth::I128)) { let dst = ctx.lookup_wide_slot(bp.id); let src = ctx.lookup_wide_slot(*arg); if dst != src { pending_wide.push((dst, src)); } continue; } let dst = ctx.lookup_vreg(bp.id); let src = ctx.lookup_vreg(*arg); if dst != src { pending_narrow.push((dst, src)); } } // Helper to look up a vreg's RegClass via mf.vregs. fn class_of(mf: &MachineFunction, v: VRegId) -> RegClass { mf.vregs .iter() .find(|r| r.id == v) .map(|r| r.class) .expect("isel: vreg not registered") } // Helper to choose the right move opcode for a vreg's class. fn move_opcode_for(class: RegClass) -> ArmOpcode { match class { // V128 needs `mov.16b` to copy all 128 bits — `fmov d, d` // would corrupt the upper lanes. Fp64/Fp32 still use // `fmov` which is the canonical narrow form. RegClass::V128 => ArmOpcode::Mov16B, RegClass::Fp64 | RegClass::Fp32 => ArmOpcode::FmovReg, RegClass::Gp64 | RegClass::Gp32 => ArmOpcode::MovReg, } } let emit_move = |mf: &mut MachineFunction, mb: MBlockId, dst: VRegId, src: VRegId| { let class = class_of(mf, dst); let opcode = move_opcode_for(class); mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![MachineOperand::VReg(dst), MachineOperand::VReg(src)], def: Some(dst), }); }; // Iteratively emit safe narrow moves; break cycles via a scratch // vreg of the same class. let mut pending = pending_narrow; while !pending.is_empty() { let safe_idx = (0..pending.len()).find(|&i| { let (d, _) = pending[i]; !pending .iter() .enumerate() .any(|(j, &(_, s))| j != i && s == d) }); if let Some(idx) = safe_idx { let (d, s) = pending.remove(idx); emit_move(mf, mb, d, s); } else { let (d, s) = pending[0]; let class = class_of(mf, s); let temp = mf.new_vreg(class); emit_move(mf, mb, temp, s); pending[0] = (d, temp); } } // Wide i128 block params stay stack-backed, so the same parallel-copy // algorithm runs on stack slots instead of vregs. let mut pending = pending_wide; let mut scratch_slot: Option = None; while !pending.is_empty() { let safe_idx = (0..pending.len()).find(|&i| { let (d, _) = pending[i]; !pending .iter() .enumerate() .any(|(j, &(_, s))| j != i && s == d) }); if let Some(idx) = safe_idx { let (d, s) = pending.remove(idx); emit_copy_wide_slot(mf, mb, s, d); } else { let (d, s) = pending[0]; let temp = if let Some(slot) = scratch_slot { slot } else { let slot = mf.alloc_local(16); scratch_slot = Some(slot); slot }; emit_copy_wide_slot(mf, mb, s, temp); pending[0] = (d, temp); } } } fn emit_copy_wide_slot(mf: &mut MachineFunction, mb: MBlockId, src_slot: i32, dst_slot: i32) { emit_load_phys_i128_pair( mf, mb, MachineOperand::PhysReg(PhysReg::FP), src_slot as i64, PhysReg::Gp(16), PhysReg::Gp(17), ); emit_store_phys_i128_pair( mf, mb, MachineOperand::PhysReg(PhysReg::FP), dst_slot as i64, PhysReg::Gp(16), PhysReg::Gp(17), ); } // ---- Helpers ---- /// Emit function prologue: /// stp x29, x30, [sp, #-FRAME_SIZE]! /// add x29, sp, #FRAME_SIZE - 16 /// FP points at the saved FP/LR pair at the top of the frame. fn emit_prologue(mf: &mut MachineFunction, mb: MBlockId) { // STP x29, x30, [sp, #-FRAME_SIZE]! 
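// (With a hypothetical 48-byte frame, the emitter expands the two pseudo-ops
// below into `stp x29, x30, [sp, #-48]!` and `add x29, sp, #32`.)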
mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::StpPre, operands: vec![ MachineOperand::PhysReg(PhysReg::FP), MachineOperand::PhysReg(PhysReg::LR), MachineOperand::PhysReg(PhysReg::Sp), ], def: None, }); // ADD x29, sp, #FRAME_SIZE - 16 // (frame_size - 16 computed during emission when final size is known) mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::AddImm, operands: vec![ MachineOperand::PhysReg(PhysReg::FP), MachineOperand::PhysReg(PhysReg::Sp), MachineOperand::Imm(-1), // sentinel: replaced with frame_size-16 during emit ], def: None, }); } /// Emit function epilogue: /// ldp x29, x30, [sp, #FRAME_SIZE-16] /// add sp, sp, #FRAME_SIZE /// ret fn emit_epilogue(mf: &mut MachineFunction, mb: MBlockId) { // LDP + ADD emitted as a single LdpPost pseudo-op, expanded during emit. mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::LdpPost, operands: vec![ MachineOperand::PhysReg(PhysReg::FP), MachineOperand::PhysReg(PhysReg::LR), MachineOperand::PhysReg(PhysReg::Sp), ], def: None, }); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::Ret, operands: vec![], def: None, }); } fn split_i128_words(value: i128) -> (u64, u64) { let bits = value as u128; (bits as u64, (bits >> 64) as u64) } fn emit_const_u64_phys(mf: &mut MachineFunction, mb: MBlockId, dest: PhysReg, value: u64) { if value == 0 { mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::MovReg, operands: vec![ MachineOperand::PhysReg(dest), MachineOperand::PhysReg(PhysReg::Xzr), ], def: None, }); return; } let mut first = true; for i in 0..4 { let shift = i * 16; let chunk = ((value >> shift) & 0xFFFF) as u16; if chunk != 0 || (first && i == 3) { let opcode = if first { ArmOpcode::Movz } else { ArmOpcode::Movk }; mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![ MachineOperand::PhysReg(dest), MachineOperand::Imm(chunk as i64), MachineOperand::Shift(shift as u8), ], def: None, }); first = false; } } } fn emit_const_i128_to_phys_pair( mf: &mut MachineFunction, mb: MBlockId, value: i128, lo: PhysReg, hi: PhysReg, ) { let (low_word, high_word) = split_i128_words(value); emit_const_u64_phys(mf, mb, lo, low_word); emit_const_u64_phys(mf, mb, hi, high_word); } fn emit_store_phys_i128_pair( mf: &mut MachineFunction, mb: MBlockId, base: MachineOperand, offset: i64, lo: PhysReg, hi: PhysReg, ) { mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::StpOffset, operands: vec![ MachineOperand::PhysReg(lo), MachineOperand::PhysReg(hi), base, MachineOperand::Imm(offset), ], def: None, }); } fn emit_load_phys_u64( mf: &mut MachineFunction, mb: MBlockId, base: MachineOperand, offset: i64, dest: PhysReg, ) { mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::LdrImm, operands: vec![ MachineOperand::PhysReg(dest), base, MachineOperand::Imm(offset), ], def: None, }); } fn emit_load_phys_i128_pair( mf: &mut MachineFunction, mb: MBlockId, base: MachineOperand, offset: i64, lo: PhysReg, hi: PhysReg, ) { mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::LdpOffset, operands: vec![ MachineOperand::PhysReg(lo), MachineOperand::PhysReg(hi), base, MachineOperand::Imm(offset), ], def: None, }); } fn emit_load_stack_arg_into_vreg( mf: &mut MachineFunction, mb: MBlockId, dest: VRegId, class: RegClass, ty: &IrType, offset: i64, ) { let opcode = load_opcode_for(ty, class); mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![ MachineOperand::VReg(dest), MachineOperand::PhysReg(PhysReg::FP), MachineOperand::Imm(offset), ], def: Some(dest), }); } fn 
emit_store_stack_arg_from_vreg( mf: &mut MachineFunction, mb: MBlockId, src: VRegId, class: RegClass, ty: &IrType, offset: i64, ) { let opcode = store_opcode_for(Some(ty), class); mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![ MachineOperand::VReg(src), MachineOperand::PhysReg(PhysReg::Sp), MachineOperand::Imm(offset), ], def: None, }); } fn emit_i128_add_from_slot( mf: &mut MachineFunction, mb: MBlockId, rhs_base: MachineOperand, rhs_offset: i64, lo: PhysReg, hi: PhysReg, scratch: PhysReg, ) { emit_load_phys_u64(mf, mb, rhs_base.clone(), rhs_offset, scratch); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::AddsReg, operands: vec![ MachineOperand::PhysReg(lo), MachineOperand::PhysReg(lo), MachineOperand::PhysReg(scratch), ], def: None, }); emit_load_phys_u64(mf, mb, rhs_base, rhs_offset + 8, scratch); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::AdcReg, operands: vec![ MachineOperand::PhysReg(hi), MachineOperand::PhysReg(hi), MachineOperand::PhysReg(scratch), ], def: None, }); } fn emit_i128_sub_from_slot( mf: &mut MachineFunction, mb: MBlockId, rhs_base: MachineOperand, rhs_offset: i64, lo: PhysReg, hi: PhysReg, scratch: PhysReg, ) { emit_load_phys_u64(mf, mb, rhs_base.clone(), rhs_offset, scratch); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::SubsReg, operands: vec![ MachineOperand::PhysReg(lo), MachineOperand::PhysReg(lo), MachineOperand::PhysReg(scratch), ], def: None, }); emit_load_phys_u64(mf, mb, rhs_base, rhs_offset + 8, scratch); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::SbcReg, operands: vec![ MachineOperand::PhysReg(hi), MachineOperand::PhysReg(hi), MachineOperand::PhysReg(scratch), ], def: None, }); } fn emit_i128_neg(mf: &mut MachineFunction, mb: MBlockId, lo: PhysReg, hi: PhysReg) { mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::SubsReg, operands: vec![ MachineOperand::PhysReg(lo), MachineOperand::PhysReg(PhysReg::Xzr), MachineOperand::PhysReg(lo), ], def: None, }); mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::SbcReg, operands: vec![ MachineOperand::PhysReg(hi), MachineOperand::PhysReg(PhysReg::Xzr), MachineOperand::PhysReg(hi), ], def: None, }); } /// Emit a constant integer using movz/movk sequence. /// Respects width: 32-bit values mask to 32 bits and only emit shifts 0/16. fn emit_const_int( mf: &mut MachineFunction, mb: MBlockId, dest: VRegId, val: i128, width: IntWidth, ) { debug_assert!( width != IntWidth::I128, "backend should reject i128 before isel" ); let is_32 = matches!(width, IntWidth::I8 | IntWidth::I16 | IntWidth::I32); // Mask to the appropriate width to prevent sign-extension artifacts. let uval = if is_32 { (val as u32) as u64 } else { val as u64 }; let max_shift = if is_32 { 2 } else { 4 }; // 2 chunks for 32-bit, 4 for 64-bit if uval == 0 { let zr = if is_32 { PhysReg::Wzr } else { PhysReg::Xzr }; mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::MovReg, operands: vec![MachineOperand::VReg(dest), MachineOperand::PhysReg(zr)], def: Some(dest), }); return; } // MOVZ for the first non-zero 16-bit chunk, MOVK for the rest. 
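// Worked example (assuming 64-bit width): 0x0001_0000_F000_002A lowers to
//   movz xD, #0x002A            ; chunk 0
//   movk xD, #0xF000, lsl #16   ; chunk 1
//   movk xD, #0x0001, lsl #48   ; chunk 3; the all-zero chunk 2 is skipped
// A runnable sketch of this chunking lives in the test module at the end
// of this file.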
let mut first = true; for i in 0..max_shift { let shift = i * 16; let chunk = ((uval >> shift) & 0xFFFF) as u16; if chunk != 0 || (first && i == max_shift - 1) { let opcode = if first { ArmOpcode::Movz } else { ArmOpcode::Movk }; mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![ MachineOperand::VReg(dest), MachineOperand::Imm(chunk as i64), MachineOperand::Shift(shift as u8), ], def: Some(dest), }); first = false; } } if first { let zr = if is_32 { PhysReg::Wzr } else { PhysReg::Xzr }; mf.block_mut(mb).insts.push(MachineInst { opcode: ArmOpcode::MovReg, operands: vec![MachineOperand::VReg(dest), MachineOperand::PhysReg(zr)], def: Some(dest), }); } } /// Emit a register-register binary op. fn emit_binop( mf: &mut MachineFunction, ctx: &mut ISelCtx, mb: MBlockId, inst: &Inst, opcode: ArmOpcode, a: ValueId, b: ValueId, ) { let class = type_to_reg_class(&inst.ty); let dest = ctx.get_vreg(mf, inst.id, class); let va = ctx.lookup_vreg(a); let vb = ctx.lookup_vreg(b); mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![ MachineOperand::VReg(dest), MachineOperand::VReg(va), MachineOperand::VReg(vb), ], def: Some(dest), }); } /// Emit a NEON vector binary op. The `pick` closure resolves the /// concrete `ArmOpcode` from the result vector's lane shape — that /// keeps the per-op InstKind arms one-line. fn emit_vbinop( mf: &mut MachineFunction, ctx: &mut ISelCtx, mb: MBlockId, inst: &Inst, a: ValueId, b: ValueId, pick: impl FnOnce(VShape) -> ArmOpcode, ) { let dest = ctx.get_vreg(mf, inst.id, RegClass::V128); let va = ctx.lookup_vreg(a); let vb = ctx.lookup_vreg(b); let opcode = match VShape::from_ir(&inst.ty) { Some(s) => pick(s), None => ArmOpcode::Nop, }; mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![ MachineOperand::VReg(dest), MachineOperand::VReg(va), MachineOperand::VReg(vb), ], def: Some(dest), }); } /// Emit a NEON vector unary op (one source, one result, both V128). fn emit_vunop( mf: &mut MachineFunction, ctx: &mut ISelCtx, mb: MBlockId, inst: &Inst, a: ValueId, pick: impl FnOnce(VShape) -> ArmOpcode, ) { let dest = ctx.get_vreg(mf, inst.id, RegClass::V128); let va = ctx.lookup_vreg(a); let opcode = match VShape::from_ir(&inst.ty) { Some(s) => pick(s), None => ArmOpcode::Nop, }; mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(va)], def: Some(dest), }); } /// Emit a float binary op, selecting single or double precision. #[allow(clippy::too_many_arguments)] fn emit_float_binop( mf: &mut MachineFunction, ctx: &mut ISelCtx, mb: MBlockId, inst: &Inst, ty: &IrType, a: ValueId, b: ValueId, op_s: ArmOpcode, op_d: ArmOpcode, ) { let (class, opcode) = match ty { IrType::Float(FloatWidth::F32) => (RegClass::Fp32, op_s), _ => (RegClass::Fp64, op_d), }; let dest = ctx.get_vreg(mf, inst.id, class); let va = ctx.lookup_vreg(a); let vb = ctx.lookup_vreg(b); mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![ MachineOperand::VReg(dest), MachineOperand::VReg(va), MachineOperand::VReg(vb), ], def: Some(dest), }); } /// Map IR type to register class. /// Pick the load opcode for a value of the given IR type and reg class. /// Narrow integer types use the sign-extending byte/half loads; floats /// route to the FP-imm load; everything else falls through to `LdrImm` /// or `LdrFpImm` per reg class. The reg-class fallback matters when /// `ty` is a generic pointer or aggregate (e.g., a stack-arg copy that /// only knows the destination's register kind). 
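/// For example, `(Int(I16), Gp32)` resolves to `LdrshImm`, while a pointer
/// load that only knows its destination class falls through to `LdrImm`.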
fn load_opcode_for(ty: &IrType, class: RegClass) -> ArmOpcode { match ty { IrType::Int(IntWidth::I8) | IrType::Bool => ArmOpcode::LdrsbImm, IrType::Int(IntWidth::I16) => ArmOpcode::LdrshImm, IrType::Float(_) => ArmOpcode::LdrFpImm, _ => match class { RegClass::Fp64 | RegClass::Fp32 => ArmOpcode::LdrFpImm, RegClass::V128 => ArmOpcode::LdrQ, RegClass::Gp32 | RegClass::Gp64 => ArmOpcode::LdrImm, }, } } /// Mirror of `load_opcode_for` for stores. Audit CRITICAL-2: the /// `ty` here must be the *value's* declared IR type, not the pointer /// or pointee — byte-level GEPs reuse `ptr` as a generic offset /// cursor, so dispatching by pointee width would silently truncate /// non-byte stores. Pass `None` for `ty` when only the reg class is /// available; in that case the helper falls through to the class-only /// branch. fn store_opcode_for(ty: Option<&IrType>, class: RegClass) -> ArmOpcode { match ty { Some(IrType::Int(IntWidth::I8)) | Some(IrType::Bool) => ArmOpcode::StrbImm, Some(IrType::Int(IntWidth::I16)) => ArmOpcode::StrhImm, Some(IrType::Float(_)) => ArmOpcode::StrFpImm, _ => match class { RegClass::Fp64 | RegClass::Fp32 => ArmOpcode::StrFpImm, RegClass::V128 => ArmOpcode::StrQ, RegClass::Gp32 | RegClass::Gp64 => ArmOpcode::StrImm, }, } } /// Resolve an IR address value to the (base, offset) operand pair /// expected by `LdrImm`/`StrImm`-family instructions. Alloca addresses /// fold to `(FP, FrameSlot(offset))` so the assembler can pick the /// final stack-relative form; everything else becomes /// `(VReg(addr_vreg), Imm(0))`. Used by both narrow-width Load/Store /// arms in `select_inst`. The wide-i128 paths build their own operand /// pairs directly because they target the `emit_*_phys_i128_pair` /// helpers, which take `i64` offsets and only need a base operand. fn narrow_load_store_addr( ctx: &ISelCtx, addr: ValueId, ) -> (MachineOperand, MachineOperand) { if let Some(&offset) = ctx.alloca_offsets.get(&addr) { ( MachineOperand::PhysReg(PhysReg::FP), MachineOperand::FrameSlot(offset), ) } else { let base = ctx.lookup_vreg(addr); (MachineOperand::VReg(base), MachineOperand::Imm(0)) } } /// Operation tag for `emit_i128_binop_via_slots`. Add and Sub share a /// load-binop-store skeleton that differs only in which intermediate /// helper does the arithmetic. #[derive(Clone, Copy)] enum I128BinOp { Add, Sub, } /// Lower an i128 IAdd/ISub: load `lhs_id`'s slot into x16/x17, run the /// matching `emit_i128__from_slot` against `rhs_id`, then store /// the result to `dest_id`'s slot. Replaces three near-identical 30-LOC /// blocks in the i128 dispatch (IAdd / ISub). 
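/// Emitted skeleton for the Add case (slot offsets illustrative):
///   ldp  x16, x17, [x29, #lhs]
///   ldr  x8, [x29, #rhs]      ; adds x16, x16, x8
///   ldr  x8, [x29, #rhs+8]    ; adc  x17, x17, x8
///   stp  x16, x17, [x29, #dest]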
fn emit_i128_binop_via_slots( mf: &mut MachineFunction, ctx: &ISelCtx, mb: MBlockId, op: I128BinOp, dest_id: ValueId, lhs_id: ValueId, rhs_id: ValueId, ) { let dest_slot = ctx.lookup_wide_slot(dest_id); let lhs_slot = ctx.lookup_wide_slot(lhs_id); let rhs_slot = ctx.lookup_wide_slot(rhs_id); let fp = || MachineOperand::PhysReg(PhysReg::FP); emit_load_phys_i128_pair(mf, mb, fp(), lhs_slot as i64, PhysReg::Gp(16), PhysReg::Gp(17)); match op { I128BinOp::Add => emit_i128_add_from_slot( mf, mb, fp(), rhs_slot as i64, PhysReg::Gp(16), PhysReg::Gp(17), PhysReg::Gp(8), ), I128BinOp::Sub => emit_i128_sub_from_slot( mf, mb, fp(), rhs_slot as i64, PhysReg::Gp(16), PhysReg::Gp(17), PhysReg::Gp(8), ), } emit_store_phys_i128_pair(mf, mb, fp(), dest_slot as i64, PhysReg::Gp(16), PhysReg::Gp(17)); } fn type_to_reg_class(ty: &IrType) -> RegClass { match ty { IrType::Float(FloatWidth::F32) => RegClass::Fp32, IrType::Float(FloatWidth::F64) => RegClass::Fp64, IrType::Vector { .. } => RegClass::V128, IrType::Int(IntWidth::I8) | IrType::Int(IntWidth::I16) | IrType::Int(IntWidth::I32) | IrType::Bool => RegClass::Gp32, _ => RegClass::Gp64, } } /// Vector lane shape for NEON opcode dispatch. #[derive(Debug, Clone, Copy, PartialEq, Eq)] enum VShape { /// 4 × i32 V4S, /// 2 × i64 V2D, /// 4 × f32 F4S, /// 2 × f64 F2D, } impl VShape { fn from_ir(ty: &IrType) -> Option { let (lanes, elem) = ty.vector_shape()?; match (lanes, elem) { (4, IrType::Int(IntWidth::I32)) => Some(Self::V4S), (2, IrType::Int(IntWidth::I64)) => Some(Self::V2D), (4, IrType::Float(FloatWidth::F32)) => Some(Self::F4S), (2, IrType::Float(FloatWidth::F64)) => Some(Self::F2D), _ => None, } } fn is_float(self) -> bool { matches!(self, Self::F4S | Self::F2D) } } fn needs_wide_icmp_operand(ty: Option<&IrType>, other_ty: Option<&IrType>) -> bool { matches!( (ty, other_ty), ( Some(IrType::Int(IntWidth::I64) | IrType::Ptr(_) | IrType::FuncPtr(_)), Some(_) ) | ( Some(_), Some(IrType::Int(IntWidth::I64) | IrType::Ptr(_) | IrType::FuncPtr(_)) ) ) } fn zero_extend_cmp_type(ty: Option<&IrType>) -> bool { matches!(ty, Some(IrType::Bool)) } fn icmp_operand_vreg( mf: &mut MachineFunction, ctx: &mut ISelCtx, mb: MBlockId, func: &Function, value: ValueId, other: ValueId, ) -> VRegId { let value_ty = func.value_type(value); let other_ty = func.value_type(other); let src = ctx.lookup_vreg(value); if !needs_wide_icmp_operand(value_ty.as_ref(), other_ty.as_ref()) { return src; } if matches!( value_ty, Some(IrType::Int(IntWidth::I64) | IrType::Ptr(_) | IrType::FuncPtr(_)) ) { return src; } let dest = mf.new_vreg(RegClass::Gp64); let opcode = if zero_extend_cmp_type(value_ty.as_ref()) { ArmOpcode::MovReg } else { ArmOpcode::Sxtw }; mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)], def: Some(dest), }); dest } fn machine_vreg_class(mf: &MachineFunction, vreg: VRegId) -> RegClass { mf.vregs .iter() .find(|r| r.id == vreg) .map(|r| r.class) .expect("isel: vreg not registered") } fn coerce_select_operand_vreg( mf: &mut MachineFunction, ctx: &mut ISelCtx, mb: MBlockId, func: &Function, value: ValueId, target_ty: &IrType, ) -> VRegId { let src = ctx.lookup_vreg(value); let src_class = machine_vreg_class(mf, src); let target_class = type_to_reg_class(target_ty); if src_class == target_class { return src; } let dest = mf.new_vreg(target_class); let src_ty = func.value_type(value); let opcode = match (src_class, target_class) { (RegClass::Gp32, RegClass::Gp64) => { if matches!(target_ty, IrType::Ptr(_) | 
IrType::FuncPtr(_)) || zero_extend_cmp_type(src_ty.as_ref()) { ArmOpcode::MovReg } else { match src_ty.as_ref() { Some(IrType::Int(IntWidth::I8)) => ArmOpcode::Sxtb, Some(IrType::Int(IntWidth::I16)) => ArmOpcode::Sxth, Some(IrType::Int(IntWidth::I32)) | Some(IrType::Bool) => ArmOpcode::Sxtw, _ => ArmOpcode::MovReg, } } } (RegClass::Gp64, RegClass::Gp32) => ArmOpcode::MovReg, (RegClass::Fp32, RegClass::Fp64) => ArmOpcode::FcvtDS, (RegClass::Fp64, RegClass::Fp32) => ArmOpcode::FcvtSD, _ => ArmOpcode::MovReg, }; mf.block_mut(mb).insts.push(MachineInst { opcode, operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)], def: Some(dest), }); dest } fn int_width_class(w: &IntWidth) -> RegClass { match w { IntWidth::I64 => RegClass::Gp64, _ => RegClass::Gp32, } } fn float_width_class(w: &FloatWidth) -> RegClass { match w { FloatWidth::F32 => RegClass::Fp32, FloatWidth::F64 => RegClass::Fp64, } } /// Map IR comparison op to ARM64 condition code (for integer CMP). /// Pre-scan a function to find ICmp/FCmp → Select fusion candidates. /// /// An ICmp/FCmp is a fusion candidate when: /// 1. Its result is used exactly once in the entire function. /// 2. That single use is a `Select` instruction in the same block. /// 3. No intervening instruction between the ICmp and the Select in /// that block clobbers NZCV flags (another ICmp/FCmp or a Call). /// /// For candidates, we suppress CSET during ICmp lowering and store /// the ARM condition in `ctx.fused_arm_cond` so the Select can pick /// it up and emit `CSEL dest, tv, fv, ` directly. fn compute_csel_fusion(func: &Function, ctx: &mut ISelCtx) { // Build global use counts. let mut use_count: HashMap = HashMap::new(); for block in &func.blocks { for inst in &block.insts { for vid in crate::ir::walk::inst_uses(&inst.kind) { *use_count.entry(vid).or_insert(0) += 1; } } if let Some(term) = &block.terminator { for vid in crate::ir::walk::terminator_uses(term) { *use_count.entry(vid).or_insert(0) += 1; } } } // Build a map of ValueId → the block that defines it (instruction defs only). let mut def_block: HashMap = HashMap::new(); for block in &func.blocks { for inst in &block.insts { def_block.insert(inst.id, block.id); } } // Per-block scan: walk instructions in order, tracking the most // recent ICmp/FCmp that hasn't been consumed by a Select yet. // Any flag-clobbering instruction (another ICmp/FCmp, a call) // resets the pending set. for block in &func.blocks { // The most recently emitted CMP that hasn't been consumed. // We use a Vec so that `pending = {last_icmp}` is O(1) to update. let mut pending: Option = None; for inst in &block.insts { match &inst.kind { InstKind::ICmp(op, _, _) => { if crate::ir::walk::inst_uses(&inst.kind) .into_iter() .filter_map(|vid| func.value_type(vid)) .any(|ty| matches!(ty, IrType::Int(IntWidth::I128))) { pending = None; ctx.fused_arm_cond.remove(&inst.id); continue; } // New CMP overwrites NZCV — previous pending is no longer valid. pending = Some(inst.id); // Temporarily store the arm cond so we can retrieve it when // we confirm the Select is the sole user. ctx.fused_arm_cond.insert(inst.id, cmp_to_arm_cond(*op)); } InstKind::FCmp(op, _, _) => { pending = Some(inst.id); ctx.fused_arm_cond.insert(inst.id, fcmp_to_arm_cond(*op)); } InstKind::Select(cond, _, _) => { if let Some(p) = pending { if p == *cond && use_count.get(cond) == Some(&1) && def_block.get(cond) == Some(&block.id) { // Confirmed: fuse this ICmp into the Select. 
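// Net effect on the emitted code (registers illustrative):
//   fused:    cmp wA, wB ; csel wD, wT, wF, le
//   unfused:  cmp wA, wB ; cset wC, le ; cmp wC, #0 ; csel wD, wT, wF, ne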
ctx.select_fused.insert(*cond); pending = None; } else { // The Select isel for an unfused cond emits // its own `cmp cond_reg, #0` to set NZCV, // which clobbers any pending fused ICmp's // flags. Drop the pending so a later Select // doesn't try to read stale flags. pending = None; } } } // Calls may clobber NZCV (per AAPCS64, flags are not preserved). InstKind::Call(_, _) | InstKind::RuntimeCall(_, _) => { pending = None; } _ => {} } } // Clean up fused_arm_cond for ICmps that turned out NOT to be fused // (e.g., they had use_count > 1, or were never consumed by a Select). // Leave only the fused ones. // // We delay cleanup to after all blocks are scanned because the same // ValueId can't appear in multiple blocks (SSA), so there's no cross- // block confusion. } // Remove arm_cond entries for non-fused ICmps. ctx.fused_arm_cond .retain(|vid, _| ctx.select_fused.contains(vid)); } fn cmp_to_arm_cond(op: CmpOp) -> ArmCond { match op { CmpOp::Eq => ArmCond::Eq, CmpOp::Ne => ArmCond::Ne, CmpOp::Lt => ArmCond::Lt, CmpOp::Le => ArmCond::Le, CmpOp::Gt => ArmCond::Gt, CmpOp::Ge => ArmCond::Ge, } } fn i128_ordered_conds(op: CmpOp) -> (ArmCond, ArmCond) { match op { CmpOp::Lt => (ArmCond::Lt, ArmCond::Lo), CmpOp::Le => (ArmCond::Lt, ArmCond::Ls), CmpOp::Gt => (ArmCond::Gt, ArmCond::Hi), CmpOp::Ge => (ArmCond::Gt, ArmCond::Hs), _ => panic!("ordered i128 compare requires lt/le/gt/ge, got {:?}", op), } } /// Map IR comparison op to ARM64 condition code (for float FCMP). fn fcmp_to_arm_cond(op: CmpOp) -> ArmCond { match op { CmpOp::Eq => ArmCond::Eq, CmpOp::Ne => ArmCond::Ne, CmpOp::Lt => ArmCond::Mi, // minus flag for less-than CmpOp::Le => ArmCond::Ls, // unsigned LE maps to float LE CmpOp::Gt => ArmCond::Gt, CmpOp::Ge => ArmCond::Ge, } } /// Compute allocation size for an IR type. fn alloca_size(ty: &IrType) -> u32 { match ty { IrType::Void => 0, IrType::Bool => 4, // use 4 bytes for alignment IrType::Int(w) => w.bytes(), IrType::Float(w) => w.bytes(), IrType::Ptr(_) => 8, IrType::Array(elem, count) => { // Stack storage uses ABI-sized elements. Fortran LOGICAL arrays are // stored as default-kind 4-byte elements, even though Bool SSA // values themselves remain byte-sized. let elem_size = match elem.as_ref() { IrType::Bool => 4, IrType::Struct(_) => alloca_size(elem), _ => elem.size_bytes() as u32, }; elem_size * (*count as u32) } IrType::FuncPtr(_) => 8, IrType::Struct(_) => 8, // placeholder IrType::Vector { .. } => 16, // 128-bit NEON } } /// Get the symbol name for a runtime function. /// Get the C-level symbol name for a runtime function. /// The emitter adds the Mach-O `_` prefix when emitting assembly. 
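/// For example, `RuntimeFunc::PrintNewline` maps to `afs_print_newline`,
/// which the emitter renders as `_afs_print_newline` at the `bl` site.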
fn runtime_func_symbol(rf: &RuntimeFunc, args: &[(ValueId, AbiArgLoc, IrType)]) -> String { match rf { RuntimeFunc::PrintInt => { if args .first() .is_some_and(|(_, _, ty)| matches!(ty, IrType::Int(IntWidth::I128))) { "afs_print_int128".into() } else if args .first() .is_some_and(|(_, _, ty)| matches!(ty, IrType::Int(IntWidth::I64))) { "afs_print_int64".into() } else { "afs_print_int".into() } } RuntimeFunc::PrintReal => "afs_print_real".into(), RuntimeFunc::PrintString => "afs_print_string".into(), RuntimeFunc::PrintLogical => "afs_print_logical".into(), RuntimeFunc::PrintNewline => "afs_print_newline".into(), RuntimeFunc::Allocate => "afs_allocate".into(), RuntimeFunc::Deallocate => "afs_deallocate".into(), RuntimeFunc::StringConcat => "afs_string_concat".into(), RuntimeFunc::StringCopy => "afs_string_copy".into(), RuntimeFunc::StringCompare => "afs_string_compare".into(), RuntimeFunc::Stop => "afs_stop".into(), RuntimeFunc::ErrorStop => "afs_error_stop".into(), RuntimeFunc::CheckBounds => "afs_check_bounds".into(), } } #[cfg(test)] mod tests { use super::*; use crate::ir::builder::FuncBuilder; fn select_simple(build: impl FnOnce(&mut FuncBuilder)) -> MachineFunction { let mut func = Function::new("test".into(), vec![], IrType::Void); { let mut b = FuncBuilder::new(&mut func); build(&mut b); } select_function(&func) } #[test] fn select_const_int() { let mf = select_simple(|b| { b.const_i32(42); b.ret_void(); }); let insts = &mf.blocks[0].insts; // Should have: prologue (STP, MOV), MOVZ #42, epilogue (LDP, RET). assert!(insts.iter().any(|i| i.opcode == ArmOpcode::Movz)); } #[test] fn select_iadd() { let mf = select_simple(|b| { let x = b.const_i32(10); let y = b.const_i32(20); let _z = b.iadd(x, y); b.ret_void(); }); assert!(mf.blocks[0] .insts .iter() .any(|i| i.opcode == ArmOpcode::AddReg)); } #[test] fn select_icmp() { // ICmp whose result is NOT fed into a Select → CSET must appear. 
let mf = select_simple(|b| { let x = b.const_i32(5); let y = b.const_i32(10); let _c = b.icmp(CmpOp::Lt, x, y); b.ret_void(); }); assert!(mf.blocks[0] .insts .iter() .any(|i| i.opcode == ArmOpcode::CmpReg)); assert!(mf.blocks[0] .insts .iter() .any(|i| i.opcode == ArmOpcode::Cset)); } #[test] fn select_i128_icmp_eq_combines_limb_results() { let mf = select_simple(|b| { let x = b.const_i128(1); let y = b.const_i128(1); let _c = b.icmp(CmpOp::Eq, x, y); b.ret_void(); }); let insts = &mf.blocks[0].insts; assert!( insts .iter() .filter(|i| i.opcode == ArmOpcode::CmpReg) .count() >= 2 ); assert!(insts.iter().filter(|i| i.opcode == ArmOpcode::Cset).count() >= 2); assert!(insts.iter().any(|i| i.opcode == ArmOpcode::AndReg)); } #[test] fn select_i128_icmp_lt_uses_high_signed_and_low_unsigned_conds() { let mf = select_simple(|b| { let x = b.const_i128(1); let y = b.const_i128(2); let _c = b.icmp(CmpOp::Lt, x, y); b.ret_void(); }); let insts = &mf.blocks[0].insts; assert!( insts .iter() .filter(|i| i.opcode == ArmOpcode::CmpReg) .count() >= 2 ); assert!(insts.iter().filter(|i| i.opcode == ArmOpcode::Cset).count() >= 3); assert!(insts.iter().any(|i| i.opcode == ArmOpcode::AndReg)); assert!(insts.iter().any(|i| i.opcode == ArmOpcode::OrrReg)); } #[test] fn select_i128_uses_pair_csel_ops() { let mf = select_simple(|b| { let cond = b.const_bool(true); let x = b.const_i128(1); let y = b.const_i128(2); let _s = b.select(cond, x, y); b.ret_void(); }); let insts = &mf.blocks[0].insts; assert!(insts.iter().any(|i| i.opcode == ArmOpcode::CmpImm)); assert_eq!( insts .iter() .filter(|i| i.opcode == ArmOpcode::CselReg) .count(), 2, "wide i128 selects should lower with one CSEL per limb" ); } #[test] fn select_coerces_mixed_gp_widths_before_csel() { let mf = select_simple(|b| { let cond = b.const_bool(true); let wide = b.const_i64(7); let narrow = b.const_i32(-1); let _s = b.select(cond, wide, narrow); b.ret_void(); }); let csel = mf.blocks[0] .insts .iter() .find(|i| i.opcode == ArmOpcode::CselReg) .expect("expected CSEL for mixed-width select"); for operand in csel.operands.iter().take(3) { let MachineOperand::VReg(vreg) = operand else { continue; }; assert_eq!( machine_vreg_class(&mf, *vreg), RegClass::Gp64, "mixed-width select operands should be coerced to the result width before CSEL" ); } } #[test] fn csel_fusion_eliminates_cset_and_extra_cmp() { // ICmp used solely by a Select → CSET and CMP cond, #0 must NOT appear. // Only CmpReg + CselReg should be present. let mf = select_simple(|b| { let x = b.const_i32(5); let y = b.const_i32(10); let c = b.icmp(CmpOp::Le, x, y); // use_count[c] = 1, only in Select let _s = b.select(c, x, y); b.ret_void(); }); let insts = &mf.blocks[0].insts; // Must have a CMP to set flags. assert!( insts.iter().any(|i| i.opcode == ArmOpcode::CmpReg), "expected CmpReg for ICmp" ); // Must have CSEL to select the value. assert!( insts.iter().any(|i| i.opcode == ArmOpcode::CselReg), "expected CselReg for Select" ); // Must NOT have CSET (ICmp boolean materialization is suppressed). assert!( !insts.iter().any(|i| i.opcode == ArmOpcode::Cset), "CSET should be suppressed when ICmp feeds only a Select" ); // Must NOT have a second CmpImm (CMP cond, #0 is suppressed). assert!( !insts.iter().any(|i| i.opcode == ArmOpcode::CmpImm), "CMP cond,#0 should be suppressed when CSEL uses flags directly" ); } #[test] fn csel_no_fusion_when_icmp_has_multiple_uses() { // ICmp used by both a Select and another instruction → CSET is kept. 
let mf = select_simple(|b| { let x = b.const_i32(5); let y = b.const_i32(10); let c = b.icmp(CmpOp::Le, x, y); // use_count[c] = 2 let _s = b.select(c, x, y); // Also use `c` in a logical NOT to force a second use. let _n = b.not(c); b.ret_void(); }); let insts = &mf.blocks[0].insts; // CSET must still be emitted because `c` has multiple uses. assert!( insts.iter().any(|i| i.opcode == ArmOpcode::Cset), "CSET should remain when ICmp has multiple uses" ); } #[test] fn select_fadd() { let mf = select_simple(|b| { let x = b.const_f64(1.0); let y = b.const_f64(2.0); let _z = b.fadd(x, y); b.ret_void(); }); assert!(mf.blocks[0] .insts .iter() .any(|i| i.opcode == ArmOpcode::FaddD)); } #[test] fn select_alloca_and_store() { let mf = select_simple(|b| { let addr = b.alloca(IrType::Int(IntWidth::I32)); let val = b.const_i32(42); b.store(val, addr); b.ret_void(); }); // Should have SubImm (address materialization from FP) and StrImm. assert!(mf.blocks[0] .insts .iter() .any(|i| i.opcode == ArmOpcode::SubImm)); assert!(mf.blocks[0] .insts .iter() .any(|i| i.opcode == ArmOpcode::StrImm)); } #[test] fn select_branch() { let mf = select_simple(|b| { let cond = b.const_bool(true); let bb_t = b.create_block("then"); let bb_f = b.create_block("else"); b.cond_branch(cond, bb_t, vec![], bb_f, vec![]); b.set_block(bb_t); b.ret_void(); b.set_block(bb_f); b.ret_void(); }); // Entry block should have CmpImm + BCond + B. assert!(mf.blocks[0] .insts .iter() .any(|i| i.opcode == ArmOpcode::BCond)); } #[test] fn select_call() { let mf = select_simple(|b| { b.runtime_call(crate::ir::inst::RuntimeFunc::PrintInt, vec![], IrType::Void); b.ret_void(); }); assert!(mf.blocks[0].insts.iter().any(|i| i.opcode == ArmOpcode::Bl)); } #[test] fn select_call_arg_from_later_block_alloca_has_preallocated_vreg() { let mut func = Function::new("test".into(), vec![], IrType::Void); { let mut b = FuncBuilder::new(&mut func); let use_block = b.create_block("use"); let def_block = b.create_block("def"); b.branch(def_block, vec![]); b.set_block(use_block); let dummy = b.const_i64(7); b.call( FuncRef::External("_callee".into()), vec![dummy], IrType::Void, ); b.ret_void(); b.set_block(def_block); let slot = b.alloca(IrType::Ptr(Box::new(IrType::Int(IntWidth::I8)))); b.call( FuncRef::External("_callee".into()), vec![slot], IrType::Void, ); b.branch(use_block, vec![]); } let mf = select_function(&func); assert!( mf.blocks.iter().any(|block| { block.insts.iter().any(|inst| { inst.opcode == ArmOpcode::SubImm && matches!(inst.operands.first(), Some(MachineOperand::VReg(_))) }) }), "alloca address should materialize into a preallocated vreg", ); assert!( mf.blocks .iter() .flat_map(|block| block.insts.iter()) .filter(|inst| inst.opcode == ArmOpcode::Bl) .count() >= 2, "both calls should lower successfully without an unmapped alloca arg vreg", ); } #[test] fn select_i128_runtime_print_uses_wide_symbol_and_pair_regs() { let mf = select_simple(|b| { let wide = b.const_i128(170141183460469231731687303715884105727i128); b.runtime_call( crate::ir::inst::RuntimeFunc::PrintInt, vec![wide], IrType::Void, ); b.ret_void(); }); let asm = crate::codegen::emit::emit_function(&mf); assert!( asm.contains("bl _afs_print_int128"), "runtime i128 print should call the wide symbol:\n{}", asm ); assert!( asm.contains("ldp x0, x1"), "runtime i128 print should marshal the value through the pair-register ABI:\n{}", asm ); } #[test] fn prologue_and_epilogue() { let mf = select_simple(|b| { b.ret_void(); }); let insts = &mf.blocks[0].insts; assert_eq!( insts[0].opcode, 
ArmOpcode::StpPre, "first inst should be STP (prologue)" ); assert_eq!( insts[1].opcode, ArmOpcode::AddImm, "second inst should be ADD FP, SP, #offset" ); assert!( insts.iter().any(|i| i.opcode == ArmOpcode::Ret), "should have RET" ); } #[test] fn const_zero_uses_zr() { let mf = select_simple(|b| { b.const_i32(0); b.ret_void(); }); // const_i32(0) should use MOV dest, WZR (32-bit zero register). let insts = &mf.blocks[0].insts; let has_mov_zr = insts.iter().any(|i| { i.opcode == ArmOpcode::MovReg && i.operands.iter().any(|o| { matches!( o, MachineOperand::PhysReg(PhysReg::Xzr) | MachineOperand::PhysReg(PhysReg::Wzr) ) }) }); assert!(has_mov_zr, "const 0 should use MOV from XZR or WZR"); } // ---- Parallel-copy / branch arg copy tests ---- // // The branch arg copy resolver in `emit_branch_arg_copies` handles // cross-edge moves into block params. When the source/destination // graph contains a cycle, the resolver routes one copy through a // scratch vreg. These tests construct minimal IR functions that // exercise each topology, run isel, and inspect the resulting move // count in the source machine block. /// Helper: count vreg→vreg moves of the given opcode in a block, /// excluding moves that target a physical register (those are /// epilogue/return marshaling, not parallel copies). fn count_vreg_moves(block: &MachineBlock, opcode: ArmOpcode) -> usize { block .insts .iter() .filter(|i| i.opcode == opcode) .filter(|i| { // True parallel copies are VReg → VReg. matches!(i.operands.first(), Some(MachineOperand::VReg(_))) && matches!(i.operands.get(1), Some(MachineOperand::VReg(_))) }) .count() } fn find_block<'a>(mf: &'a MachineFunction, contains: &str) -> &'a MachineBlock { mf.blocks .iter() .find(|b| b.label.contains(contains)) .unwrap_or_else(|| { panic!( "no machine block containing '{}' (have: {:?})", contains, mf.blocks.iter().map(|b| &b.label).collect::>(), ) }) } #[test] fn branch_arg_2_cycle_routes_through_scratch() { // body branches to header swapping the two int params: // br header(pb, pa) // pending = [(pa,pb), (pb,pa)] — pure 2-cycle, requires: // tmp = pb; pb = pa; pa = tmp (3 moves) let mut func = Function::new("test".into(), vec![], IrType::Void); { let mut b = FuncBuilder::new(&mut func); let header = b.create_block("header"); let pa = b.add_block_param(header, IrType::Int(IntWidth::I32)); let pb = b.add_block_param(header, IrType::Int(IntWidth::I32)); let body = b.create_block("body"); let exit = b.create_block("exit"); let v0 = b.const_i32(1); let v1 = b.const_i32(2); b.branch(header, vec![v0, v1]); b.set_block(header); b.cond_branch(pa, body, vec![], exit, vec![]); b.set_block(body); b.branch(header, vec![pb, pa]); b.set_block(exit); b.ret_void(); } let mf = select_function(&func); let body_mb = find_block(&mf, "body"); let moves = count_vreg_moves(body_mb, ArmOpcode::MovReg); assert_eq!( moves, 3, "2-cycle should emit 3 vreg→vreg moves (scratch + 2 swaps), got {}: {:#?}", moves, body_mb.insts, ); } #[test] fn branch_arg_3_cycle_routes_through_scratch() { // br header(pb, pc, pa) — rotate three params left. 
// pending = [(pa,pb),(pb,pc),(pc,pa)] // Resolution: tmp = pb; pb = pc; pc = pa; pa = tmp (4 moves) let mut func = Function::new("test".into(), vec![], IrType::Void); { let mut b = FuncBuilder::new(&mut func); let header = b.create_block("header"); let pa = b.add_block_param(header, IrType::Int(IntWidth::I32)); let pb = b.add_block_param(header, IrType::Int(IntWidth::I32)); let pc = b.add_block_param(header, IrType::Int(IntWidth::I32)); let body = b.create_block("body"); let exit = b.create_block("exit"); let v0 = b.const_i32(1); let v1 = b.const_i32(2); let v2 = b.const_i32(3); b.branch(header, vec![v0, v1, v2]); b.set_block(header); b.cond_branch(pa, body, vec![], exit, vec![]); b.set_block(body); b.branch(header, vec![pb, pc, pa]); b.set_block(exit); b.ret_void(); } let mf = select_function(&func); let body_mb = find_block(&mf, "body"); let moves = count_vreg_moves(body_mb, ArmOpcode::MovReg); assert_eq!( moves, 4, "3-cycle should emit 4 vreg→vreg moves (scratch + 3 rotates), got {}: {:#?}", moves, body_mb.insts, ); } #[test] fn branch_arg_cycle_plus_independent_tail() { // 2-cycle on (pa,pb) plus an independent (pc <- v_extra) tail. // br header(pb, pa, v_extra) // The tail (pc, v_extra) is always safe and emits as a single // move; the 2-cycle adds 3 moves for a total of 4. let mut func = Function::new("test".into(), vec![], IrType::Void); { let mut b = FuncBuilder::new(&mut func); let header = b.create_block("header"); let pa = b.add_block_param(header, IrType::Int(IntWidth::I32)); let pb = b.add_block_param(header, IrType::Int(IntWidth::I32)); let _pc = b.add_block_param(header, IrType::Int(IntWidth::I32)); let body = b.create_block("body"); let exit = b.create_block("exit"); let v0 = b.const_i32(1); let v1 = b.const_i32(2); let v2 = b.const_i32(3); b.branch(header, vec![v0, v1, v2]); b.set_block(header); b.cond_branch(pa, body, vec![], exit, vec![]); b.set_block(body); // Body needs a fresh value for pc so it's not part of the // cycle and so it can't degenerate into pa/pb. let v3 = b.const_i32(99); b.branch(header, vec![pb, pa, v3]); b.set_block(exit); b.ret_void(); } let mf = select_function(&func); let body_mb = find_block(&mf, "body"); let moves = count_vreg_moves(body_mb, ArmOpcode::MovReg); assert_eq!( moves, 4, "cycle+tail should emit 4 vreg→vreg moves (3 for cycle + 1 for tail), got {}: {:#?}", moves, body_mb.insts, ); } #[test] fn branch_arg_mixed_gp_fp_classes() { // Two int params and two float params, all swapped pairwise. // pending splits into a GP 2-cycle and an FP 2-cycle, each of // which independently needs a scratch. // Expected: 3 GP MovReg + 3 FP FmovReg = 6 total moves. let mut func = Function::new("test".into(), vec![], IrType::Void); { let mut b = FuncBuilder::new(&mut func); let header = b.create_block("header"); let ia = b.add_block_param(header, IrType::Int(IntWidth::I32)); let ib = b.add_block_param(header, IrType::Int(IntWidth::I32)); let fa = b.add_block_param(header, IrType::Float(FloatWidth::F64)); let fb = b.add_block_param(header, IrType::Float(FloatWidth::F64)); let body = b.create_block("body"); let exit = b.create_block("exit"); let v0 = b.const_i32(1); let v1 = b.const_i32(2); let f0 = b.const_f64(1.0); let f1 = b.const_f64(2.0); b.branch(header, vec![v0, v1, f0, f1]); b.set_block(header); b.cond_branch(ia, body, vec![], exit, vec![]); b.set_block(body); // Swap both pairs: ints (ib, ia) and floats (fb, fa). 
b.branch(header, vec![ib, ia, fb, fa]); b.set_block(exit); b.ret_void(); } let mf = select_function(&func); let body_mb = find_block(&mf, "body"); let gp_moves = count_vreg_moves(body_mb, ArmOpcode::MovReg); let fp_moves = count_vreg_moves(body_mb, ArmOpcode::FmovReg); assert_eq!( gp_moves, 3, "GP 2-cycle should emit 3 MovReg, got {}: {:#?}", gp_moves, body_mb.insts, ); assert_eq!( fp_moves, 3, "FP 2-cycle should emit 3 FmovReg, got {}: {:#?}", fp_moves, body_mb.insts, ); } #[test] fn logical_arrays_use_default_kind_storage_for_stack_slots() { assert_eq!(alloca_size(&IrType::Array(Box::new(IrType::Bool), 3)), 12); assert_eq!( alloca_size(&IrType::Array(Box::new(IrType::Int(IntWidth::I32)), 3)), 12 ); } // ---- VShape mapping tests (Sprint 12 Stage 2 isel hookup) ---- #[test] fn vshape_recognizes_4xi32() { let ty = IrType::Vector { lanes: 4, elem: Box::new(IrType::Int(IntWidth::I32)), }; assert_eq!(VShape::from_ir(&ty), Some(VShape::V4S)); assert!(!VShape::V4S.is_float()); } #[test] fn vshape_recognizes_2xf64() { let ty = IrType::Vector { lanes: 2, elem: Box::new(IrType::Float(FloatWidth::F64)), }; assert_eq!(VShape::from_ir(&ty), Some(VShape::F2D)); assert!(VShape::F2D.is_float()); } #[test] fn vshape_rejects_unsupported_shape() { // 3 lanes is not a NEON shape; we already verified that // verify.rs rejects it. VShape::from_ir simply returns None // and the isel arm falls back to Nop. let ty = IrType::Vector { lanes: 3, elem: Box::new(IrType::Int(IntWidth::I32)), }; assert_eq!(VShape::from_ir(&ty), None); } #[test] fn vector_type_to_reg_class_returns_v128() { let ty = IrType::Vector { lanes: 4, elem: Box::new(IrType::Float(FloatWidth::F32)), }; assert_eq!(type_to_reg_class(&ty), RegClass::V128); } /// End-to-end: build a tiny IR function that adds two 4×f32 /// vectors and walk through isel. The result MachineFunction /// must contain at least one `FaddV4S` opcode. #[test] fn isel_lowers_vadd_4xf32_to_faddv4s() { use crate::codegen::mir::ArmOpcode; let v_ty = IrType::Vector { lanes: 4, elem: Box::new(IrType::Float(FloatWidth::F32)), }; let mut func = Function::new("vadd_test".into(), vec![], IrType::Void); { let mut b = FuncBuilder::new(&mut func); // Two pointer params synthesized via alloca for the // smoke test — keeps the body small but exercises the // VLoad / VAdd / VStore chain. let p_a = b.alloca(v_ty.clone()); let p_b = b.alloca(v_ty.clone()); let p_dst = b.alloca(v_ty.clone()); let va = b.vload(p_a, v_ty.clone()); let vb = b.vload(p_b, v_ty.clone()); let vc = b.vadd(va, vb); b.vstore(vc, p_dst); b.ret_void(); } let mf = select_function(&func); let opcodes: Vec = mf.blocks.iter().flat_map(|b| b.insts.iter()).map(|i| i.opcode).collect(); assert!( opcodes.contains(&ArmOpcode::FaddV4S), "expected FaddV4S in MIR, got {:?}", opcodes ); assert!( opcodes.contains(&ArmOpcode::LdrQ), "expected LdrQ in MIR, got {:?}", opcodes ); assert!( opcodes.contains(&ArmOpcode::StrQ), "expected StrQ in MIR, got {:?}", opcodes ); } #[test] fn vector_abi_arg_uses_v0_to_v7() { // First 8 vector args should land in v0-v7. The 9th should // overflow to the stack at the next 16-byte slot. let mut state = AbiArgState::default(); let v_ty = IrType::Vector { lanes: 4, elem: Box::new(IrType::Float(FloatWidth::F32)), }; for expected in 0u8..8 { assert_eq!( classify_abi_arg(&v_ty, &mut state), AbiArgLoc::V128(expected), "vector arg #{} should be v{}", expected, expected ); } // 9th vector arg overflows to stack. 
match classify_abi_arg(&v_ty, &mut state) { AbiArgLoc::Stack(_) => {} other => panic!("expected Stack overflow, got {:?}", other), } } #[test] fn vector_args_share_idx_with_float_args() { // AAPCS64: vector and float args draw from the same v0-v7 // pool. A float arg should bump fp_idx, then a vector arg // should land at the next slot. let mut state = AbiArgState::default(); let f_ty = IrType::Float(FloatWidth::F64); let v_ty = IrType::Vector { lanes: 2, elem: Box::new(IrType::Int(IntWidth::I64)), }; assert_eq!(classify_abi_arg(&f_ty, &mut state), AbiArgLoc::Fp(0)); assert_eq!(classify_abi_arg(&v_ty, &mut state), AbiArgLoc::V128(1)); assert_eq!(classify_abi_arg(&f_ty, &mut state), AbiArgLoc::Fp(2)); } #[test] fn isel_lowers_vfma_2xf64_to_fmlav2d() { use crate::codegen::mir::ArmOpcode; let v_ty = IrType::Vector { lanes: 2, elem: Box::new(IrType::Float(FloatWidth::F64)), }; let mut func = Function::new("vfma_test".into(), vec![], IrType::Void); { let mut b = FuncBuilder::new(&mut func); let p_a = b.alloca(v_ty.clone()); let p_b = b.alloca(v_ty.clone()); let p_c = b.alloca(v_ty.clone()); let va = b.vload(p_a, v_ty.clone()); let vb = b.vload(p_b, v_ty.clone()); let vc = b.vload(p_c, v_ty.clone()); let _ = b.vfma(va, vb, vc); b.ret_void(); } let mf = select_function(&func); let opcodes: Vec = mf.blocks.iter().flat_map(|b| b.insts.iter()).map(|i| i.opcode).collect(); assert!( opcodes.contains(&ArmOpcode::FmlaV2D), "expected FmlaV2D, got {:?}", opcodes ); } }
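// A minimal end-to-end sketch of the MOVZ/MOVK chunking described above in
// `emit_const_int`, assuming the same `FuncBuilder` helpers the main test
// module uses. Hypothetical extra coverage, not part of the original suite.
#[cfg(test)]
mod const_chunk_examples {
    use super::*;
    use crate::ir::builder::FuncBuilder;

    #[test]
    fn multi_chunk_i64_const_lowers_to_movz_plus_movk() {
        let mut func = Function::new("test".into(), vec![], IrType::Void);
        {
            let mut b = FuncBuilder::new(&mut func);
            // Chunks at shift 0 (0x002A) and shift 32 (0x0001); shifts 16/48 are zero.
            b.const_i64(0x0001_0000_002A);
            b.ret_void();
        }
        let mf = select_function(&func);
        let insts = &mf.blocks[0].insts;
        // One MOVZ for the first non-zero chunk, at least one MOVK for the rest.
        assert!(insts.iter().any(|i| i.opcode == ArmOpcode::Movz));
        assert!(insts.iter().any(|i| i.opcode == ArmOpcode::Movk));
    }
}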