//! Tail call optimization (post-regalloc peephole). //! //! After register allocation and callee-save insertion, the machine code for //! a call in tail position looks like: //! //! ```text //! ; arg setup (MOV xi, …) //! Bl _callee //! ; callee-save restores (LdpOffset / LdrImm / LdrFpImm) — zero or more //! LdpPost x29, x30, [sp], #16 ; epilogue frame restore //! Ret //! ``` //! //! We convert this to: //! //! ```text //! ; arg setup (unchanged — all spill loads happen before LdpPost) //! ; callee-save restores (unchanged) //! LdpPost x29, x30, [sp], #16 //! B _callee ; tail jump (not BL) //! ``` //! //! Correctness argument //! -------------------- //! * Argument registers (x0–x7, d0–d7) are loaded BEFORE the LdpPost, so //! they hold the correct values when the tail branch executes. //! * Callee-saved restores (x19–x28, d8–d15) are disjoint from the //! argument registers; restoring them cannot clobber the args. //! * LdpPost restores x29 (our FP) and x30 (our LR). After it fires, LR //! holds our *caller's* return address. When _callee executes its own //! RET, it returns to *our* caller directly — exactly what TCO requires. //! * We only recognize this pattern when there are **no instructions between //! Bl and the callee-restore cluster**. In particular, non-void calls //! whose return-value register survives coalesce_moves are handled because //! `coalesce_moves` already eliminated any `MOV x0, x0` self-moves, leaving //! the Bl immediately adjacent to the callee restores / LdpPost. //! * Gate: we don't fire on non-void calls where a non-trivial result-capture //! sequence remains (e.g., `MOV x1, x0`) — those are left alone. use super::mir::{ArmOpcode, MachineFunction, MachineInst, MachineOperand, PhysReg}; use std::collections::{HashMap, HashSet}; /// Run tail call optimization on a single machine function. /// /// Safe to call at any optimization level; the transformation never changes /// visible behavior and is always a code-size win (removes one instruction). pub fn tail_call_opt(mf: &mut MachineFunction) { for block in &mut mf.blocks { let n = block.insts.len(); if n < 2 { continue; } // Epilogue is always `LdpPost; Ret` at the very end. if block.insts[n - 1].opcode != ArmOpcode::Ret { continue; } if block.insts[n - 2].opcode != ArmOpcode::LdpPost { continue; } let ldp_idx = n - 2; // Walk backwards from just before LdpPost, skipping callee-save // restore instructions (LdpOffset, LdrImm, LdrFpImm). Stop when // we find something that isn't a callee restore. let mut bl_candidate = ldp_idx; while bl_candidate > 0 { bl_candidate -= 1; match block.insts[bl_candidate].opcode { ArmOpcode::LdpOffset | ArmOpcode::LdrImm | ArmOpcode::LdrFpImm => { // Callee-save restore — keep scanning backwards. } ArmOpcode::Bl => { // Found the BL — stop here. break; } _ => { // Non-callee-restore, non-BL — pattern doesn't match. bl_candidate = usize::MAX; // sentinel break; } } } // Sentinel or scanned to index 0 without finding Bl. if bl_candidate == usize::MAX { continue; } if block.insts[bl_candidate].opcode != ArmOpcode::Bl { continue; } // SAFETY: reject TCO when any argument register (x0–x7) holds a value // derived from our frame pointer (e.g. a pointer to a stack-allocated // local / derived-type struct). After the epilogue tears down our // frame, the callee's prologue reuses that memory; any pointer into it // becomes dangling. Taint analysis: track GP registers set from // `sub xN, x29, #M` (alloca) and propagated through MovReg / AddReg / // AddImm / Mul. If any x0–x7 is tainted, the tail call is unsafe. if has_frame_derived_arg(&block.insts[..bl_candidate]) { continue; } // Extract the call target from the Bl operand. let label = match block.insts[bl_candidate].operands.first() { Some(MachineOperand::Extern(s)) => s.clone(), _ => continue, // indirect call or unexpected operand — skip }; // Transform: // Remove `Bl _label` at bl_candidate. // Remove `Ret` (last instruction). // Append `B _label` (tail branch to external symbol). // // The callee restores and LdpPost between bl_candidate and ldp_idx // shift down by 1 (because we removed bl_candidate), but stay in // the right relative order. block.insts.remove(bl_candidate); block.insts.pop(); // remove Ret (was the last instruction) block.insts.push(MachineInst { opcode: ArmOpcode::B, operands: vec![MachineOperand::Extern(label)], def: None, }); } } // --------------------------------------------------------------------------- // Safety helpers // --------------------------------------------------------------------------- /// Returns true if any GP argument register (x0–x7) contains a frame-derived /// pointer at the point of the Bl. /// /// "Frame-derived" means the register was set — directly or transitively — from /// a `sub xN, x29, #M` (alloca materialization). /// /// The analysis is a forward taint propagation over both registers AND /// FP-relative stack slots, so it correctly handles the spill/reload pattern: /// /// ```text /// sub x10, x29, #4 ; x10 = frame addr (tainted) /// str x10, [x29, #-16] ; slot -16 now tainted /// ... /// ldr x9, [x29, #-16] ; x9 = frame addr (tainted via slot) /// mov x0, x9 ; x0 tainted → unsafe TCO /// ``` fn has_frame_derived_arg(insts: &[MachineInst]) -> bool { // GP register numbers whose current value is derived from the frame pointer. let mut tainted_regs: HashSet = HashSet::new(); // FP-relative offsets whose memory contents are frame-derived pointers. let mut tainted_slots: HashSet = HashSet::new(); // GP registers known to hold `fp + offset` or `fp - offset`. let mut frame_addr_regs: HashMap = HashMap::new(); for inst in insts { if let Some(dst) = written_gp_reg(inst) { frame_addr_regs.remove(&dst); } match inst.opcode { // sub xN, x29, #imm → xN holds a frame-relative address. ArmOpcode::SubImm if op_is_fp(inst, 1) => { if let Some(n) = op_gp(inst, 0) { tainted_regs.insert(n); if let Some(imm) = op_imm(inst, 2) { frame_addr_regs.insert(n, -imm); } } } // add xN, x29, #imm → xN holds a frame-relative address. ArmOpcode::AddImm if op_is_fp(inst, 1) => { if let Some(n) = op_gp(inst, 0) { tainted_regs.insert(n); if let Some(imm) = op_imm(inst, 2) { frame_addr_regs.insert(n, imm); } } } // add xN, xM, #imm where xM is a known frame address. ArmOpcode::AddImm => { if let (Some(dst), Some(src), Some(imm)) = (op_gp(inst, 0), op_gp(inst, 1), op_imm(inst, 2)) { if let Some(base_off) = frame_addr_regs.get(&src).copied() { tainted_regs.insert(dst); frame_addr_regs.insert(dst, base_off + imm); } else if tainted_regs.contains(&src) { tainted_regs.insert(dst); } } } // sub xN, xM, #imm where xM is a known frame address. ArmOpcode::SubImm => { if let (Some(dst), Some(src), Some(imm)) = (op_gp(inst, 0), op_gp(inst, 1), op_imm(inst, 2)) { if let Some(base_off) = frame_addr_regs.get(&src).copied() { tainted_regs.insert(dst); frame_addr_regs.insert(dst, base_off - imm); } else if tainted_regs.contains(&src) { tainted_regs.insert(dst); } } } // add xN, xM, xP (GEP: propagate taint from either source) ArmOpcode::AddReg if op_gp(inst, 1).is_some_and(|n| tainted_regs.contains(&n)) || op_gp(inst, 2).is_some_and(|n| tainted_regs.contains(&n)) => { if let Some(n) = op_gp(inst, 0) { tainted_regs.insert(n); } } // mov xN, xM (register copy — propagates taint to arg reg) ArmOpcode::MovReg if op_gp(inst, 1).is_some_and(|n| tainted_regs.contains(&n)) => { if let Some(n) = op_gp(inst, 0) { tainted_regs.insert(n); if let Some(src) = op_gp(inst, 1) { if let Some(off) = frame_addr_regs.get(&src).copied() { frame_addr_regs.insert(n, off); } } } } // mul xN, xM, xP (index computation in GEP; conservative) ArmOpcode::Mul if op_gp(inst, 1).is_some_and(|n| tainted_regs.contains(&n)) || op_gp(inst, 2).is_some_and(|n| tainted_regs.contains(&n)) => { if let Some(n) = op_gp(inst, 0) { tainted_regs.insert(n); } } // str xN, [x29, #off] — if xN is tainted, the slot becomes tainted. ArmOpcode::StrImm if op_gp(inst, 0).is_some_and(|n| tainted_regs.contains(&n)) && effective_frame_slot_offset(inst, 1, 2, &frame_addr_regs).is_some() => { if let Some(off) = effective_frame_slot_offset(inst, 1, 2, &frame_addr_regs) { tainted_slots.insert(off); } } // ldr xN, [frame] — if the slot is known tainted, xN becomes // tainted. Also conservatively reject tail calls when any 64-bit // GP register is reloaded from our frame in the tail block: the // slot may have been populated in a predecessor with an escaped // local address, then copied into x0–x7 later in the block. ArmOpcode::LdrImm => { if let Some(off) = effective_frame_slot_offset(inst, 1, 2, &frame_addr_regs) { if let Some(n) = op_gp(inst, 0) { if tainted_slots.contains(&off) || n <= 30 { tainted_regs.insert(n); } } } } _ => {} } } // Argument registers are x0–x7 (PhysReg::Gp(0)–Gp(7)). (0u8..8).any(|n| tainted_regs.contains(&n)) } /// True if operand at `idx` is the frame pointer (x29 = PhysReg::Gp(29)). #[inline] fn op_is_fp(inst: &MachineInst, idx: usize) -> bool { matches!( inst.operands.get(idx), Some(MachineOperand::PhysReg(p)) if *p == PhysReg::FP ) } /// GP register number (0–30) for the PhysReg::Gp operand at `idx`, or None. #[inline] fn op_gp(inst: &MachineInst, idx: usize) -> Option { match inst.operands.get(idx)? { MachineOperand::PhysReg(PhysReg::Gp(n)) => Some(*n), _ => None, } } /// Integer immediate operand at `idx`, or None. #[inline] fn op_imm(inst: &MachineInst, idx: usize) -> Option { match inst.operands.get(idx)? { MachineOperand::Imm(v) => Some(*v), MachineOperand::FrameSlot(v) => Some(*v as i64), _ => None, } } /// Frame-pointer-relative offset for the operand at `idx`, or None. /// Accepts both `Imm` and `FrameSlot` variants. #[inline] fn op_fp_offset(inst: &MachineInst, idx: usize) -> Option { match inst.operands.get(idx)? { MachineOperand::Imm(v) => Some(*v), MachineOperand::FrameSlot(v) => Some(*v as i64), _ => None, } } /// Effective FP-relative offset addressed by `[base, #off]`, where `base` is /// either FP directly or a GP register previously materialized from FP. #[inline] fn effective_frame_slot_offset( inst: &MachineInst, base_idx: usize, off_idx: usize, frame_addr_regs: &HashMap, ) -> Option { let off = op_imm(inst, off_idx).unwrap_or(0); match inst.operands.get(base_idx)? { MachineOperand::PhysReg(p) if *p == PhysReg::FP => Some(off), MachineOperand::PhysReg(PhysReg::Gp(n)) => frame_addr_regs.get(n).map(|base| base + off), _ => None, } } /// GP register written by this instruction, if operand 0 is a GP destination. #[inline] fn written_gp_reg(inst: &MachineInst) -> Option { match inst.opcode { ArmOpcode::AddReg | ArmOpcode::AddImm | ArmOpcode::SubReg | ArmOpcode::SubImm | ArmOpcode::Mul | ArmOpcode::MovReg | ArmOpcode::LdrImm => op_gp(inst, 0), _ => None, } } // --------------------------------------------------------------------------- // Tests // --------------------------------------------------------------------------- #[cfg(test)] mod tests { use super::*; use crate::codegen::mir::*; fn make_block(insts: Vec) -> MachineBlock { MachineBlock { label: "test".into(), insts, id: MBlockId(0), } } fn bl(label: &str) -> MachineInst { MachineInst { opcode: ArmOpcode::Bl, operands: vec![MachineOperand::Extern(label.into())], def: None, } } fn ldp_post() -> MachineInst { MachineInst { opcode: ArmOpcode::LdpPost, operands: vec![ MachineOperand::PhysReg(PhysReg::FP), MachineOperand::PhysReg(PhysReg::LR), MachineOperand::PhysReg(PhysReg::Sp), ], def: None, } } fn ret() -> MachineInst { MachineInst { opcode: ArmOpcode::Ret, operands: vec![], def: None, } } fn ldr_callee_restore() -> MachineInst { MachineInst { opcode: ArmOpcode::LdrImm, operands: vec![ MachineOperand::PhysReg(PhysReg::Gp(19)), MachineOperand::PhysReg(PhysReg::FP), MachineOperand::Imm(-8), ], def: Some(VRegId(19)), } } fn build_mf(blocks: Vec>) -> MachineFunction { let mut mf = MachineFunction::new("test".into()); // MachineFunction starts with one empty block; overwrite it. mf.blocks[0].insts = blocks[0].clone(); for blk_insts in blocks.into_iter().skip(1) { let id = mf.new_block("bb"); mf.block_mut(id).insts = blk_insts; } mf } #[test] fn void_tail_call_no_callee_saves() { // Pattern: Bl; LdpPost; Ret → LdpPost; B let mut mf = build_mf(vec![vec![bl("_foo"), ldp_post(), ret()]]); tail_call_opt(&mut mf); let insts = &mf.blocks[0].insts; // Should now be: LdpPost, B _foo assert_eq!(insts.len(), 2); assert_eq!(insts[0].opcode, ArmOpcode::LdpPost); assert_eq!(insts[1].opcode, ArmOpcode::B); assert_eq!(insts[1].operands[0], MachineOperand::Extern("_foo".into())); } #[test] fn void_tail_call_with_callee_restore() { // Pattern: Bl; LdrImm(restore); LdpPost; Ret → LdrImm; LdpPost; B let mut mf = build_mf(vec![vec![ bl("_bar"), ldr_callee_restore(), ldp_post(), ret(), ]]); tail_call_opt(&mut mf); let insts = &mf.blocks[0].insts; assert_eq!(insts.len(), 3); assert_eq!(insts[0].opcode, ArmOpcode::LdrImm); assert_eq!(insts[1].opcode, ArmOpcode::LdpPost); assert_eq!(insts[2].opcode, ArmOpcode::B); } #[test] fn no_tco_when_non_callee_restore_between_bl_and_ldp() { // A non-trivial instruction (e.g., MovReg for result capture) blocks TCO. let mov_result = MachineInst { opcode: ArmOpcode::MovReg, operands: vec![ MachineOperand::PhysReg(PhysReg::Gp(1)), // x1, not x0 MachineOperand::PhysReg(PhysReg::Gp(0)), // x0 (result) ], def: None, }; let mut mf = build_mf(vec![vec![bl("_baz"), mov_result, ldp_post(), ret()]]); tail_call_opt(&mut mf); let insts = &mf.blocks[0].insts; // No transformation — Bl should still be present. assert!( insts.iter().any(|i| i.opcode == ArmOpcode::Bl), "Bl should NOT be removed when result is captured in non-return register" ); assert!( insts.iter().any(|i| i.opcode == ArmOpcode::Ret), "Ret should still be present" ); } #[test] fn no_tco_when_not_ending_in_ret() { // Block ending in B (not Ret) — no TCO. let b_inst = MachineInst { opcode: ArmOpcode::B, operands: vec![MachineOperand::BlockRef(MBlockId(1))], def: None, }; let mut mf = build_mf(vec![vec![bl("_qux"), ldp_post(), b_inst]]); tail_call_opt(&mut mf); // Should still have Bl (TCO not fired because no Ret). assert!(mf.blocks[0].insts.iter().any(|i| i.opcode == ArmOpcode::Bl)); } #[test] fn no_tco_when_frame_pointer_is_spilled_through_large_offset_slot() { // Repro shape from fortbite O1: // sub x10, fp, #104 // sub x8, fp, #1936 // str x10, [x8] // sub x8, fp, #1936 // ldr x22, [x8] // mov x1, x22 // bl _callee // // The arg in x1 is a pointer into our frame, just spilled/reloaded // through the large-offset materialization form. Tail-branching here // would tear down the frame before the callee consumes x1. let mut mf = build_mf(vec![vec![ MachineInst { opcode: ArmOpcode::SubImm, operands: vec![ MachineOperand::PhysReg(PhysReg::Gp(10)), MachineOperand::PhysReg(PhysReg::FP), MachineOperand::Imm(104), ], def: None, }, MachineInst { opcode: ArmOpcode::SubImm, operands: vec![ MachineOperand::PhysReg(PhysReg::Gp(8)), MachineOperand::PhysReg(PhysReg::FP), MachineOperand::Imm(1936), ], def: None, }, MachineInst { opcode: ArmOpcode::StrImm, operands: vec![ MachineOperand::PhysReg(PhysReg::Gp(10)), MachineOperand::PhysReg(PhysReg::Gp(8)), MachineOperand::Imm(0), ], def: None, }, MachineInst { opcode: ArmOpcode::SubImm, operands: vec![ MachineOperand::PhysReg(PhysReg::Gp(8)), MachineOperand::PhysReg(PhysReg::FP), MachineOperand::Imm(1936), ], def: None, }, MachineInst { opcode: ArmOpcode::LdrImm, operands: vec![ MachineOperand::PhysReg(PhysReg::Gp(22)), MachineOperand::PhysReg(PhysReg::Gp(8)), MachineOperand::Imm(0), ], def: None, }, MachineInst { opcode: ArmOpcode::MovReg, operands: vec![ MachineOperand::PhysReg(PhysReg::Gp(1)), MachineOperand::PhysReg(PhysReg::Gp(22)), ], def: None, }, bl("_callee"), ldp_post(), ret(), ]]); tail_call_opt(&mut mf); assert!( mf.blocks[0].insts.iter().any(|i| i.opcode == ArmOpcode::Bl), "tail-call optimization must not erase the call when an arg reloads a spilled frame pointer" ); assert!( mf.blocks[0] .insts .iter() .any(|i| i.opcode == ArmOpcode::Ret), "tail-call optimization must leave the normal return path intact" ); } #[test] fn no_tco_when_arg_register_is_reloaded_from_frame_in_tail_block() { let mut mf = build_mf(vec![ vec![ MachineInst { opcode: ArmOpcode::SubImm, operands: vec![ MachineOperand::PhysReg(PhysReg::Gp(10)), MachineOperand::PhysReg(PhysReg::FP), MachineOperand::Imm(104), ], def: None, }, MachineInst { opcode: ArmOpcode::StrImm, operands: vec![ MachineOperand::PhysReg(PhysReg::Gp(10)), MachineOperand::PhysReg(PhysReg::FP), MachineOperand::Imm(-1936), ], def: None, }, MachineInst { opcode: ArmOpcode::B, operands: vec![MachineOperand::BlockRef(MBlockId(1))], def: None, }, ], vec![ MachineInst { opcode: ArmOpcode::LdrImm, operands: vec![ MachineOperand::PhysReg(PhysReg::Gp(1)), MachineOperand::PhysReg(PhysReg::FP), MachineOperand::Imm(-1936), ], def: None, }, bl("_callee"), ldp_post(), ret(), ], ]); tail_call_opt(&mut mf); assert!( mf.blocks[1].insts.iter().any(|i| i.opcode == ArmOpcode::Bl), "tail-call optimization must not fire when x1 is reloaded from our frame in the tail block" ); assert!( mf.blocks[1].insts.iter().any(|i| i.opcode == ArmOpcode::Ret), "tail-call optimization must preserve the return when the arg comes from a frame reload" ); } #[test] fn no_tco_when_temp_register_reloads_frame_slot_before_copying_to_arg() { let mut mf = build_mf(vec![ vec![ MachineInst { opcode: ArmOpcode::SubImm, operands: vec![ MachineOperand::PhysReg(PhysReg::Gp(10)), MachineOperand::PhysReg(PhysReg::FP), MachineOperand::Imm(104), ], def: None, }, MachineInst { opcode: ArmOpcode::StrImm, operands: vec![ MachineOperand::PhysReg(PhysReg::Gp(10)), MachineOperand::PhysReg(PhysReg::FP), MachineOperand::Imm(-1936), ], def: None, }, MachineInst { opcode: ArmOpcode::B, operands: vec![MachineOperand::BlockRef(MBlockId(1))], def: None, }, ], vec![ MachineInst { opcode: ArmOpcode::LdrImm, operands: vec![ MachineOperand::PhysReg(PhysReg::Gp(22)), MachineOperand::PhysReg(PhysReg::FP), MachineOperand::Imm(-1936), ], def: None, }, MachineInst { opcode: ArmOpcode::MovReg, operands: vec![ MachineOperand::PhysReg(PhysReg::Gp(1)), MachineOperand::PhysReg(PhysReg::Gp(22)), ], def: None, }, bl("_callee"), ldp_post(), ret(), ], ]); tail_call_opt(&mut mf); assert!( mf.blocks[1].insts.iter().any(|i| i.opcode == ArmOpcode::Bl), "tail-call optimization must not fire when a temp reloads a frame slot before copying it to x1" ); assert!( mf.blocks[1].insts.iter().any(|i| i.opcode == ArmOpcode::Ret), "tail-call optimization must preserve the return when a frame reload feeds x1 indirectly" ); } }