//! Linear scan register allocator.
//!
//! Replaces the naive spill-everything strategy with proper register assignment.
//! Algorithm: sort live intervals by start, walk in order, assign physical registers.
//! When no register is available, spill whichever interval ends furthest: the active
//! interval with the latest end, or the current one if it ends later still. Insert
//! ldr/str for spill code.
//!
//! Callee-saved registers (x19-x28, d8-d15) are tracked — if used, the function
//! prologue/epilogue must save/restore them.
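//!
//! A minimal sketch of the scan loop this file implements (pseudocode; names
//! illustrative, not the actual signatures below):
//!
//! ```text
//! for interval in intervals_sorted_by_start {
//!     expire(active, interval.start);         // free registers whose holders ended
//!     match free_pool.take(interval) {
//!         Some(reg) => assign(interval, reg), // register available
//!         None => spill_furthest(active, interval),
//!     }
//! }
//! ```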

use super::liveness::compute_liveness;
use super::mir::*;
use std::collections::{HashMap, HashSet};

/// GP registers available for allocation (excludes x18, x29, x30, x31/sp).
/// Ordered: caller-saved first (prefer these to avoid save/restore overhead),
/// then callee-saved.
///
/// x8 is reserved for large-offset addressing during emission.
/// x16, x17 are the linker scratch registers (clobbered by dynamic
/// dispatch stubs).
///
/// x9, x10, x11 are reserved **exclusively** for spill reloads
/// (see GP_SPILL_SCRATCH). They used to be in the allocation pool
/// under the theory that spill code could "borrow" them when
/// they're temporarily free, but that assumption is only valid
/// when the reload is inserted at a point where the borrowed
/// register is dead — and the spill code had no reliable way to
/// determine that. The result was a reload clobbering a
/// freshly-computed live value (the slice-print crash at -O1+).
/// Pinning x9-x11 out of the allocation pool costs 3 vregs worth
/// of pressure and guarantees reloads never clash.
const GP_ALLOC_ORDER: [u8; 22] = [
    // Caller-saved (temporary, no save needed)
    12, 13, 14, 15, // x12-x15
    0, 1, 2, 3, 4, 5, 6, 7, // x0-x7 (args, but available between calls)
    // Callee-saved (must save/restore if used)
    19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
];

/// FP registers available for allocation. Same rationale as GP:
/// d29, d30, d31 are reserved exclusively as spill reload scratch.
const FP_ALLOC_ORDER: [u8; 29] = [
    // Caller-saved
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 0, 1, 2, 3, 4, 5, 6, 7,
    // Callee-saved
    8, 9, 10, 11, 12, 13, 14, 15,
];

/// Callee-saved GP register range.
const GP_CALLEE_SAVED: std::ops::RangeInclusive<u8> = 19..=28;
/// Callee-saved FP register range.
const FP_CALLEE_SAVED: std::ops::RangeInclusive<u8> = 8..=15;

/// Bytes a single spill of a vreg of class `class` occupies. NEON
/// vectors need 16; everything else fits in 8. Used to size frame
/// slots so the LdrQ/StrQ pair on a V128 spill operates on a slot
/// that's actually 16 bytes wide and 16-byte aligned.
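///
/// For example:
///
/// ```text
/// spill_slot_size(RegClass::V128) == 16   // one Q-register slot
/// spill_slot_size(RegClass::Gp64) == 8
/// ```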
fn spill_slot_size(class: RegClass) -> u32 {
    match class {
        RegClass::V128 => 16,
        _ => 8,
    }
}

/// Where a split interval's *post-call* half lives. `Allocated`
/// is the win — the post-half got a register, so the only memory
/// traffic is the str/ldr that bridges the call. `Spilled` is the
/// fall-back: the post-half couldn't be allocated to a register
/// either, so post-call uses load directly from `bridge_slot`
/// (which is the same slot the pre-half wrote into before the
/// call), and we omit the redundant ldr.
#[derive(Debug, Clone, Copy)]
pub enum PostHalf {
    Allocated(PhysReg),
    Spilled,
}

/// One record per call boundary at which a live range was split.
/// Drives `apply_allocation` to (a) rewrite operands of `vreg`
/// based on whether they precede or follow the call, and the
/// post-pass `insert_split_bridges` to (b) emit `str pre_phys,
/// [bridge_slot]` before the call plus, if the post-half got a
/// register, `ldr post_phys, [bridge_slot]` after.
///
/// `call_position` is the BL/BLR's instruction position in the
/// liveness numbering — used by apply_allocation to compare with
/// each operand's cur_pos. `call_index` is the same call's
/// 0-based walk-order ordinal among all calls in the function —
/// used by insert_split_bridges to match the BL/BLR by ordinal,
/// since intervening passes (apply_allocation's spill code) shift
/// numerical positions but preserve walk order.
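///
/// Resulting shape around one split call (sketch; registers, slot
/// offset, and callee illustrative):
///
/// ```text
/// str  x12, [x29, #-24]   // bridge store: pre_phys = x12
/// bl   callee             // the crossing at call_position
/// ldr  x19, [x29, #-24]   // post = Allocated(x19); omitted if Spilled
/// ```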
#[derive(Debug, Clone)]
pub struct SplitRecord {
    pub vreg: VRegId,
    pub call_position: u32,
    pub call_index: u32,
    pub pre_phys: PhysReg,
    pub post: PostHalf,
    pub bridge_slot: i32,
}

/// Result of register allocation.
pub struct AllocResult {
    /// VRegId → assigned PhysReg. For a split vreg, this holds the
    /// pre-call (first-half) physreg; the post-half lives in the
    /// matching `SplitRecord`.
    pub assignments: HashMap<VRegId, PhysReg>,
    /// Spilled vregs → stack frame offset.
    pub spills: HashMap<VRegId, i32>,
    /// Callee-saved registers that were used and need save/restore.
    pub callee_saved_used: Vec<PhysReg>,
    /// Live-range splits performed at call boundaries. Sorted by
    /// `(vreg, call_position)` for deterministic lookup.
    pub split_records: Vec<SplitRecord>,
}

/// Run linear scan register allocation on a machine function.
pub fn linear_scan(mf: &mut MachineFunction) -> AllocResult {
    let liveness = compute_liveness(mf);

    let mut assignments: HashMap<VRegId, PhysReg> = HashMap::new();
    let mut spills: HashMap<VRegId, i32> = HashMap::new();
    // Active intervals: (reg_num, interval_end, current_vreg). The
    // vreg is tracked here so spill victim selection can identify
    // the *current* holder of a physical register without iterating
    // the `assignments` HashMap (which accumulates stale historical
    // entries when registers are reused after expiry, and whose
    // iteration order is non-deterministic). An audit fix surfaced
    // by the mem2reg work: scanning that map for victims produced
    // flaky `select_type.f90` failures and non-reproducible builds.
    let mut active_gp: Vec<(u8, u32, VRegId)> = Vec::new();
    let mut active_fp: Vec<(u8, u32, VRegId)> = Vec::new();
    // Two free pools per class. Non-call-crossing intervals draw
    // from caller-saved first (no save/restore overhead in the
    // prologue/epilogue); the callee-saved tier is reserved for
    // call-crossing intervals, with caller-tier exhaustion as a
    // fall-through. Previously this was a single Vec consumed via
    // pop(), which returned the *last* element first — quietly
    // inverting the policy documented above on GP_ALLOC_ORDER and
    // forcing every function with even a single allocated vreg to
    // mark x28/x27/... as callee_saved_used.
    let split_pool = |order: &[u8], callee: &std::ops::RangeInclusive<u8>| -> (Vec<u8>, Vec<u8>) {
        let mut caller = Vec::new();
        let mut callees = Vec::new();
        for &r in order {
            if callee.contains(&r) {
                callees.push(r);
            } else {
                caller.push(r);
            }
        }
        (caller, callees)
    };
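    // E.g. for the GP pool this yields caller = [12, 13, 14, 15, 0..=7]
    // and callee = [19..=28], preserving GP_ALLOC_ORDER within each tier.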
    let (mut free_gp_caller, mut free_gp_callee) =
        split_pool(&GP_ALLOC_ORDER, &GP_CALLEE_SAVED);
    let (mut free_fp_caller, mut free_fp_callee) =
        split_pool(&FP_ALLOC_ORDER, &FP_CALLEE_SAVED);
    let mut callee_saved_used: HashSet<PhysReg> = HashSet::new();

    // Hard-coded PhysReg writes inside instructions: positions
    // where operand 0 is a PhysReg, e.g. arg-setup
    // `mov PhysReg(x_i), VReg(value)` immediately before each BL.
    // The splitter consults this map to avoid pre_phys candidates
    // that will be clobbered by an arg-setup inside the pre-half's
    // range — picking such a register would corrupt the bridge str
    // because by the time the bridge runs the register no longer
    // holds the pre-half value. We only constrain the splitter
    // here (not the general allocator) to keep the inflated-
    // liveness compensations elsewhere intact.
    let mut phys_writes_gp: HashMap<u8, Vec<u32>> = HashMap::new();
    let mut phys_writes_fp: HashMap<u8, Vec<u32>> = HashMap::new();
    {
        let mut p: u32 = 0;
        for block in &mf.blocks {
            for inst in &block.insts {
                if let Some(MachineOperand::PhysReg(phys)) = inst.operands.first() {
                    match phys {
                        PhysReg::Gp(n) | PhysReg::Gp32(n) => {
                            phys_writes_gp.entry(*n).or_default().push(p);
                        }
                        PhysReg::Fp(n) | PhysReg::Fp32(n) => {
                            phys_writes_fp.entry(*n).or_default().push(p);
                        }
                        _ => {}
                    }
                }
                p += 2;
            }
        }
    }
    let phys_written_in = |phys_n: u8, is_fp: bool, start: u32, end: u32| -> bool {
        let map = if is_fp {
            &phys_writes_fp
        } else {
            &phys_writes_gp
        };
        if let Some(positions) = map.get(&phys_n) {
            positions.iter().any(|&pp| pp > start && pp <= end)
        } else {
            false
        }
    };
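    // E.g. (illustrative): writes to x13 recorded at positions {10, 14}
    // give phys_written_in(13, false, 8, 12) == true (10 lies in (8, 12])
    // and phys_written_in(13, false, 14, 20) == false (neither write lies
    // in (14, 20]).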

    // Tracks live-range splits in progress: each entry maps a
    // synthetic post-half vreg to (original vreg, call position,
    // bridge slot). After allocation we walk this map to
    // materialize SplitRecords using whatever physreg/slot the
    // post-half ended up with.
    let mut splits_in_progress: HashMap<VRegId, (VRegId, u32, i32)> = HashMap::new();

    // Per-vreg actual def/use position range, computed by direct
    // walk of the MIR. Used by the splitter to sanity-check
    // crosses_call before deciding to split: the global liveness
    // dataflow has a quirk that extends every dest vreg's live
    // range backward over its own def (op0 of `mov vreg_dst,
    // vreg_src` is treated as both def and use), inflating short
    // vreg-to-vreg copy chains to span entire blocks. The
    // splitter would then see a "call crossing" for a vreg whose
    // real def/use range is well before any call. Splitting such a
    // vreg corrupts the chain. We don't fix the dataflow globally
    // — other regalloc decisions depend on the inflated ranges in
    // ways that surface latent bugs in arg-setup emission — but we
    // do consult this tighter map before committing to a split.
    let mut vreg_actual_range: HashMap<VRegId, (u32, u32)> = HashMap::new();
    // Vregs defined in more than one MIR block — these are
    // phi-like (block params receiving values via the parallel-copy
    // sequence emit_branch_arg_copies inserts at the end of every
    // predecessor). Linear-position liveness can't represent the
    // true CFG-aware live set for these, so the splitter must not
    // act on them: a call block lexically wedged between the def
    // edge and the use block fakes a crossing point that no actual
    // control-flow path traverses, and the resulting split assigns
    // the post-half a different physreg than the pre-half — every
    // predecessor's parallel-copy lands in pre_phys but every use
    // inside the loop reads post_phys. (`realworld_affine_shift.f90`
    // at -O2+ exhibits this: V_iv defined in `if_end_1`, used in
    // `do_check_3`/`do_body_4`, but `if_then_2` carrying the
    // recursive call is lexically between if_end_1 and do_check_3.)
    let mut vreg_def_blocks: HashMap<VRegId, std::collections::HashSet<usize>> =
        HashMap::new();
    {
        let mut p: u32 = 0;
        for (block_idx, block) in mf.blocks.iter().enumerate() {
            for inst in &block.insts {
                let mut touched: Vec<VRegId> = Vec::new();
                if let Some(d) = inst.def {
                    touched.push(d);
                    vreg_def_blocks.entry(d).or_default().insert(block_idx);
                }
                for op in &inst.operands {
                    if let MachineOperand::VReg(v) = op {
                        touched.push(*v);
                    }
                }
                for v in touched {
                    let entry = vreg_actual_range.entry(v).or_insert((p, p));
                    if p < entry.0 {
                        entry.0 = p;
                    }
                    if p > entry.1 {
                        entry.1 = p;
                    }
                }
                p += 2;
            }
        }
    }
    let multi_def_vregs: std::collections::HashSet<VRegId> = vreg_def_blocks
        .iter()
        .filter(|(_, blocks)| blocks.len() > 1)
        .map(|(v, _)| *v)
        .collect();

    // Worklist of intervals to allocate. Starts as a clone of the
    // pre-sorted liveness intervals; splitting inserts post-half
    // intervals at their sorted position so the linear-scan
    // invariant (process in start-position order) is preserved.
    let mut worklist: Vec<super::liveness::LiveInterval> = liveness.intervals.clone();
    let mut idx = 0;
    while idx < worklist.len() {
        let interval = worklist[idx].clone();
        // V128 lives in the same physical V/Q register file as
        // Fp32/Fp64; the regalloc must draw from the FP pool for
        // any of these classes or the vector vreg ends up in a Gp
        // register and emit prints `ldr qx21` (invalid asm).
        let is_fp = matches!(
            interval.class,
            RegClass::Fp32 | RegClass::Fp64 | RegClass::V128
        );

        if is_fp {
            expire_intervals(
                &mut active_fp,
                &mut free_fp_caller,
                &mut free_fp_callee,
                &FP_CALLEE_SAVED,
                interval.start,
            );
        } else {
            expire_intervals(
                &mut active_gp,
                &mut free_gp_caller,
                &mut free_gp_callee,
                &GP_CALLEE_SAVED,
                interval.start,
            );
        }

        let (active, free_caller, free_callee) = if is_fp {
            (&mut active_fp, &mut free_fp_caller, &mut free_fp_callee)
        } else {
            (&mut active_gp, &mut free_gp_caller, &mut free_gp_callee)
        };

        let reg_opt = if interval.crosses_call {
            // Must use callee-saved (caller-saved would be clobbered
            // by the call).
            free_callee.pop()
        } else if let Some(hint) = interval.hint {
            // Try the hinted register first across both tiers.
            if let Some(i) = free_caller.iter().position(|&r| r == hint) {
                Some(free_caller.remove(i))
            } else if let Some(i) = free_callee.iter().position(|&r| r == hint) {
                Some(free_callee.remove(i))
            } else {
                free_caller.pop().or_else(|| free_callee.pop())
            }
        } else {
            free_caller.pop().or_else(|| free_callee.pop())
        };

        // Live-range splitting: a call-crossing interval that
        // can't grab a callee-saved register would otherwise be
        // full-spilled. If it has a single call crossing AND a
        // caller-saved register is available right now, split at
        // the call boundary instead — the pre-half goes to the
        // caller-saved register, the post-half is allocated as a
        // fresh interval, and we bridge the call with str/ldr to
        // a frame slot. Net win: the longer-lived half between
        // the def and the call avoids spill loads on every use.
        // Cross-check `crosses_call` against the actual def/use
        // range from the MIR walk: a vreg whose real range doesn't
        // straddle the supposed crossing point is a false positive
        // from inflated liveness, and must NOT be split.
        let real_crosses = if multi_def_vregs.contains(&interval.vreg) {
            // Phi-like vreg: positions can't represent the true
            // live set (see comment above where multi_def_vregs is
            // built). Refuse to split — the parallel-copy at every
            // predecessor must land in the same physreg as every
            // use, and splitting fundamentally breaks that.
            false
        } else if let Some(&(real_start, real_end)) =
            vreg_actual_range.get(&interval.vreg)
        {
            interval.call_crossings.len() == 1
                && interval.call_crossings[0] > real_start
                && interval.call_crossings[0] < real_end
        } else {
            false
        };
        // Find a caller-saved that survives the pre-half range
        // unclobbered. Reject any register P such that an
        // instruction inside (interval.start, call_pos - 1] writes
        // P — by the time the bridge str fires (right above the
        // arg-setup block), P would no longer hold the pre-half's
        // value and the bridge would capture garbage.
        let safe_pre_phys = if !free_caller.is_empty() && interval.call_crossings.len() == 1 {
            let cp = interval.call_crossings[0];
            let pre_end = cp.saturating_sub(1);
            free_caller.iter().rposition(|&r| {
                !phys_written_in(r, is_fp, interval.start, pre_end)
            })
        } else {
            None
        };
        if let (None, true, true, Some(safe_idx)) =
            (reg_opt, interval.crosses_call, real_crosses, safe_pre_phys)
        {
            let call_pos = interval.call_crossings[0];
            let bridge_slot = mf.alloc_local(spill_slot_size(interval.class));
            let synthetic = mf.new_vreg(interval.class);
            splits_in_progress.insert(synthetic, (interval.vreg, call_pos, bridge_slot));
            // Pull the safe register out of the caller-saved pool
            // and assign it directly: re-running through the normal
            // assignment path could pick a different register that
            // *isn't* phys-write-safe (free_caller.pop() doesn't
            // filter), which would defeat the whole point.
            let safe_reg = free_caller.remove(safe_idx);
            let pre_phys = if is_fp {
                match interval.class {
                    RegClass::Fp32 => PhysReg::Fp32(safe_reg),
                    _ => PhysReg::Fp(safe_reg),
                }
            } else {
                match interval.class {
                    RegClass::Gp32 => PhysReg::Gp32(safe_reg),
                    _ => PhysReg::Gp(safe_reg),
                }
            };
            assignments.insert(interval.vreg, pre_phys);
            // Pre-half ends before the call — track so this slot
            // becomes free for any post-call interval that needs a
            // caller-saved.
            active.push((safe_reg, call_pos.saturating_sub(1), interval.vreg));
            active.sort_by_key(|&(_, end, _)| end);

            // Post-half: a fresh synthetic vreg covering the
            // post-call range. Single original crossing → no
            // crossings remain; cannot recurse-split.
            let post_half = super::liveness::LiveInterval {
                vreg: synthetic,
                class: interval.class,
                start: call_pos.saturating_add(1),
                end: interval.end,
                crosses_call: false,
                call_crossings: Vec::new(),
                hint: None,
            };
            // Insert post-half at its sorted position.
            let post_start = post_half.start;
            let post_vreg = post_half.vreg.0;
            let mut insert_at = idx + 1;
            while insert_at < worklist.len() {
                let w = &worklist[insert_at];
                if (w.start, w.vreg.0) > (post_start, post_vreg) {
                    break;
                }
                insert_at += 1;
            }
            worklist.insert(insert_at, post_half);
            idx += 1;
            continue;
        }

        if let Some(reg) = reg_opt {
            // Register available.
            let phys = if is_fp {
                match interval.class {
                    RegClass::Fp32 => PhysReg::Fp32(reg),
                    _ => PhysReg::Fp(reg),
                }
            } else {
                match interval.class {
                    RegClass::Gp32 => PhysReg::Gp32(reg),
                    _ => PhysReg::Gp(reg),
                }
            };
            assignments.insert(interval.vreg, phys);
            active.push((reg, interval.end, interval.vreg));
            active.sort_by_key(|&(_, end, _)| end);

            // Track callee-saved usage.
            if !is_fp && GP_CALLEE_SAVED.contains(&reg) {
                callee_saved_used.insert(PhysReg::Gp(reg));
            }
            if is_fp && FP_CALLEE_SAVED.contains(&reg) {
                callee_saved_used.insert(PhysReg::Fp(reg));
            }
        } else {
            // No register available — spill. Find the active
            // interval ending furthest. `active` is kept sorted by
            // end (see the `active.sort_by_key(...)` calls above
            // and below), so the last entry is always the furthest.
            // The active list carries the current vreg directly so
            // we don't have to scan the (non-deterministic,
            // stale-entry-laden) assignments map for the victim.
            if let Some(&(spill_reg, spill_end, victim)) = active.last() {
                let last_idx = active.len() - 1;
                if spill_end > interval.end {
                    // Spill the furthest active interval, give its register to us.
                    // The victim's class is what determines its slot size — not
                    // ours, since we're taking the victim's physreg.
                    let victim_class = mf
                        .vregs
                        .iter()
                        .find(|v| v.id == victim)
                        .map(|v| v.class)
                        .unwrap_or(RegClass::Gp64);
                    let offset = mf.alloc_local(spill_slot_size(victim_class));
                    spills.insert(victim, offset);
                    assignments.remove(&victim);

                    let phys = if is_fp {
                        match interval.class {
                            RegClass::Fp32 => PhysReg::Fp32(spill_reg),
                            _ => PhysReg::Fp(spill_reg),
                        }
                    } else {
                        match interval.class {
                            RegClass::Gp32 => PhysReg::Gp32(spill_reg),
                            _ => PhysReg::Gp(spill_reg),
                        }
                    };
                    assignments.insert(interval.vreg, phys);
                    active.remove(last_idx);
                    active.push((spill_reg, interval.end, interval.vreg));
                    active.sort_by_key(|&(_, end, _)| end);
                } else {
                    // Current interval ends later — spill it. Use the
                    // current interval's class for slot sizing.
                    let offset = mf.alloc_local(spill_slot_size(interval.class));
                    spills.insert(interval.vreg, offset);
                }
            } else {
                // No active intervals — shouldn't happen but spill to be safe.
                let offset = mf.alloc_local(spill_slot_size(interval.class));
                spills.insert(interval.vreg, offset);
            }
        }
        idx += 1;
    }

    // Materialize SplitRecords from the in-progress map. Each
    // synthetic post-half vreg either landed on a phys (the win
    // case) or was spilled (the fall-back). Either way we have a
    // bridge slot that links the two halves across the call.
    //
    // Build a position→call-index map from the BL/BLR walk so
    // we can attach a stable ordinal to each split. `liveness`
    // already enumerated calls in walk order, so its
    // `call_positions` would do — but liveness exposes only
    // intervals. Re-derive locally; the call positions are
    // small.
    let call_positions_walk: Vec<u32> = {
        let mut p: u32 = 0;
        let mut cps = Vec::new();
        for block in &mf.blocks {
            for inst in &block.insts {
                if matches!(inst.opcode, ArmOpcode::Bl | ArmOpcode::Blr) {
                    cps.push(p);
                }
                p += 2;
            }
        }
        cps
    };
    let mut split_records: Vec<SplitRecord> = splits_in_progress
        .into_iter()
        .filter_map(|(synthetic, (orig, call_pos, bridge_slot))| {
            let pre_phys = *assignments.get(&orig)?;
            let post = if let Some(&p) = assignments.get(&synthetic) {
                PostHalf::Allocated(p)
            } else if spills.contains_key(&synthetic) {
                PostHalf::Spilled
            } else {
                // No allocation either way — shouldn't happen, but
                // skip the record rather than emit garbage.
                return None;
            };
            assignments.remove(&synthetic);
            spills.remove(&synthetic);
            let call_index = call_positions_walk.binary_search(&call_pos).ok()? as u32;
            Some(SplitRecord {
                vreg: orig,
                call_position: call_pos,
                call_index,
                pre_phys,
                post,
                bridge_slot,
            })
        })
        .collect();
    split_records.sort_by_key(|r| (r.vreg.0, r.call_position));

    // Sort callee-saved for consistent prologue/epilogue ordering.
    let mut callee_saved: Vec<PhysReg> = callee_saved_used.into_iter().collect();
    callee_saved.sort_by_key(|r| match r {
        PhysReg::Gp(n) | PhysReg::Gp32(n) => *n as u16,
        PhysReg::Fp(n) | PhysReg::Fp32(n) => 100 + *n as u16,
        _ => 200,
    });

    AllocResult {
        assignments,
        spills,
        split_records,
        callee_saved_used: callee_saved,
    }
}

/// Position-resolved assignment for a vreg. A vreg that wasn't
/// split has the same assignment everywhere; a split vreg's
/// assignment depends on whether the use is before or after the
/// call boundary, and the post-call assignment may itself be a
/// memory slot (the spilled-post fall-back).
enum LogicalAssignment {
    Reg(PhysReg),
    Slot(i32),
}

/// Apply the allocation result: rewrite VReg operands to PhysReg and
/// insert spill code. For spilled operands, this borrows a register
/// that is provably free at the instruction's position (computed below
/// from the live intervals and split records), falling back to the
/// dedicated GP_SPILL_SCRATCH / FP_SPILL_SCRATCH registers when no
/// free register exists.
pub fn apply_allocation(
    mf: &mut MachineFunction,
    result: &AllocResult,
    liveness: &super::liveness::LivenessResult,
) {
    let vreg_classes: HashMap<VRegId, RegClass> =
        mf.vregs.iter().map(|v| (v.id, v.class)).collect();

    // Build interval lookup: for each vreg, its live range.
    let intervals: HashMap<VRegId, (u32, u32)> = liveness
        .intervals
        .iter()
        .map(|i| (i.vreg, (i.start, i.end)))
        .collect();

    // Compute instruction positions for each block/instruction.
    let mut inst_pos: HashMap<(usize, usize), u32> = HashMap::new();
    let mut pos: u32 = 0;
    for (bi, block) in mf.blocks.iter().enumerate() {
        for (ii, _) in block.insts.iter().enumerate() {
            inst_pos.insert((bi, ii), pos);
            pos += 2;
        }
    }

    // Position-aware lookup: returns the logical assignment of
    // `vid` at `cur_pos`, consulting split_records first so that
    // post-call uses pick up the post-half's location instead of
    // the pre-half's (now-clobbered) caller-saved register.
    let logical_assignment = |vid: VRegId, cur_pos: u32| -> Option<LogicalAssignment> {
        for r in &result.split_records {
            if r.vreg != vid {
                continue;
            }
            if cur_pos > r.call_position {
                return Some(match r.post {
                    PostHalf::Allocated(p) => LogicalAssignment::Reg(p),
                    PostHalf::Spilled => LogicalAssignment::Slot(r.bridge_slot),
                });
            }
            return Some(LogicalAssignment::Reg(r.pre_phys));
        }
        if let Some(p) = result.assignments.get(&vid) {
            Some(LogicalAssignment::Reg(*p))
        } else {
            result.spills.get(&vid).copied().map(LogicalAssignment::Slot)
        }
    };
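    // E.g. (illustrative): given a record { call_position: 40,
    // pre_phys: x12, post: Allocated(x19) }, cur_pos 38 resolves to
    // Reg(x12) and cur_pos 44 to Reg(x19); had the post-half spilled,
    // cur_pos 44 would resolve to Slot(bridge_slot) instead.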

    for block_idx in 0..mf.blocks.len() {
        let mut new_insts = Vec::new();
        let insts = std::mem::take(&mut mf.blocks[block_idx].insts);

        for (inst_idx, inst) in insts.iter().enumerate() {
            let mut rewritten = inst.clone();
            let cur_pos = inst_pos.get(&(block_idx, inst_idx)).copied().unwrap_or(0);

            // Find GP and FP registers NOT in use at this position.
            // A vreg's register is in use during its full live range
            // unless the vreg was split — in which case the
            // pre-half occupies pre_phys during [start,
            // call_position] and the post-half occupies post_phys
            // during (call_position, end].
            //
            // Iterate `result.assignments` in **deterministic vreg
            // order**, not raw HashMap iteration order — the latter
            // varies between runs and produces different temp-reg
            // assignments and therefore non-reproducible builds. The
            // resulting `gp_temps`/`fp_temps` lists are consumed by
            // index, so their order is load-bearing.
            let mut gp_temps: Vec<u8> = Vec::new();
            let mut fp_temps: Vec<u8> = Vec::new();
            let mut sorted_assignments: Vec<(VRegId, PhysReg)> =
                result.assignments.iter().map(|(&v, &p)| (v, p)).collect();
            sorted_assignments.sort_by_key(|(v, _)| v.0);
            // Build the set of phys regs in use at cur_pos. For
            // split vregs, pre_phys is in use up to call_position,
            // post_phys (if Allocated) takes over after.
            let mut used_gp: HashSet<u8> = HashSet::new();
            let mut used_fp: HashSet<u8> = HashSet::new();
            let mark_used = |phys: PhysReg, gp: &mut HashSet<u8>, fp: &mut HashSet<u8>| match phys {
                PhysReg::Gp(n) | PhysReg::Gp32(n) => {
                    gp.insert(n);
                }
                PhysReg::Fp(n) | PhysReg::Fp32(n) => {
                    fp.insert(n);
                }
                _ => {}
            };
            for (vreg, phys) in &sorted_assignments {
                if let Some(&(start, end)) = intervals.get(vreg) {
                    if cur_pos < start || cur_pos > end {
                        continue;
                    }
                    // Split vregs: pre_phys covers [start,
                    // call_position]; post_phys covers
                    // (call_position, end]. assignments[vreg] is
                    // the pre_phys (we removed the synthetic).
                    let split = result.split_records.iter().find(|r| r.vreg == *vreg);
                    if let Some(rec) = split {
                        if cur_pos <= rec.call_position {
                            mark_used(rec.pre_phys, &mut used_gp, &mut used_fp);
                        }
                        if let PostHalf::Allocated(p) = rec.post {
                            if cur_pos > rec.call_position {
                                mark_used(p, &mut used_gp, &mut used_fp);
                            }
                        }
                    } else {
                        mark_used(*phys, &mut used_gp, &mut used_fp);
                    }
                }
            }
            for (_, phys) in &sorted_assignments {
                match phys {
                    PhysReg::Gp(n) | PhysReg::Gp32(n)
                        if !used_gp.contains(n) && !gp_temps.contains(n) =>
                    {
                        gp_temps.push(*n);
                    }
                    PhysReg::Fp(n) | PhysReg::Fp32(n)
                        if !used_fp.contains(n) && !fp_temps.contains(n) =>
                    {
                        fp_temps.push(*n);
                    }
                    _ => {}
                }
            }
            // post_phys registers from split records are also
            // candidates for temp use when *they're* not in use.
            for r in &result.split_records {
                if let PostHalf::Allocated(p) = r.post {
                    let in_use = match p {
                        PhysReg::Gp(n) | PhysReg::Gp32(n) => used_gp.contains(&n),
                        PhysReg::Fp(n) | PhysReg::Fp32(n) => used_fp.contains(&n),
                        _ => true,
                    };
                    if !in_use {
                        match p {
                            PhysReg::Gp(n) | PhysReg::Gp32(n) if !gp_temps.contains(&n) => {
                                gp_temps.push(n);
                            }
                            PhysReg::Fp(n) | PhysReg::Fp32(n) if !fp_temps.contains(&n) => {
                                fp_temps.push(n);
                            }
                            _ => {}
                        }
                    }
                }
            }
            // Also include the dedicated fallback scratches in case no
            // free registers are found.
            for &s in &GP_SPILL_SCRATCH {
                if !gp_temps.contains(&s) {
                    gp_temps.push(s);
                }
            }
            for &s in &FP_SPILL_SCRATCH {
                if !fp_temps.contains(&s) {
                    fp_temps.push(s);
                }
            }

            // For spilled (or post-spilled-after-split) vreg uses:
            // borrow a temp register and load from the slot.
            let mut loads = Vec::new();
            let mut gp_temp_idx = 0usize;
            let mut fp_temp_idx = 0usize;
            for (i, op) in inst.operands.iter().enumerate() {
                if let MachineOperand::VReg(vid) = op {
                    if let Some(LogicalAssignment::Slot(offset)) =
                        logical_assignment(*vid, cur_pos)
                    {
                        let class = vreg_classes.get(vid).copied().unwrap_or(RegClass::Gp64);
                        let (temp_reg, load_op) = match class {
                            RegClass::Fp32 => {
                                let r = fp_temps
                                    .get(fp_temp_idx)
                                    .copied()
                                    .unwrap_or(FP_SPILL_SCRATCH[0]);
                                fp_temp_idx += 1;
                                (PhysReg::Fp32(r), ArmOpcode::LdrFpImm)
                            }
                            RegClass::Fp64 => {
                                let r = fp_temps
                                    .get(fp_temp_idx)
                                    .copied()
                                    .unwrap_or(FP_SPILL_SCRATCH[0]);
                                fp_temp_idx += 1;
                                (PhysReg::Fp(r), ArmOpcode::LdrFpImm)
                            }
                            RegClass::V128 => {
                                let r = fp_temps
                                    .get(fp_temp_idx)
                                    .copied()
                                    .unwrap_or(FP_SPILL_SCRATCH[0]);
                                fp_temp_idx += 1;
                                (PhysReg::Fp(r), ArmOpcode::LdrQ)
                            }
                            RegClass::Gp32 => {
                                let r = gp_temps
                                    .get(gp_temp_idx)
                                    .copied()
                                    .unwrap_or(GP_SPILL_SCRATCH[0]);
                                gp_temp_idx += 1;
                                (PhysReg::Gp32(r), ArmOpcode::LdrImm)
                            }
                            RegClass::Gp64 => {
                                let r = gp_temps
                                    .get(gp_temp_idx)
                                    .copied()
                                    .unwrap_or(GP_SPILL_SCRATCH[0]);
                                gp_temp_idx += 1;
                                (PhysReg::Gp(r), ArmOpcode::LdrImm)
                            }
                        };
                        loads.push((i, temp_reg, load_op, offset));
                    }
                }
            }

            for (op_idx, scratch, load_op, offset) in &loads {
                if *op_idx == 0
                    && inst
                        .def
                        .as_ref()
                        .map(|d| {
                            matches!(
                                logical_assignment(*d, cur_pos),
                                Some(LogicalAssignment::Slot(_))
                            )
                        })
                        .unwrap_or(false)
                {
                    continue;
                }
                emit_spill_access(&mut new_insts, *load_op, *scratch, *offset as i64);
                rewritten.operands[*op_idx] = MachineOperand::PhysReg(*scratch);
            }

            // Rewrite assigned vregs to physical registers using
            // the position-aware lookup so split vregs pick up the
            // correct half.
            for op in &mut rewritten.operands {
                if let MachineOperand::VReg(vid) = op {
                    if let Some(LogicalAssignment::Reg(phys)) =
                        logical_assignment(*vid, cur_pos)
                    {
                        *op = MachineOperand::PhysReg(phys);
                    }
                }
            }

            // Handle def — use a temp that doesn't alias any input temp.
            let def_temp_idx = gp_temp_idx.max(fp_temp_idx);
            let def_spill = if let Some(def_vid) = &inst.def {
                if let Some(LogicalAssignment::Slot(offset)) =
                    logical_assignment(*def_vid, cur_pos)
                {
                    let class = vreg_classes.get(def_vid).copied().unwrap_or(RegClass::Gp64);
                    let temp_reg = match class {
                        RegClass::Fp32 => {
                            let r = fp_temps
                                .get(def_temp_idx)
                                .copied()
                                .unwrap_or(FP_SPILL_SCRATCH[0]);
                            PhysReg::Fp32(r)
                        }
                        RegClass::Fp64 | RegClass::V128 => {
                            let r = fp_temps
                                .get(def_temp_idx)
                                .copied()
                                .unwrap_or(FP_SPILL_SCRATCH[0]);
                            PhysReg::Fp(r)
                        }
                        RegClass::Gp32 => {
                            let r = gp_temps
                                .get(def_temp_idx)
                                .copied()
                                .unwrap_or(GP_SPILL_SCRATCH[0]);
                            PhysReg::Gp32(r)
                        }
                        RegClass::Gp64 => {
                            let r = gp_temps
                                .get(def_temp_idx)
                                .copied()
                                .unwrap_or(GP_SPILL_SCRATCH[0]);
                            PhysReg::Gp(r)
                        }
                    };
                    let scratch = temp_reg;
                    // Replace def operand with scratch.
                    if let Some(MachineOperand::VReg(vid)) = rewritten.operands.first() {
                        if vid == def_vid {
                            rewritten.operands[0] = MachineOperand::PhysReg(scratch);
                        }
                    }
                    Some((scratch, offset, class))
                } else {
                    None
                }
            } else {
                None
            };

            // Bridge instructions for live-range splits are
            // emitted by `insert_split_bridges`, which runs *after*
            // `parallelize_call_arg_moves` so the canonical
            // arg-setup mov sequence isn't fragmented by an
            // interposed store and the parallel-copy resolver can
            // see all the arg-setup movs as one block.
            //
            // The def has been rewritten to a physical register (or
            // a scratch) above, so clear the vreg def marker before
            // re-emitting the instruction.
            rewritten.def = None;
            new_insts.push(rewritten);

            // Store after def for spilled vregs.
            if let Some((scratch, offset, class)) = def_spill {
                let store_op = match class {
                    RegClass::Fp32 | RegClass::Fp64 => ArmOpcode::StrFpImm,
                    RegClass::V128 => ArmOpcode::StrQ,
                    _ => ArmOpcode::StrImm,
                };
                emit_spill_access(&mut new_insts, store_op, scratch, offset as i64);
            }
        }

        mf.blocks[block_idx].insts = new_insts;
    }
}

/// Emit a spill load or store, materializing the FP-relative
/// address through `x8` when the offset is out of the LDR/STR
/// immediate's reach. ARM64's LDUR / STUR (used as a fallback for
/// negative immediates) only encode signed 9-bit offsets in
/// `[-256, 255]`. For wider frames we substitute
/// `sub x8, x29, #|offset|; ldr/str rt, [x8, #0]`. Positive
/// out-of-range offsets get the symmetric `add` treatment.
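///
/// E.g. (sketch, with x9 standing in for `rt`): `offset = -40` is in
/// range and emits
///
/// ```text
/// ldr x9, [x29, #-40]
/// ```
///
/// while `offset = -640` is materialized through x8:
///
/// ```text
/// sub x8, x29, #640
/// ldr x9, [x8, #0]
/// ```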
fn emit_spill_access(
    insts: &mut Vec<MachineInst>,
    op: ArmOpcode,
    rt: PhysReg,
    offset: i64,
) {
    // Most spill offsets encode comfortably as FP-relative immediates.
    // We conservatively materialize the address for anything outside
    // the signed 9-bit LDUR window [-256, 255] (including positive
    // offsets that LDR's scaled form could in principle encode), so a
    // single check covers every opcode this helper is handed.
    let in_range = (-256..=255).contains(&offset);
    if in_range {
        insts.push(MachineInst {
            opcode: op,
            operands: vec![
                MachineOperand::PhysReg(rt),
                MachineOperand::PhysReg(PhysReg::FP),
                MachineOperand::Imm(offset),
            ],
            def: None,
        });
        return;
    }
    let addr = PhysReg::Gp(8);
    let (mat_op, abs) = if offset >= 0 {
        (ArmOpcode::AddImm, offset)
    } else {
        (ArmOpcode::SubImm, -offset)
    };
    insts.push(MachineInst {
        opcode: mat_op,
        operands: vec![
            MachineOperand::PhysReg(addr),
            MachineOperand::PhysReg(PhysReg::FP),
            MachineOperand::Imm(abs),
        ],
        def: None,
    });
    insts.push(MachineInst {
        opcode: op,
        operands: vec![
            MachineOperand::PhysReg(rt),
            MachineOperand::PhysReg(addr),
            MachineOperand::Imm(0),
        ],
        def: None,
    });
}

/// Insert callee-saved register saves in prologue and restores in epilogue.
/// Must be called after apply_allocation so we know which callee-saved regs were used.
pub fn insert_callee_saves(mf: &mut MachineFunction, callee_saved: &[PhysReg]) {
    if callee_saved.is_empty() {
        return;
    }

    // Allocate stack slots for callee-saved registers.
    let mut save_slots: Vec<(PhysReg, i32)> = Vec::new();
    for &reg in callee_saved {
        let offset = mf.alloc_local(8);
        save_slots.push((reg, offset));
    }

    // Insert saves at the start of the entry block (after prologue setup).
    // Find the insertion point: after the STP + ADD (prologue) instructions.
    let prologue_end = mf.blocks[0]
        .insts
        .iter()
        .position(|i| {
            // The prologue is StpPre followed by AddImm. Insert after those.
            !matches!(i.opcode, ArmOpcode::StpPre | ArmOpcode::AddImm)
        })
        .unwrap_or(0);

    // Emit saves, pairing consecutive same-class registers into STP.
    // save_slots are ordered by increasing register number; slots are
    // allocated at decreasing offsets (-8, -16, -24, ...). Adjacent
    // slots differ by 8 bytes — exactly the STP stride for GP64/FP64.
    //
    // STP Xt1, Xt2, [Xn, #off]: stores Xt1 at Xn+off, Xt2 at Xn+off+8.
    // So to store reg_low (at lower_offset) and reg_high (at lower_offset+8):
    //     STP reg_low, reg_high, [FP, #lower_offset]
    let saves = emit_callee_store_pairs(&save_slots, false);
    for (i, save) in saves.into_iter().enumerate() {
        mf.blocks[0].insts.insert(prologue_end + i, save);
    }

    // Insert restores before every epilogue sequence (LdpPost).
    for block in &mut mf.blocks {
        let mut insertions = Vec::new();
        for (i, inst) in block.insts.iter().enumerate() {
            if inst.opcode == ArmOpcode::LdpPost {
                insertions.push(i);
            }
        }
        for &ldp_idx in insertions.iter().rev() {
            // Restores in reverse order (mirror of saves).
            let restores = emit_callee_store_pairs(&save_slots, true);
            for restore in restores.into_iter().rev() {
                block.insts.insert(ldp_idx, restore);
            }
        }
    }
}

/// Build the save (or restore) instruction list for a set of callee-saved
/// slots, pairing consecutive adjacent GP/FP slots into STP/LDP.
///
/// `restore` = true → emit LDP/LDR (load); false → STP/STR (store).
///
/// Pairing rule: slots[i] and slots[i+1] are paired when:
/// * Both are GP or both are FP (same register class)
/// * slots[i+1].offset == slots[i].offset - 8 (adjacent 8-byte slots)
///
/// STP Xt1, Xt2, [Xn, #off] → Xt1 at off, Xt2 at off+8.
/// We pair as: STP slots[i+1].reg, slots[i].reg, [FP, #slots[i+1].offset]
/// because slots[i+1] has the lower (more negative) offset.
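///
/// E.g. save_slots = [(x19, -8), (x20, -16), (d8, -24)] pairs the two
/// adjacent GP slots and leaves the FP slot single (save direction):
///
/// ```text
/// stp x20, x19, [x29, #-16]   // x20 at -16, x19 at -8
/// str d8, [x29, #-24]
/// ```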
fn emit_callee_store_pairs(save_slots: &[(PhysReg, i32)], restore: bool) -> Vec<MachineInst> {
    let mut result = Vec::new();
    let mut i = 0;
    while i < save_slots.len() {
        let (reg1, off1) = save_slots[i];
        // Try to pair with next slot.
        if i + 1 < save_slots.len() {
            let (reg2, off2) = save_slots[i + 1];
            let is_gp1 = !matches!(reg1, PhysReg::Fp(_) | PhysReg::Fp32(_));
            let is_gp2 = !matches!(reg2, PhysReg::Fp(_) | PhysReg::Fp32(_));
            let same_class = is_gp1 == is_gp2;
            // off1 is the higher slot (less negative); off2 = off1 - 8.
            let adjacent = off2 == off1 - 8;
            // STP offset must fit in 7-bit signed × 8: range -512..504.
            let in_range = (-512..=504).contains(&off2);
            if same_class && adjacent && in_range {
                let opcode = if restore {
                    ArmOpcode::LdpOffset
                } else {
                    ArmOpcode::StpOffset
                };
                let (low_reg, high_reg) = (reg2, reg1);
                result.push(MachineInst {
                    opcode,
                    operands: vec![
                        MachineOperand::PhysReg(low_reg),
                        MachineOperand::PhysReg(high_reg),
                        MachineOperand::PhysReg(PhysReg::FP),
                        MachineOperand::Imm(off2 as i64),
                    ],
                    def: None,
                });
                i += 2;
                continue;
            }
        }
        // Emit individual STR/LDR.
        let op = match reg1 {
            PhysReg::Fp(_) | PhysReg::Fp32(_) => {
                if restore {
                    ArmOpcode::LdrFpImm
                } else {
                    ArmOpcode::StrFpImm
                }
            }
            _ => {
                if restore {
                    ArmOpcode::LdrImm
                } else {
                    ArmOpcode::StrImm
                }
            }
        };
        result.push(MachineInst {
            opcode: op,
            operands: vec![
                MachineOperand::PhysReg(reg1),
                MachineOperand::PhysReg(PhysReg::FP),
                MachineOperand::Imm(off1 as i64),
            ],
            def: if restore {
                Some(match reg1 {
                    PhysReg::Gp(n) | PhysReg::Gp32(n) => crate::codegen::mir::VRegId(n.into()),
                    PhysReg::Fp(n) | PhysReg::Fp32(n) => crate::codegen::mir::VRegId(n.into()),
                    _ => crate::codegen::mir::VRegId(0),
                })
            } else {
                None
            },
        });
        i += 1;
    }
    result
}

/// Basic move coalescing: eliminate mov instructions where src == dest.
pub fn coalesce_moves(mf: &mut MachineFunction) {
    for block in &mut mf.blocks {
        block.insts.retain(|inst| {
            if matches!(inst.opcode, ArmOpcode::MovReg | ArmOpcode::FmovReg)
                && inst.operands.len() == 2
                && inst.operands[0] == inst.operands[1]
            {
                return false; // eliminate self-move
            }
            true
        });
    }
}

/// Emit the str/ldr pairs that bridge live-range splits across
/// each call boundary. Runs *after* `parallelize_call_arg_moves`
/// so the parallel-copy resolver sees an unfragmented arg-setup
/// sequence (an interposed `str` between an arg-copy and the BL
/// would otherwise stop the scan-back at the wrong point and the
/// resolver would miss arg-copies that need cycle-breaking).
///
/// For each `SplitRecord` whose `call_position` equals the
/// position of a `Bl`/`Blr` in the function: insert a `str
/// pre_phys, [fp, bridge_slot]` immediately before the call and,
/// if the post-half got its own register, an `ldr post_phys, [fp,
/// bridge_slot]` immediately after. Spilled post-halves rely on
/// the existing per-use spill-load path, which already loads from
/// `bridge_slot` thanks to `apply_allocation`'s position-aware
/// operand rewrite.
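///
/// E.g. (sketch) for one split with pre_phys = x12 and an allocated
/// post-half in x19, the bridge store is hoisted above the arg movs:
///
/// ```text
/// str x12, [x29, #-24]   // above the is_call_arg_copy run
/// mov x0, x21            // arg setup, left unfragmented
/// mov x1, x22
/// bl  callee
/// ldr x19, [x29, #-24]   // post-half reload
/// ```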
pub fn insert_split_bridges(mf: &mut MachineFunction, splits: &[SplitRecord]) {
    if splits.is_empty() {
        return;
    }
    let mut call_seen: u32 = 0;
    for block_idx in 0..mf.blocks.len() {
        let mut new_insts = Vec::with_capacity(mf.blocks[block_idx].insts.len());
        let original = std::mem::take(&mut mf.blocks[block_idx].insts);
        for inst in original.into_iter() {
            let is_call = matches!(inst.opcode, ArmOpcode::Bl | ArmOpcode::Blr);
            if is_call {
                let idx = call_seen;
                // Place bridge `str pre_phys, [fp, slot]` *above* the
                // canonical arg-setup mov sequence. Arg-setup movs
                // can write to caller-saved physregs that are also
                // some split's `pre_phys` — by the time we reach the
                // BL, those registers no longer hold the pre-half's
                // value. Scan back over arg-setup movs and insert
                // bridges at the top of that block.
                let mut insert_at = new_insts.len();
                while insert_at > 0 && is_call_arg_copy(&new_insts[insert_at - 1]) {
                    insert_at -= 1;
                }
                let mut bridges: Vec<MachineInst> = Vec::new();
                for r in splits {
                    if r.call_index != idx {
                        continue;
                    }
                    let store_op = match r.pre_phys {
                        PhysReg::Fp(_) | PhysReg::Fp32(_) => ArmOpcode::StrFpImm,
                        _ => ArmOpcode::StrImm,
                    };
                    bridges.push(MachineInst {
                        opcode: store_op,
                        operands: vec![
                            MachineOperand::PhysReg(r.pre_phys),
                            MachineOperand::PhysReg(PhysReg::FP),
                            MachineOperand::Imm(r.bridge_slot as i64),
                        ],
                        def: None,
                    });
                }
                for (offset, b) in bridges.into_iter().enumerate() {
                    new_insts.insert(insert_at + offset, b);
                }
            }
            new_insts.push(inst);
            if is_call {
                let idx = call_seen;
                for r in splits {
                    if r.call_index != idx {
                        continue;
                    }
                    if let PostHalf::Allocated(p) = r.post {
                        let load_op = match p {
                            PhysReg::Fp(_) | PhysReg::Fp32(_) => ArmOpcode::LdrFpImm,
                            _ => ArmOpcode::LdrImm,
                        };
                        new_insts.push(MachineInst {
                            opcode: load_op,
                            operands: vec![
                                MachineOperand::PhysReg(p),
                                MachineOperand::PhysReg(PhysReg::FP),
                                MachineOperand::Imm(r.bridge_slot as i64),
                            ],
                            def: None,
                        });
                    }
                }
                call_seen += 1;
            }
        }
        mf.blocks[block_idx].insts = new_insts;
    }
}

/// Resolve the entry-block argument-receipt parallel copy. The
/// function prologue copies incoming arg registers (x0..x7,
/// d0..d7) into the registers that the allocator chose for the
/// parameter vregs. Emitted naively this is a sequential `mov`
/// chain, which corrupts values whenever a destination aliases a
/// later source — e.g. `mov x4, x3; mov x3, x4` clobbers x4 before
/// the second move can read it. Historically this was hidden
/// because parameter vregs landed in callee-saved registers
/// (x19..x28) that never alias x0..x7; once the allocator started
/// preferring caller-saved destinations, the latent hazard
/// surfaced.
///
/// Find the leading run of phys-to-phys movs in the entry block
/// (after the frame setup) whose source is an incoming arg
/// register, and run the same parallel-copy resolver
/// (`rewrite_call_arg_copies`) used for pre-call argument setup.
pub fn parallelize_entry_arg_moves(mf: &mut MachineFunction) {
    if mf.blocks.is_empty() {
        return;
    }
    let original = std::mem::take(&mut mf.blocks[0].insts);
    let mut rebuilt: Vec<MachineInst> = Vec::with_capacity(original.len());
    let mut iter = original.into_iter().peekable();

    // Pass through the prologue (StpPre/AddImm/SubImm).
    while let Some(inst) = iter.peek() {
        if matches!(
            inst.opcode,
            ArmOpcode::StpPre | ArmOpcode::AddImm | ArmOpcode::SubImm
        ) {
            rebuilt.push(iter.next().unwrap());
        } else {
            break;
        }
    }

    // Process arg-receipt parallel groups. Tolerate intervening
    // store instructions (which read but don't write registers in
    // the parallel-copy graph) so a result-address spill or
    // partial-receipt store doesn't fragment the receipts. Stop
    // once any instruction overwrites an incoming arg register —
    // beyond that point those values are no longer the original
    // arg values and parallel-copy resolution would be unsound.
    let mut pending: Vec<MachineInst> = Vec::new();
    loop {
        match iter.peek() {
            None => break,
            Some(inst) if is_arg_receipt_copy(inst) => {
                pending.push(iter.next().unwrap());
            }
            Some(inst) if is_transparent_to_arg_receipts(inst) => {
                if !pending.is_empty() {
                    rebuilt.extend(rewrite_call_arg_copies(std::mem::take(&mut pending)));
                }
                rebuilt.push(iter.next().unwrap());
            }
            Some(_) => break,
        }
    }
    if !pending.is_empty() {
        rebuilt.extend(rewrite_call_arg_copies(pending));
    }
    rebuilt.extend(iter);
    mf.blocks[0].insts = rebuilt;
}

/// An instruction is *transparent* to the arg-receipt parallel
/// copy if executing it cannot change the value held in any
/// incoming arg register (x0..x7 / d0..d7). Stores qualify (they
/// only read), as does any compute whose destination lies outside
/// the arg-reg range. Anything that writes into x0..x7 forms a
/// barrier — the receipts that come after it are reading
/// post-clobber values, so reordering across the barrier is
/// unsound.
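///
/// E.g. (illustrative): `str x0, [x29, #16]` is transparent (stores only
/// read); `add x12, x0, #1` is transparent (destination outside the arg
/// range); `add x0, x12, #1` is a barrier (it overwrites x0).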
fn is_transparent_to_arg_receipts(inst: &MachineInst) -> bool {
    if is_store_opcode(inst.opcode) {
        return true;
    }
    if is_branch_or_call_opcode(inst.opcode) {
        return false;
    }
    match inst.operands.first() {
        Some(MachineOperand::PhysReg(dst)) => !is_call_arg_reg(*dst),
        _ => true,
    }
}
| 1317 | |
| 1318 | fn is_store_opcode(op: ArmOpcode) -> bool { |
| 1319 | matches!( |
| 1320 | op, |
| 1321 | ArmOpcode::StrImm |
| 1322 | | ArmOpcode::StrhImm |
| 1323 | | ArmOpcode::StrbImm |
| 1324 | | ArmOpcode::StrFpImm |
| 1325 | | ArmOpcode::StrReg |
| 1326 | | ArmOpcode::StrFpReg |
| 1327 | | ArmOpcode::StrQ |
| 1328 | | ArmOpcode::StpPre |
| 1329 | | ArmOpcode::StpOffset |
| 1330 | ) |
| 1331 | } |
| 1332 | |
| 1333 | fn is_branch_or_call_opcode(op: ArmOpcode) -> bool { |
| 1334 | matches!( |
| 1335 | op, |
| 1336 | ArmOpcode::B |
| 1337 | | ArmOpcode::BCond |
| 1338 | | ArmOpcode::Bl |
| 1339 | | ArmOpcode::Blr |
| 1340 | | ArmOpcode::Ret |
| 1341 | | ArmOpcode::Cbz |
| 1342 | | ArmOpcode::Cbnz |
| 1343 | | ArmOpcode::Tbz |
| 1344 | | ArmOpcode::Tbnz |
| 1345 | ) |
| 1346 | } |
| 1347 | |
| 1348 | /// Reorder physical argument-register copies immediately before calls so later |
| 1349 | /// sources are not clobbered by earlier destination writes. |
| 1350 | /// |
| 1351 | /// For an indirect call (`BLR Xn`), the target register `Xn` is a |
| 1352 | /// live use of the call. If `Xn` lies in the argument-register |
| 1353 | /// range (x0..x7) and one of the pending arg-copy moves writes to |
| 1354 | /// it, the function pointer would be clobbered before the branch. |
/// Save it to a non-arg scratch register (`x10`, distinct from the
/// `x9` that `rewrite_call_arg_copies` uses for cycle-breaking) and
/// rewrite the BLR's operand. This bug stayed hidden as long as
/// indirect-call targets landed in callee-saved registers (which
/// can never alias x0..x7); it surfaces once non-call-crossing
/// values prefer caller-saved registers.
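///
/// Illustrative before/after for a `blr x0` whose arg setup also
/// writes x0:
///
/// ```text
/// // before            // after
/// mov x0, x12          mov x10, x0   // save fn ptr to scratch
/// blr x0               mov x0, x12
///                      blr x10
/// ```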
| 1361 | pub fn parallelize_call_arg_moves(mf: &mut MachineFunction) { |
| 1362 | for block in &mut mf.blocks { |
| 1363 | let mut rebuilt: Vec<MachineInst> = Vec::with_capacity(block.insts.len()); |
| 1364 | for inst in std::mem::take(&mut block.insts) { |
| 1365 | if matches!(inst.opcode, ArmOpcode::Bl | ArmOpcode::Blr) { |
| 1366 | let mut start = rebuilt.len(); |
| 1367 | while start > 0 && is_call_arg_copy(&rebuilt[start - 1]) { |
| 1368 | start -= 1; |
| 1369 | } |
| 1370 | let mut adjusted = inst; |
| 1371 | if matches!(adjusted.opcode, ArmOpcode::Blr) { |
| 1372 | let target_phys = match adjusted.operands.first() { |
| 1373 | Some(MachineOperand::PhysReg(t)) => Some(*t), |
| 1374 | _ => None, |
| 1375 | }; |
| 1376 | if let Some(target) = target_phys { |
| 1377 | if is_call_arg_reg(target) { |
| 1378 | let target_alias = phys_reg_alias(target); |
| 1379 | let writes_target = rebuilt[start..].iter().any(|i| { |
| 1380 | matches!(i.operands.first(), |
| 1381 | Some(MachineOperand::PhysReg(d)) |
| 1382 | if phys_reg_alias(*d) == target_alias) |
| 1383 | }); |
| 1384 | if writes_target { |
| 1385 | let scratch = blr_target_scratch(target); |
| 1386 | let save = MachineInst { |
| 1387 | opcode: move_opcode_for_phys(target), |
| 1388 | operands: vec![ |
| 1389 | MachineOperand::PhysReg(scratch), |
| 1390 | MachineOperand::PhysReg(target), |
| 1391 | ], |
| 1392 | def: None, |
| 1393 | }; |
| 1394 | rebuilt.insert(start, save); |
| 1395 | adjusted.operands[0] = MachineOperand::PhysReg(scratch); |
| 1396 | start += 1; |
| 1397 | } |
| 1398 | } |
| 1399 | } |
| 1400 | } |
| 1401 | if start < rebuilt.len() { |
| 1402 | let pending = rebuilt.split_off(start); |
| 1403 | rebuilt.extend(rewrite_call_arg_copies(pending)); |
| 1404 | } |
| 1405 | rebuilt.push(adjusted); |
| 1406 | } else { |
| 1407 | rebuilt.push(inst); |
| 1408 | } |
| 1409 | } |
| 1410 | block.insts = rebuilt; |
| 1411 | } |
| 1412 | } |
| 1413 | |
| 1414 | /// Scratch used to preserve a BLR target across parallel arg |
| 1415 | /// setup. Distinct from the `x9` / `d29` used by |
| 1416 | /// `rewrite_call_arg_copies` for cycle-breaking so the two scratch |
| 1417 | /// uses never collide. |
| 1418 | fn blr_target_scratch(target: PhysReg) -> PhysReg { |
| 1419 | match target { |
| 1420 | PhysReg::Gp(_) => PhysReg::Gp(10), |
| 1421 | PhysReg::Gp32(_) => PhysReg::Gp32(10), |
| 1422 | _ => panic!("blr_target_scratch: BLR target is always a 64/32-bit GP register"), |
| 1423 | } |
| 1424 | } |
| 1425 | |
| 1426 | // ---- Helpers ---- |
| 1427 | |
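/// A phys-to-phys `mov`/`fmov` whose destination is an outgoing
/// arg register (x0..x7 / d0..d7). Zero materializations
/// (`mov xN, xzr`) are excluded: the zero register can never be
/// clobbered by the other pending copies, so such moves are left
/// in place and bound the backward scan over the copy window.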
| 1428 | fn is_call_arg_copy(inst: &MachineInst) -> bool { |
| 1429 | matches!(inst.opcode, ArmOpcode::MovReg | ArmOpcode::FmovReg) |
| 1430 | && matches!( |
| 1431 | inst.operands.as_slice(), |
| 1432 | [MachineOperand::PhysReg(dst), MachineOperand::PhysReg(src)] |
| 1433 | if is_call_arg_reg(*dst) && !matches!(src, PhysReg::Xzr | PhysReg::Wzr) |
| 1434 | ) |
| 1435 | } |
| 1436 | |
| 1437 | /// Mirror of `is_call_arg_copy` for the function-entry direction: |
| 1438 | /// a phys-to-phys mov whose *source* is an incoming arg register. |
| 1439 | fn is_arg_receipt_copy(inst: &MachineInst) -> bool { |
| 1440 | matches!(inst.opcode, ArmOpcode::MovReg | ArmOpcode::FmovReg) |
| 1441 | && matches!( |
| 1442 | inst.operands.as_slice(), |
| 1443 | [MachineOperand::PhysReg(_), MachineOperand::PhysReg(src)] |
| 1444 | if is_call_arg_reg(*src) |
| 1445 | ) |
| 1446 | } |
| 1447 | |
| 1448 | fn is_call_arg_reg(reg: PhysReg) -> bool { |
| 1449 | match reg { |
| 1450 | PhysReg::Gp(n) | PhysReg::Gp32(n) => n < 8, |
| 1451 | PhysReg::Fp(n) | PhysReg::Fp32(n) => n < 8, |
| 1452 | _ => false, |
| 1453 | } |
| 1454 | } |
| 1455 | |
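/// Width-insensitive register identity: `Gp(3)` and `Gp32(3)` name
/// the same physical register (x3 vs w3), as do `Fp(5)` and
/// `Fp32(5)` (d5 vs s5). Clobber detection must compare at this
/// granularity, or a 32-bit write would slip past a 64-bit check.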
| 1456 | #[derive(Clone, Copy, PartialEq, Eq)] |
| 1457 | enum PhysRegAlias { |
| 1458 | Gp(u8), |
| 1459 | Fp(u8), |
| 1460 | } |
| 1461 | |
| 1462 | fn phys_reg_alias(reg: PhysReg) -> Option<PhysRegAlias> { |
| 1463 | match reg { |
| 1464 | PhysReg::Gp(n) | PhysReg::Gp32(n) => Some(PhysRegAlias::Gp(n)), |
| 1465 | PhysReg::Fp(n) | PhysReg::Fp32(n) => Some(PhysRegAlias::Fp(n)), |
| 1466 | _ => None, |
| 1467 | } |
| 1468 | } |
| 1469 | |
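/// Cycle-breaking scratch for `rewrite_call_arg_copies`: x9 for GP
/// copies, d29 for FP copies (both reserved spill scratches, never
/// handed out by the allocator, so routing a copy through them
/// cannot clobber an allocated value).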
| 1470 | fn scratch_phys_for(reg: PhysReg) -> PhysReg { |
| 1471 | match reg { |
| 1472 | PhysReg::Gp(_) => PhysReg::Gp(9), |
| 1473 | PhysReg::Gp32(_) => PhysReg::Gp32(9), |
| 1474 | PhysReg::Fp(_) => PhysReg::Fp(29), |
| 1475 | PhysReg::Fp32(_) => PhysReg::Fp32(29), |
| 1476 | _ => panic!("call-arg scratch requested for non-register operand"), |
| 1477 | } |
| 1478 | } |
| 1479 | |
| 1480 | fn move_opcode_for_phys(reg: PhysReg) -> ArmOpcode { |
| 1481 | match reg { |
| 1482 | PhysReg::Fp(_) | PhysReg::Fp32(_) => ArmOpcode::FmovReg, |
| 1483 | PhysReg::Gp(_) | PhysReg::Gp32(_) => ArmOpcode::MovReg, |
| 1484 | _ => panic!("move opcode requested for non-register operand"), |
| 1485 | } |
| 1486 | } |
| 1487 | |
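/// Resolve a batch of pending phys-to-phys copies with parallel
/// semantics: repeatedly emit a copy whose destination is not read
/// by any other pending copy; when no such copy exists, every
/// remaining copy participates in a cycle, so break it by routing
/// one source through the class scratch (`scratch_phys_for`).
///
/// A two-register swap, for example, lowers to three moves:
///
/// ```text
/// // pending (parallel): x0 <- x1, x1 <- x0
/// mov x9, x1   // break the cycle through scratch
/// mov x1, x0
/// mov x0, x9
/// ```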
| 1488 | fn rewrite_call_arg_copies(pending_moves: Vec<MachineInst>) -> Vec<MachineInst> { |
| 1489 | let mut pending: Vec<(ArmOpcode, PhysReg, PhysReg)> = pending_moves |
| 1490 | .into_iter() |
| 1491 | .map(|inst| match inst.operands.as_slice() { |
| 1492 | [MachineOperand::PhysReg(dst), MachineOperand::PhysReg(src)] => { |
| 1493 | (inst.opcode, *dst, *src) |
| 1494 | } |
| 1495 | _ => panic!("call-arg copy rewrite saw unexpected operand shape"), |
| 1496 | }) |
| 1497 | .collect(); |
| 1498 | let mut rewritten = Vec::with_capacity(pending.len() + 1); |
| 1499 | |
| 1500 | while !pending.is_empty() { |
| 1501 | let safe_idx = (0..pending.len()).find(|&i| { |
| 1502 | let (_, dst, _) = pending[i]; |
| 1503 | let dst_alias = phys_reg_alias(dst).expect("call-arg copy dst should alias"); |
| 1504 | !pending |
| 1505 | .iter() |
| 1506 | .enumerate() |
| 1507 | .any(|(j, &(_, _, src))| j != i && phys_reg_alias(src) == Some(dst_alias)) |
| 1508 | }); |
| 1509 | |
| 1510 | if let Some(idx) = safe_idx { |
| 1511 | let (opcode, dst, src) = pending.remove(idx); |
| 1512 | rewritten.push(MachineInst { |
| 1513 | opcode, |
| 1514 | operands: vec![MachineOperand::PhysReg(dst), MachineOperand::PhysReg(src)], |
| 1515 | def: None, |
| 1516 | }); |
| 1517 | continue; |
| 1518 | } |
| 1519 | |
| 1520 | let (_, _, src) = pending[0]; |
| 1521 | let scratch = scratch_phys_for(src); |
| 1522 | rewritten.push(MachineInst { |
| 1523 | opcode: move_opcode_for_phys(src), |
| 1524 | operands: vec![ |
| 1525 | MachineOperand::PhysReg(scratch), |
| 1526 | MachineOperand::PhysReg(src), |
| 1527 | ], |
| 1528 | def: None, |
| 1529 | }); |
| 1530 | pending[0].2 = scratch; |
| 1531 | } |
| 1532 | |
| 1533 | rewritten |
| 1534 | } |
| 1535 | |
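/// Standard linear-scan expiry: retire every active interval whose
/// endpoint precedes `pos`, returning each freed register to the
/// pool it came from (callee-saved if it falls in `callee_range`,
/// caller-saved otherwise).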
| 1536 | fn expire_intervals( |
| 1537 | active: &mut Vec<(u8, u32, VRegId)>, |
| 1538 | free_caller: &mut Vec<u8>, |
| 1539 | free_callee: &mut Vec<u8>, |
| 1540 | callee_range: &std::ops::RangeInclusive<u8>, |
| 1541 | pos: u32, |
| 1542 | ) { |
| 1543 | let mut i = 0; |
| 1544 | while i < active.len() { |
| 1545 | if active[i].1 < pos { |
| 1546 | let (reg, _, _) = active.remove(i); |
            if callee_range.contains(&reg) {
| 1548 | free_callee.push(reg); |
| 1549 | } else { |
| 1550 | free_caller.push(reg); |
| 1551 | } |
| 1552 | } else { |
| 1553 | i += 1; |
| 1554 | } |
| 1555 | } |
| 1556 | } |
| 1557 | |
| 1558 | /// Scratch registers for spill code. Multiple scratches needed when an |
| 1559 | /// instruction has multiple spilled operands. |
| 1560 | const GP_SPILL_SCRATCH: [u8; 3] = [9, 10, 11]; |
| 1561 | const FP_SPILL_SCRATCH: [u8; 3] = [29, 30, 31]; |
| 1562 | |
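/// Scratch register and reload opcode for the `idx`-th spilled
/// operand of class `class`. Indices wrap modulo the scratch bank,
/// so distinct scratches are only guaranteed for up to three
/// spilled operands per instruction.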
fn spill_scratch(class: RegClass, idx: usize) -> (PhysReg, ArmOpcode) {
    let gp = GP_SPILL_SCRATCH[idx % GP_SPILL_SCRATCH.len()];
    let fp = FP_SPILL_SCRATCH[idx % FP_SPILL_SCRATCH.len()];
    match class {
        RegClass::Fp64 => (PhysReg::Fp(fp), ArmOpcode::LdrFpImm),
        RegClass::Fp32 => (PhysReg::Fp32(fp), ArmOpcode::LdrFpImm),
        // 128-bit vector spills/fills via LdrQ — same FP scratch
        // bank (the V registers ARE the 128-bit form of the same
        // physical D/S registers).
        RegClass::V128 => (PhysReg::Fp(fp), ArmOpcode::LdrQ),
        RegClass::Gp32 => (PhysReg::Gp32(gp), ArmOpcode::LdrImm),
        RegClass::Gp64 => (PhysReg::Gp(gp), ArmOpcode::LdrImm),
    }
}
| 1578 | |
| 1579 | #[cfg(test)] |
| 1580 | mod tests { |
| 1581 | use super::*; |
| 1582 | use crate::codegen::isel::select_function; |
| 1583 | use crate::ir::builder::FuncBuilder; |
| 1584 | use crate::ir::inst::*; |
| 1585 | use crate::ir::types::*; |
| 1586 | |
| 1587 | #[test] |
| 1588 | fn linear_scan_assigns_registers() { |
| 1589 | let mut func = Function::new("test".into(), vec![], IrType::Void); |
| 1590 | { |
| 1591 | let mut b = FuncBuilder::new(&mut func); |
| 1592 | let x = b.const_i32(10); |
| 1593 | let y = b.const_i32(20); |
| 1594 | let _z = b.iadd(x, y); |
| 1595 | b.ret_void(); |
| 1596 | } |
| 1597 | let mut mf = select_function(&func); |
| 1598 | let result = linear_scan(&mut mf); |
| 1599 | // Should have assignments for the vregs. |
| 1600 | assert!( |
| 1601 | !result.assignments.is_empty(), |
| 1602 | "should have register assignments" |
| 1603 | ); |
| 1604 | // No spills needed for 3 vregs. |
| 1605 | assert!( |
| 1606 | result.spills.is_empty(), |
| 1607 | "should not spill with only 3 vregs" |
| 1608 | ); |
| 1609 | } |
| 1610 | |
| 1611 | #[test] |
| 1612 | fn linear_scan_no_x18() { |
| 1613 | let mut func = Function::new("test".into(), vec![], IrType::Void); |
| 1614 | { |
| 1615 | let mut b = FuncBuilder::new(&mut func); |
| 1616 | // Create many vregs to exercise the allocator. |
| 1617 | let mut vals = Vec::new(); |
| 1618 | for i in 0..20 { |
| 1619 | vals.push(b.const_i32(i)); |
| 1620 | } |
| 1621 | // Use them all. |
| 1622 | let mut sum = vals[0]; |
| 1623 | for &v in &vals[1..] { |
| 1624 | sum = b.iadd(sum, v); |
| 1625 | } |
| 1626 | b.ret_void(); |
| 1627 | } |
| 1628 | let mut mf = select_function(&func); |
| 1629 | let result = linear_scan(&mut mf); |
| 1630 | // x18 must never be assigned. |
| 1631 | for phys in result.assignments.values() { |
| 1632 | match phys { |
| 1633 | PhysReg::Gp(18) | PhysReg::Gp32(18) => { |
| 1634 | panic!("x18 assigned — platform reserved!"); |
| 1635 | } |
| 1636 | _ => {} |
| 1637 | } |
| 1638 | } |
| 1639 | } |
| 1640 | |
| 1641 | #[test] |
| 1642 | fn coalesce_eliminates_self_moves() { |
| 1643 | let mut mf = MachineFunction::new("test".into()); |
| 1644 | let reg = PhysReg::Gp(9); |
| 1645 | mf.blocks[0].insts.push(MachineInst { |
| 1646 | opcode: ArmOpcode::MovReg, |
| 1647 | operands: vec![MachineOperand::PhysReg(reg), MachineOperand::PhysReg(reg)], |
| 1648 | def: None, |
| 1649 | }); |
| 1650 | mf.blocks[0].insts.push(MachineInst { |
| 1651 | opcode: ArmOpcode::Ret, |
| 1652 | operands: vec![], |
| 1653 | def: None, |
| 1654 | }); |
| 1655 | assert_eq!(mf.blocks[0].insts.len(), 2); |
| 1656 | coalesce_moves(&mut mf); |
| 1657 | assert_eq!( |
| 1658 | mf.blocks[0].insts.len(), |
| 1659 | 1, |
| 1660 | "self-move should be eliminated" |
| 1661 | ); |
| 1662 | } |
| 1663 | |
| 1664 | #[test] |
| 1665 | fn parallelize_call_arg_moves_preserves_later_source_registers() { |
| 1666 | let mut mf = MachineFunction::new("test".into()); |
| 1667 | mf.blocks[0].insts.push(MachineInst { |
| 1668 | opcode: ArmOpcode::MovReg, |
| 1669 | operands: vec![ |
| 1670 | MachineOperand::PhysReg(PhysReg::Gp(0)), |
| 1671 | MachineOperand::PhysReg(PhysReg::Gp(28)), |
| 1672 | ], |
| 1673 | def: None, |
| 1674 | }); |
| 1675 | mf.blocks[0].insts.push(MachineInst { |
| 1676 | opcode: ArmOpcode::MovReg, |
| 1677 | operands: vec![ |
| 1678 | MachineOperand::PhysReg(PhysReg::Gp(1)), |
| 1679 | MachineOperand::PhysReg(PhysReg::Gp(0)), |
| 1680 | ], |
| 1681 | def: None, |
| 1682 | }); |
| 1683 | mf.blocks[0].insts.push(MachineInst { |
| 1684 | opcode: ArmOpcode::Blr, |
| 1685 | operands: vec![MachineOperand::PhysReg(PhysReg::Gp(12))], |
| 1686 | def: None, |
| 1687 | }); |
| 1688 | |
| 1689 | parallelize_call_arg_moves(&mut mf); |
| 1690 | |
| 1691 | assert_eq!(mf.blocks[0].insts.len(), 3); |
| 1692 | assert_eq!( |
| 1693 | mf.blocks[0].insts[0].operands, |
| 1694 | vec![ |
| 1695 | MachineOperand::PhysReg(PhysReg::Gp(1)), |
| 1696 | MachineOperand::PhysReg(PhysReg::Gp(0)), |
| 1697 | ] |
| 1698 | ); |
| 1699 | assert_eq!( |
| 1700 | mf.blocks[0].insts[1].operands, |
| 1701 | vec![ |
| 1702 | MachineOperand::PhysReg(PhysReg::Gp(0)), |
| 1703 | MachineOperand::PhysReg(PhysReg::Gp(28)), |
| 1704 | ] |
| 1705 | ); |
| 1706 | } |
| 1707 | |
| 1708 | #[test] |
| 1709 | fn parallelize_call_arg_moves_breaks_cycles_with_scratch() { |
| 1710 | let mut mf = MachineFunction::new("test".into()); |
| 1711 | mf.blocks[0].insts.push(MachineInst { |
| 1712 | opcode: ArmOpcode::MovReg, |
| 1713 | operands: vec![ |
| 1714 | MachineOperand::PhysReg(PhysReg::Gp(0)), |
| 1715 | MachineOperand::PhysReg(PhysReg::Gp(1)), |
| 1716 | ], |
| 1717 | def: None, |
| 1718 | }); |
| 1719 | mf.blocks[0].insts.push(MachineInst { |
| 1720 | opcode: ArmOpcode::MovReg, |
| 1721 | operands: vec![ |
| 1722 | MachineOperand::PhysReg(PhysReg::Gp(1)), |
| 1723 | MachineOperand::PhysReg(PhysReg::Gp(0)), |
| 1724 | ], |
| 1725 | def: None, |
| 1726 | }); |
| 1727 | mf.blocks[0].insts.push(MachineInst { |
| 1728 | opcode: ArmOpcode::Bl, |
| 1729 | operands: vec![MachineOperand::Extern("_callee".into())], |
| 1730 | def: None, |
| 1731 | }); |
| 1732 | |
| 1733 | parallelize_call_arg_moves(&mut mf); |
| 1734 | |
| 1735 | assert_eq!(mf.blocks[0].insts.len(), 4); |
| 1736 | assert_eq!( |
| 1737 | mf.blocks[0].insts[0].operands, |
| 1738 | vec![ |
| 1739 | MachineOperand::PhysReg(PhysReg::Gp(9)), |
| 1740 | MachineOperand::PhysReg(PhysReg::Gp(1)), |
| 1741 | ] |
| 1742 | ); |
| 1743 | assert_eq!( |
| 1744 | mf.blocks[0].insts[1].operands, |
| 1745 | vec![ |
| 1746 | MachineOperand::PhysReg(PhysReg::Gp(1)), |
| 1747 | MachineOperand::PhysReg(PhysReg::Gp(0)), |
| 1748 | ] |
| 1749 | ); |
| 1750 | assert_eq!( |
| 1751 | mf.blocks[0].insts[2].operands, |
| 1752 | vec![ |
| 1753 | MachineOperand::PhysReg(PhysReg::Gp(0)), |
| 1754 | MachineOperand::PhysReg(PhysReg::Gp(9)), |
| 1755 | ] |
| 1756 | ); |
| 1757 | } |
| 1758 | |
| 1759 | #[test] |
| 1760 | fn parallelize_call_arg_moves_ignores_zero_materialization_into_arg_regs() { |
| 1761 | let mut mf = MachineFunction::new("test".into()); |
| 1762 | mf.blocks[0].insts.push(MachineInst { |
| 1763 | opcode: ArmOpcode::MovReg, |
| 1764 | operands: vec![ |
| 1765 | MachineOperand::PhysReg(PhysReg::Gp(4)), |
| 1766 | MachineOperand::PhysReg(PhysReg::Xzr), |
| 1767 | ], |
| 1768 | def: None, |
| 1769 | }); |
| 1770 | mf.blocks[0].insts.push(MachineInst { |
| 1771 | opcode: ArmOpcode::MovReg, |
| 1772 | operands: vec![ |
| 1773 | MachineOperand::PhysReg(PhysReg::Gp(0)), |
| 1774 | MachineOperand::PhysReg(PhysReg::Gp(4)), |
| 1775 | ], |
| 1776 | def: None, |
| 1777 | }); |
| 1778 | mf.blocks[0].insts.push(MachineInst { |
| 1779 | opcode: ArmOpcode::Blr, |
| 1780 | operands: vec![MachineOperand::PhysReg(PhysReg::Gp(19))], |
| 1781 | def: None, |
| 1782 | }); |
| 1783 | |
| 1784 | parallelize_call_arg_moves(&mut mf); |
| 1785 | |
| 1786 | assert_eq!(mf.blocks[0].insts.len(), 3); |
| 1787 | assert_eq!( |
| 1788 | mf.blocks[0].insts[0].operands, |
| 1789 | vec![ |
| 1790 | MachineOperand::PhysReg(PhysReg::Gp(4)), |
| 1791 | MachineOperand::PhysReg(PhysReg::Xzr), |
| 1792 | ] |
| 1793 | ); |
| 1794 | assert_eq!( |
| 1795 | mf.blocks[0].insts[1].operands, |
| 1796 | vec![ |
| 1797 | MachineOperand::PhysReg(PhysReg::Gp(0)), |
| 1798 | MachineOperand::PhysReg(PhysReg::Gp(4)), |
| 1799 | ] |
| 1800 | ); |
| 1801 | } |
| 1802 | |
| 1803 | #[test] |
| 1804 | fn linear_scan_prefers_caller_saved_when_no_calls_cross() { |
| 1805 | // No call-crossing intervals — every assigned vreg should |
| 1806 | // land on a caller-saved register and callee_saved_used |
| 1807 | // should stay empty (no prologue STP/LDP overhead). |
| 1808 | let mut func = Function::new("test".into(), vec![], IrType::Void); |
| 1809 | { |
| 1810 | let mut b = FuncBuilder::new(&mut func); |
| 1811 | let x = b.const_i32(1); |
| 1812 | let y = b.const_i32(2); |
| 1813 | let z = b.iadd(x, y); |
| 1814 | let _w = b.iadd(z, x); |
| 1815 | b.ret_void(); |
| 1816 | } |
| 1817 | let mut mf = select_function(&func); |
| 1818 | let result = linear_scan(&mut mf); |
| 1819 | assert!( |
| 1820 | result.callee_saved_used.is_empty(), |
| 1821 | "no callee-saved expected for call-free function, got {:?}", |
| 1822 | result.callee_saved_used |
| 1823 | ); |
| 1824 | for phys in result.assignments.values() { |
| 1825 | if let PhysReg::Gp(n) | PhysReg::Gp32(n) = phys { |
| 1826 | assert!( |
| 1827 | !GP_CALLEE_SAVED.contains(n), |
| 1828 | "vreg landed on callee-saved x{} despite caller-saved availability", |
| 1829 | n |
| 1830 | ); |
| 1831 | } |
| 1832 | } |
| 1833 | } |
| 1834 | |
| 1835 | #[test] |
| 1836 | fn parallelize_entry_arg_moves_resolves_swap_in_receipts() { |
| 1837 | let mut mf = MachineFunction::new("test".into()); |
| 1838 | // Frame setup. |
| 1839 | mf.blocks[0].insts.push(MachineInst { |
| 1840 | opcode: ArmOpcode::StpPre, |
| 1841 | operands: vec![], |
| 1842 | def: None, |
| 1843 | }); |
| 1844 | mf.blocks[0].insts.push(MachineInst { |
| 1845 | opcode: ArmOpcode::AddImm, |
| 1846 | operands: vec![], |
| 1847 | def: None, |
| 1848 | }); |
        // Arg-receipt swap: mov x4, x3; mov x3, x4. Emitted
        // sequentially, x3 would just get its own old value back
        // (the old x4 is lost); the parallelizer must route
        // through scratch x9.
| 1852 | mf.blocks[0].insts.push(MachineInst { |
| 1853 | opcode: ArmOpcode::MovReg, |
| 1854 | operands: vec![ |
| 1855 | MachineOperand::PhysReg(PhysReg::Gp(4)), |
| 1856 | MachineOperand::PhysReg(PhysReg::Gp(3)), |
| 1857 | ], |
| 1858 | def: None, |
| 1859 | }); |
| 1860 | mf.blocks[0].insts.push(MachineInst { |
| 1861 | opcode: ArmOpcode::MovReg, |
| 1862 | operands: vec![ |
| 1863 | MachineOperand::PhysReg(PhysReg::Gp(3)), |
| 1864 | MachineOperand::PhysReg(PhysReg::Gp(4)), |
| 1865 | ], |
| 1866 | def: None, |
| 1867 | }); |
| 1868 | parallelize_entry_arg_moves(&mut mf); |
| 1869 | let body: Vec<&[MachineOperand]> = mf.blocks[0].insts[2..] |
| 1870 | .iter() |
| 1871 | .map(|i| i.operands.as_slice()) |
| 1872 | .collect(); |
| 1873 | // Expect the parallelizer to insert a scratch save then |
| 1874 | // emit the two receipts without clobbering. |
| 1875 | assert_eq!(body.len(), 3, "swap pair should expand to 3 insts"); |
| 1876 | assert_eq!( |
| 1877 | body[0], |
| 1878 | &[ |
| 1879 | MachineOperand::PhysReg(PhysReg::Gp(9)), |
| 1880 | MachineOperand::PhysReg(PhysReg::Gp(3)), |
| 1881 | ] |
| 1882 | ); |
| 1883 | } |
| 1884 | |
| 1885 | #[test] |
| 1886 | fn parallelize_entry_arg_moves_tolerates_intervening_store() { |
| 1887 | let mut mf = MachineFunction::new("test".into()); |
| 1888 | mf.blocks[0].insts.push(MachineInst { |
| 1889 | opcode: ArmOpcode::StpPre, |
| 1890 | operands: vec![], |
| 1891 | def: None, |
| 1892 | }); |
| 1893 | mf.blocks[0].insts.push(MachineInst { |
| 1894 | opcode: ArmOpcode::AddImm, |
| 1895 | operands: vec![], |
| 1896 | def: None, |
| 1897 | }); |
| 1898 | // mov x10, x0 (transparent: dst x10 outside arg-reg range) |
| 1899 | mf.blocks[0].insts.push(MachineInst { |
| 1900 | opcode: ArmOpcode::MovReg, |
| 1901 | operands: vec![ |
| 1902 | MachineOperand::PhysReg(PhysReg::Gp(10)), |
| 1903 | MachineOperand::PhysReg(PhysReg::Gp(0)), |
| 1904 | ], |
| 1905 | def: None, |
| 1906 | }); |
| 1907 | // str x10, [fp, #-56] (store, transparent — reads x10) |
| 1908 | mf.blocks[0].insts.push(MachineInst { |
| 1909 | opcode: ArmOpcode::StrImm, |
| 1910 | operands: vec![ |
| 1911 | MachineOperand::PhysReg(PhysReg::Gp(10)), |
| 1912 | MachineOperand::PhysReg(PhysReg::FP), |
| 1913 | MachineOperand::Imm(-56), |
| 1914 | ], |
| 1915 | def: None, |
| 1916 | }); |
| 1917 | // Swap pair after the store — must still be parallelized. |
| 1918 | mf.blocks[0].insts.push(MachineInst { |
| 1919 | opcode: ArmOpcode::MovReg, |
| 1920 | operands: vec![ |
| 1921 | MachineOperand::PhysReg(PhysReg::Gp(4)), |
| 1922 | MachineOperand::PhysReg(PhysReg::Gp(3)), |
| 1923 | ], |
| 1924 | def: None, |
| 1925 | }); |
| 1926 | mf.blocks[0].insts.push(MachineInst { |
| 1927 | opcode: ArmOpcode::MovReg, |
| 1928 | operands: vec![ |
| 1929 | MachineOperand::PhysReg(PhysReg::Gp(3)), |
| 1930 | MachineOperand::PhysReg(PhysReg::Gp(4)), |
| 1931 | ], |
| 1932 | def: None, |
| 1933 | }); |
| 1934 | parallelize_entry_arg_moves(&mut mf); |
| 1935 | // The store should still appear (kept in place); the swap |
| 1936 | // pair should have expanded to 3 movs via scratch. |
| 1937 | let opcodes: Vec<ArmOpcode> = mf.blocks[0].insts.iter().map(|i| i.opcode).collect(); |
| 1938 | assert!( |
| 1939 | opcodes.contains(&ArmOpcode::StrImm), |
| 1940 | "store should be preserved across parallelization" |
| 1941 | ); |
| 1942 | let mov_count = opcodes |
| 1943 | .iter() |
| 1944 | .filter(|&&op| op == ArmOpcode::MovReg) |
| 1945 | .count(); |
| 1946 | assert_eq!( |
| 1947 | mov_count, 4, |
| 1948 | "expected 4 movs (1 receipt + 3 from swap expansion), got {}", |
| 1949 | mov_count |
| 1950 | ); |
| 1951 | } |
| 1952 | |
| 1953 | #[test] |
| 1954 | fn parallelize_call_arg_moves_routes_blr_target_through_scratch() { |
| 1955 | let mut mf = MachineFunction::new("test".into()); |
| 1956 | // ldr x0, [...] — function pointer loaded into x0 |
| 1957 | mf.blocks[0].insts.push(MachineInst { |
| 1958 | opcode: ArmOpcode::LdrImm, |
| 1959 | operands: vec![ |
| 1960 | MachineOperand::PhysReg(PhysReg::Gp(0)), |
| 1961 | MachineOperand::PhysReg(PhysReg::FP), |
| 1962 | MachineOperand::Imm(-8), |
| 1963 | ], |
| 1964 | def: None, |
| 1965 | }); |
| 1966 | // mov x0, x12 — arg-setup also targets x0 (clobbers fn ptr). |
| 1967 | mf.blocks[0].insts.push(MachineInst { |
| 1968 | opcode: ArmOpcode::MovReg, |
| 1969 | operands: vec![ |
| 1970 | MachineOperand::PhysReg(PhysReg::Gp(0)), |
| 1971 | MachineOperand::PhysReg(PhysReg::Gp(12)), |
| 1972 | ], |
| 1973 | def: None, |
| 1974 | }); |
| 1975 | mf.blocks[0].insts.push(MachineInst { |
| 1976 | opcode: ArmOpcode::Blr, |
| 1977 | operands: vec![MachineOperand::PhysReg(PhysReg::Gp(0))], |
| 1978 | def: None, |
| 1979 | }); |
| 1980 | parallelize_call_arg_moves(&mut mf); |
| 1981 | // First inst should now be `mov x10, x0` (preserve the fn |
| 1982 | // pointer to scratch); the BLR's operand should be x10. |
| 1983 | assert_eq!(mf.blocks[0].insts[0].opcode, ArmOpcode::LdrImm); |
| 1984 | assert_eq!(mf.blocks[0].insts[1].opcode, ArmOpcode::MovReg); |
| 1985 | assert_eq!( |
| 1986 | mf.blocks[0].insts[1].operands, |
| 1987 | vec![ |
| 1988 | MachineOperand::PhysReg(PhysReg::Gp(10)), |
| 1989 | MachineOperand::PhysReg(PhysReg::Gp(0)), |
| 1990 | ] |
| 1991 | ); |
| 1992 | let blr = mf.blocks[0].insts.last().unwrap(); |
| 1993 | assert_eq!(blr.opcode, ArmOpcode::Blr); |
| 1994 | assert_eq!( |
| 1995 | blr.operands, |
| 1996 | vec![MachineOperand::PhysReg(PhysReg::Gp(10))] |
| 1997 | ); |
| 1998 | } |
| 1999 | |
| 2000 | #[test] |
| 2001 | fn linear_scan_splits_call_crossing_when_callee_saved_exhausted() { |
        // 12 vregs all live across a single BL force the
        // allocator past the 10-deep callee-saved pool. Without
        // splitting, the overflow vregs fully spill; with
        // splitting, each lands on a caller-saved register before
        // the call, bridges the call with a str/ldr pair, and
        // resumes on a caller-saved register afterwards — no
        // whole-range memory traffic.
| 2008 | let mut mf = MachineFunction::new("test".into()); |
| 2009 | let vs: Vec<VRegId> = (0..12).map(|_| mf.new_vreg(RegClass::Gp64)).collect(); |
| 2010 | for (i, &v) in vs.iter().enumerate() { |
| 2011 | mf.blocks[0].insts.push(MachineInst { |
| 2012 | opcode: ArmOpcode::Movz, |
| 2013 | operands: vec![MachineOperand::VReg(v), MachineOperand::Imm(i as i64 + 1)], |
| 2014 | def: Some(v), |
| 2015 | }); |
| 2016 | } |
| 2017 | mf.blocks[0].insts.push(MachineInst { |
| 2018 | opcode: ArmOpcode::Bl, |
| 2019 | operands: vec![MachineOperand::Extern("_foo".into())], |
| 2020 | def: None, |
| 2021 | }); |
| 2022 | for &v in &vs { |
| 2023 | mf.blocks[0].insts.push(MachineInst { |
| 2024 | opcode: ArmOpcode::MovReg, |
| 2025 | operands: vec![ |
| 2026 | MachineOperand::PhysReg(PhysReg::Gp(0)), |
| 2027 | MachineOperand::VReg(v), |
| 2028 | ], |
| 2029 | def: None, |
| 2030 | }); |
| 2031 | } |
| 2032 | mf.blocks[0].insts.push(MachineInst { |
| 2033 | opcode: ArmOpcode::Ret, |
| 2034 | operands: vec![], |
| 2035 | def: None, |
| 2036 | }); |
| 2037 | let result = linear_scan(&mut mf); |
| 2038 | assert!( |
| 2039 | !result.split_records.is_empty(), |
| 2040 | "expected splits when 12 call-crossing vregs collide with the 10-deep callee-saved pool" |
| 2041 | ); |
| 2042 | for r in &result.split_records { |
| 2043 | // pre-half must be on a caller-saved register; that's |
| 2044 | // the entire point of splitting (the callee-saved tier |
| 2045 | // is exhausted at split time). |
| 2046 | if let PhysReg::Gp(n) | PhysReg::Gp32(n) = r.pre_phys { |
| 2047 | assert!( |
| 2048 | !GP_CALLEE_SAVED.contains(&n), |
| 2049 | "split pre_phys should be caller-saved, got x{}", |
| 2050 | n |
| 2051 | ); |
| 2052 | } |
| 2053 | // bridge_slot must be a real frame offset. |
| 2054 | assert!(r.bridge_slot < 0, "bridge_slot should be FP-negative"); |
| 2055 | } |
| 2056 | } |
| 2057 | } |
| 2058 |