1 //! Linear scan register allocator.
2 //!
3 //! Replaces the naive spill-everything strategy with proper register assignment.
4 //! Algorithm: sort live intervals by start, walk in order, assign physical registers.
5 //! When no register is available, spill the active interval ending furthest (or the
6 //! current one if it ends even later). Insert ldr/str spill code.
7 //!
8 //! Callee-saved registers (x19-x28, d8-d15) are tracked — if used, the function
9 //! prologue/epilogue must save/restore them.
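//!
//! A rough driver sequence, as a sketch only (the real pass manager may order
//! things differently; `func` and `select_function` stand in for whatever
//! produced the `MachineFunction`):
//!
//! ```ignore
//! let mut mf = select_function(&func);
//! let liveness = compute_liveness(&mut mf);
//! let alloc = linear_scan(&mut mf);
//! apply_allocation(&mut mf, &alloc, &liveness);
//! parallelize_entry_arg_moves(&mut mf);
//! parallelize_call_arg_moves(&mut mf);
//! insert_split_bridges(&mut mf, &alloc.split_records);
//! insert_callee_saves(&mut mf, &alloc.callee_saved_used);
//! coalesce_moves(&mut mf);
//! ```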
10
11 use super::liveness::compute_liveness;
12 use super::mir::*;
13 use std::collections::{HashMap, HashSet};
14
15 /// GP registers available for allocation (excludes x18, x29, x30, x31/sp).
16 /// Ordered: caller-saved first (prefer these to avoid save/restore overhead),
17 /// then callee-saved.
18 ///
19 /// x8 is reserved for large-offset addressing during emission.
20 /// x16, x17 are the linker scratch registers (clobbered by dynamic
21 /// dispatch stubs).
22 ///
23 /// x9, x10, x11 are reserved **exclusively** for spill reloads
24 /// (see GP_SPILL_SCRATCH). They used to be in the allocation pool
25 /// under the theory that spill code could "borrow" them when
26 /// they're temporarily free, but that assumption is only valid
27 /// when the reload is inserted at a point where the borrowed
28 /// register is dead — and the spill code had no reliable way to
29 /// determine that. The result was a reload clobbering a
30 /// freshly-computed live value (the slice-print crash at -O1+).
31 /// Pinning x9-x11 out of the allocation pool costs 3 vregs worth
32 /// of pressure and guarantees reloads never clash.
33 const GP_ALLOC_ORDER: [u8; 22] = [
34 // Caller-saved (temporary, no save needed)
35 12, 13, 14, 15, // x12-x15
36 0, 1, 2, 3, 4, 5, 6, 7, // x0-x7 (args, but available between calls)
37 // Callee-saved (must save/restore if used)
38 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
39 ];
40
41 /// FP registers available for allocation. Same rationale as GP:
42 /// d29, d30, d31 are reserved exclusively as spill reload scratch.
43 const FP_ALLOC_ORDER: [u8; 29] = [
44 // Caller-saved
45 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 0, 1, 2, 3, 4, 5, 6, 7,
46 // Callee-saved
47 8, 9, 10, 11, 12, 13, 14, 15,
48 ];
49
50 /// Callee-saved GP register range.
51 const GP_CALLEE_SAVED: std::ops::RangeInclusive<u8> = 19..=28;
52 /// Callee-saved FP register range.
53 const FP_CALLEE_SAVED: std::ops::RangeInclusive<u8> = 8..=15;
54
55 /// Bytes a single spill of a vreg of class `class` occupies. NEON
56 /// vectors need 16; everything else fits in 8. Used to size frame
57 /// slots so the LdrQ/StrQ pair on a V128 spill operates on a slot
58 /// that's actually 16 bytes wide and 16-byte aligned.
59 fn spill_slot_size(class: RegClass) -> u32 {
60 match class {
61 RegClass::V128 => 16,
62 _ => 8,
63 }
64 }
65
66 /// Where a split interval's *post-call* half lives. `Allocated`
67 /// is the win — the post-half got a register, so the only memory
68 /// traffic is the str/ldr that bridges the call. `Spilled` is the
69 /// fall-back: the post-half couldn't be allocated to a register
70 /// either, so each post-call use loads directly from `bridge_slot`
71 /// (which is the same slot the pre-half wrote into before the
72 /// call), and we omit the redundant ldr.
73 #[derive(Debug, Clone, Copy)]
74 pub enum PostHalf {
75 Allocated(PhysReg),
76 Spilled,
77 }
78
79 /// One record per call boundary at which a live range was split.
80 /// Drives `apply_allocation` to (a) rewrite operands of `vreg`
81 /// based on whether they precede or follow the call, and the
82 /// post-pass `insert_split_bridges` to (b) emit `str pre_phys,
83 /// [bridge_slot]` before the call plus, if the post-half got a
84 /// register, `ldr post_phys, [bridge_slot]` after.
85 ///
86 /// `call_position` is the BL/BLR's instruction position in the
87 /// liveness numbering — used by apply_allocation to compare with
88 /// each operand's cur_pos. `call_index` is the same call's
89 /// 0-based walk-order ordinal among all calls in the function —
90 /// used by insert_split_bridges to match the BL/BLR by ordinal,
91 /// since intervening passes (apply_allocation's spill code) shift
92 /// numerical positions but preserve walk order.
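///
/// As a concrete sketch (register numbers and the slot offset are invented),
/// a record with `pre_phys = x13`, `post = Allocated(x14)` and
/// `bridge_slot = -40` makes `insert_split_bridges` emit roughly:
///
/// ```text
/// str x13, [x29, #-40]   // above the call's arg-setup movs
/// bl  callee
/// ldr x14, [x29, #-40]   // immediately after the call
/// ```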
93 #[derive(Debug, Clone)]
94 pub struct SplitRecord {
95 pub vreg: VRegId,
96 pub call_position: u32,
97 pub call_index: u32,
98 pub pre_phys: PhysReg,
99 pub post: PostHalf,
100 pub bridge_slot: i32,
101 }
102
103 /// Result of register allocation.
104 pub struct AllocResult {
105 /// VRegId → assigned PhysReg. For a split vreg, this holds the
106 /// pre-call (first-half) physreg; the post-half lives in the
107 /// matching `SplitRecord`.
108 pub assignments: HashMap<VRegId, PhysReg>,
109 /// Spilled vregs → stack frame offset.
110 pub spills: HashMap<VRegId, i32>,
111 /// Callee-saved registers that were used and need save/restore.
112 pub callee_saved_used: Vec<PhysReg>,
113 /// Live-range splits performed at call boundaries. Sorted by
114 /// `(vreg, call_position)` for deterministic lookup.
115 pub split_records: Vec<SplitRecord>,
116 }
117
118 /// Run linear scan register allocation on a machine function.
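///
/// Minimal use, mirroring the unit tests at the bottom of this file:
///
/// ```ignore
/// let mut mf = select_function(&func);
/// let alloc = linear_scan(&mut mf);
/// assert!(!alloc.assignments.is_empty());
/// ```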
119 pub fn linear_scan(mf: &mut MachineFunction) -> AllocResult {
120 let liveness = compute_liveness(mf);
121
122 let mut assignments: HashMap<VRegId, PhysReg> = HashMap::new();
123 let mut spills: HashMap<VRegId, i32> = HashMap::new();
124 // Active intervals: (reg_num, interval_end, current_vreg). The
125 // vreg is tracked here so spill victim selection can identify
126 // the *current* holder of a physical register without iterating
127 // the `assignments` HashMap (which accumulates stale historical
128 // entries when registers are reused after expiry, and whose
129 // iteration order is non-deterministic). This fix came out of an audit
130 // during the mem2reg work: the old approach produced flaky
131 // `select_type.f90` failures and non-reproducible builds.
132 let mut active_gp: Vec<(u8, u32, VRegId)> = Vec::new();
133 let mut active_fp: Vec<(u8, u32, VRegId)> = Vec::new();
134 // Two free pools per class. Non-call-crossing intervals draw
135 // from caller-saved first (no save/restore overhead in the
136 // prologue/epilogue); the callee-saved tier is reserved for
137 // call-crossing intervals, with caller-tier exhaustion as a
138 // fall-through. Previously this was a single Vec consumed via
139 // pop(), which returned the *last* element first — quietly
140 // inverting the policy documented above on GP_ALLOC_ORDER and
141 // forcing every function with even a single allocated vreg to
142 // mark x28/x27/... as callee_saved_used.
143 let split_pool = |order: &[u8], callee: &std::ops::RangeInclusive<u8>| -> (Vec<u8>, Vec<u8>) {
144 let mut caller = Vec::new();
145 let mut callees = Vec::new();
146 for &r in order {
147 if callee.contains(&r) {
148 callees.push(r);
149 } else {
150 caller.push(r);
151 }
152 }
153 (caller, callees)
154 };
155 let (mut free_gp_caller, mut free_gp_callee) =
156 split_pool(&GP_ALLOC_ORDER, &GP_CALLEE_SAVED);
157 let (mut free_fp_caller, mut free_fp_callee) =
158 split_pool(&FP_ALLOC_ORDER, &FP_CALLEE_SAVED);
159 let mut callee_saved_used: HashSet<PhysReg> = HashSet::new();
160
161 // Hard-coded PhysReg writes inside instructions: positions
162 // where operand 0 is a PhysReg, e.g. arg-setup
163 // `mov PhysReg(x_i), VReg(value)` immediately before each BL.
164 // The splitter consults this map to avoid pre_phys candidates
165 // that will be clobbered by an arg-setup inside the pre-half's
166 // range — picking such a register would corrupt the bridge str
167 // because by the time the bridge runs the register no longer
168 // holds the pre-half value. We only constrain the splitter
169 // here (not the general allocator) to keep the inflated-
170 // liveness compensations elsewhere intact.
171 let mut phys_writes_gp: HashMap<u8, Vec<u32>> = HashMap::new();
172 let mut phys_writes_fp: HashMap<u8, Vec<u32>> = HashMap::new();
173 {
174 let mut p: u32 = 0;
175 for block in &mf.blocks {
176 for inst in &block.insts {
177 if let Some(MachineOperand::PhysReg(phys)) = inst.operands.first() {
178 match phys {
179 PhysReg::Gp(n) | PhysReg::Gp32(n) => {
180 phys_writes_gp.entry(*n).or_default().push(p);
181 }
182 PhysReg::Fp(n) | PhysReg::Fp32(n) => {
183 phys_writes_fp.entry(*n).or_default().push(p);
184 }
185 _ => {}
186 }
187 }
188 p += 2;
189 }
190 }
191 }
192 let phys_written_in = |phys_n: u8, is_fp: bool, start: u32, end: u32| -> bool {
193 let map = if is_fp {
194 &phys_writes_fp
195 } else {
196 &phys_writes_gp
197 };
198 if let Some(positions) = map.get(&phys_n) {
199 positions.iter().any(|&pp| pp > start && pp <= end)
200 } else {
201 false
202 }
203 };
204
205 // Tracks live-range splits in progress: each entry maps a
206 // synthetic post-half vreg to (original vreg, call position,
207 // bridge slot). After allocation we walk this map to
208 // materialize SplitRecords using whatever physreg/slot the
209 // post-half ended up with.
210 let mut splits_in_progress: HashMap<VRegId, (VRegId, u32, i32)> = HashMap::new();
211
212 // Per-vreg actual def/use position range, computed by direct
213 // walk of the MIR. Used by the splitter to sanity-check
214 // crosses_call before deciding to split: the global liveness
215 // dataflow has a quirk that extends every dest vreg's live
216 // range backward over its own def (op0 of `mov vreg_dst,
217 // vreg_src` is treated as both def and use), inflating short
218 // vreg-to-vreg copy chains to span entire blocks. The
219 // splitter would then see a "call crossing" for a vreg whose
220 // real def/use range is well before any call. Splitting such a
221 // vreg corrupts the chain. We don't fix the dataflow globally
222 // — other regalloc decisions depend on the inflated ranges in
223 // ways that surface latent bugs in arg-setup emission — but we
224 // do consult this tighter map before committing to a split.
225 let mut vreg_actual_range: HashMap<VRegId, (u32, u32)> = HashMap::new();
226 // Vregs defined in more than one MIR block — these are
227 // phi-like (block params receiving values via the parallel-copy
228 // sequence emit_branch_arg_copies inserts at the end of every
229 // predecessor). Linear-position liveness can't represent the
230 // true CFG-aware live set for these, so the splitter must not
231 // act on them: a call block lexically wedged between the def
232 // edge and the use block fakes a crossing point that no actual
233 // control-flow path traverses, and the resulting split assigns
234 // the post-half a different physreg than the pre-half — every
235 // predecessor's parallel-copy lands in pre_phys but every use
236 // inside the loop reads post_phys. (`realworld_affine_shift.f90`
237 // at -O2+ exhibits this: V_iv defined in `if_end_1`, used in
238 // `do_check_3`/`do_body_4`, but `if_then_2` carrying the
239 // recursive call is lexically between if_end_1 and do_check_3.)
240 let mut vreg_def_blocks: HashMap<VRegId, std::collections::HashSet<usize>> =
241 HashMap::new();
242 {
243 let mut p: u32 = 0;
244 for (block_idx, block) in mf.blocks.iter().enumerate() {
245 for inst in &block.insts {
246 let mut touched: Vec<VRegId> = Vec::new();
247 if let Some(d) = inst.def {
248 touched.push(d);
249 vreg_def_blocks.entry(d).or_default().insert(block_idx);
250 }
251 for op in &inst.operands {
252 if let MachineOperand::VReg(v) = op {
253 touched.push(*v);
254 }
255 }
256 for v in touched {
257 let entry = vreg_actual_range.entry(v).or_insert((p, p));
258 if p < entry.0 {
259 entry.0 = p;
260 }
261 if p > entry.1 {
262 entry.1 = p;
263 }
264 }
265 p += 2;
266 }
267 }
268 }
269 let multi_def_vregs: std::collections::HashSet<VRegId> = vreg_def_blocks
270 .iter()
271 .filter(|(_, blocks)| blocks.len() > 1)
272 .map(|(v, _)| *v)
273 .collect();
274
275 // Worklist of intervals to allocate. Starts as a clone of the
276 // pre-sorted liveness intervals; splitting inserts post-half
277 // intervals at their sorted position so the linear-scan
278 // invariant (process in start-position order) is preserved.
279 let mut worklist: Vec<super::liveness::LiveInterval> = liveness.intervals.clone();
280 let mut idx = 0;
281 while idx < worklist.len() {
282 let interval = worklist[idx].clone();
283 // V128 lives in the same physical V/Q register file as
284 // Fp32/Fp64; the regalloc must draw from the FP pool for
285 // any of these classes or the vector vreg ends up in a Gp
286 // register and the emit pass prints `ldr qx21` (invalid asm).
287 let is_fp = matches!(
288 interval.class,
289 RegClass::Fp32 | RegClass::Fp64 | RegClass::V128
290 );
291
292 if is_fp {
293 expire_intervals(
294 &mut active_fp,
295 &mut free_fp_caller,
296 &mut free_fp_callee,
297 &FP_CALLEE_SAVED,
298 interval.start,
299 );
300 } else {
301 expire_intervals(
302 &mut active_gp,
303 &mut free_gp_caller,
304 &mut free_gp_callee,
305 &GP_CALLEE_SAVED,
306 interval.start,
307 );
308 }
309
310 let (active, free_caller, free_callee) = if is_fp {
311 (&mut active_fp, &mut free_fp_caller, &mut free_fp_callee)
312 } else {
313 (&mut active_gp, &mut free_gp_caller, &mut free_gp_callee)
314 };
315
316 let reg_opt = if interval.crosses_call {
317 // Must use callee-saved (caller-saved would be clobbered
318 // by the call).
319 free_callee.pop()
320 } else if let Some(hint) = interval.hint {
321 // Try the hinted register first across both tiers.
322 if let Some(i) = free_caller.iter().position(|&r| r == hint) {
323 Some(free_caller.remove(i))
324 } else if let Some(i) = free_callee.iter().position(|&r| r == hint) {
325 Some(free_callee.remove(i))
326 } else {
327 free_caller.pop().or_else(|| free_callee.pop())
328 }
329 } else {
330 free_caller.pop().or_else(|| free_callee.pop())
331 };
332
333 // Live-range splitting: a call-crossing interval that
334 // can't grab a callee-saved register would otherwise be
335 // full-spilled. If it has a single call crossing AND a
336 // caller-saved register is available right now, split at
337 // the call boundary instead — the pre-half goes to the
338 // caller-saved register, the post-half is allocated as a
339 // fresh interval, and we bridge the call with str/ldr to
340 // a frame slot. Net win: the longer-lived half between
341 // the def and the call avoids spill loads on every use.
342 // Cross-check `crosses_call` against the actual def/use
343 // range from the MIR walk: a vreg whose real range doesn't
344 // straddle the supposed crossing point is a false positive
345 // from inflated liveness, and must NOT be split.
346 let real_crosses = if multi_def_vregs.contains(&interval.vreg) {
347 // Phi-like vreg: positions can't represent the true
348 // live set (see comment above where multi_def_vregs is
349 // built). Refuse to split — the parallel-copy at every
350 // predecessor must land in the same physreg as every
351 // use, and splitting fundamentally breaks that.
352 false
353 } else if let Some(&(real_start, real_end)) =
354 vreg_actual_range.get(&interval.vreg)
355 {
356 interval.call_crossings.len() == 1
357 && interval.call_crossings[0] > real_start
358 && interval.call_crossings[0] < real_end
359 } else {
360 false
361 };
362 // Find a caller-saved that survives the pre-half range
363 // unclobbered. Reject any register P such that an
364 // instruction inside (interval.start, call_pos - 1] writes
365 // P — by the time the bridge str fires (right above the
366 // arg-setup block), P would no longer hold the pre-half's
367 // value and the bridge would capture garbage.
368 let safe_pre_phys = if !free_caller.is_empty() && interval.call_crossings.len() == 1 {
369 let cp = interval.call_crossings[0];
370 let pre_end = cp.saturating_sub(1);
371 free_caller.iter().rposition(|&r| {
372 !phys_written_in(r, is_fp, interval.start, pre_end)
373 })
374 } else {
375 None
376 };
377 if let (None, true, true, Some(safe_idx)) =
378 (reg_opt, interval.crosses_call, real_crosses, safe_pre_phys)
379 {
380 let call_pos = interval.call_crossings[0];
381 let bridge_slot = mf.alloc_local(spill_slot_size(interval.class));
382 let synthetic = mf.new_vreg(interval.class);
383 splits_in_progress.insert(synthetic, (interval.vreg, call_pos, bridge_slot));
384 // Pull the safe register out of the caller-saved pool
385 // and assign it directly: re-running through the normal
386 // assignment path could pick a different register that
387 // *isn't* phys-write-safe (free_caller.pop() doesn't
388 // filter), which would defeat the whole point.
389 let safe_reg = free_caller.remove(safe_idx);
390 let pre_phys = if is_fp {
391 match interval.class {
392 RegClass::Fp32 => PhysReg::Fp32(safe_reg),
393 _ => PhysReg::Fp(safe_reg),
394 }
395 } else {
396 match interval.class {
397 RegClass::Gp32 => PhysReg::Gp32(safe_reg),
398 _ => PhysReg::Gp(safe_reg),
399 }
400 };
401 assignments.insert(interval.vreg, pre_phys);
402 // Pre-half ends before the call — track so this slot
403 // becomes free for any post-call interval that needs a
404 // caller-saved.
405 active.push((safe_reg, call_pos.saturating_sub(1), interval.vreg));
406 active.sort_by_key(|&(_, end, _)| end);
407
408 // Post-half: a fresh synthetic vreg covering the
409 // post-call range. Single original crossing → no
410 // crossings remain; cannot recurse-split.
411 let post_half = super::liveness::LiveInterval {
412 vreg: synthetic,
413 class: interval.class,
414 start: call_pos.saturating_add(1),
415 end: interval.end,
416 crosses_call: false,
417 call_crossings: Vec::new(),
418 hint: None,
419 };
420 // Insert post-half at its sorted position.
421 let post_start = post_half.start;
422 let post_vreg = post_half.vreg.0;
423 let mut insert_at = idx + 1;
424 while insert_at < worklist.len() {
425 let w = &worklist[insert_at];
426 if (w.start, w.vreg.0) > (post_start, post_vreg) {
427 break;
428 }
429 insert_at += 1;
430 }
431 worklist.insert(insert_at, post_half);
432 idx += 1;
433 continue;
434 }
435
436 if let Some(reg) = reg_opt {
437 // Register available.
438 let phys = if is_fp {
439 match interval.class {
440 RegClass::Fp32 => PhysReg::Fp32(reg),
441 _ => PhysReg::Fp(reg),
442 }
443 } else {
444 match interval.class {
445 RegClass::Gp32 => PhysReg::Gp32(reg),
446 _ => PhysReg::Gp(reg),
447 }
448 };
449 assignments.insert(interval.vreg, phys);
450 active.push((reg, interval.end, interval.vreg));
451 active.sort_by_key(|&(_, end, _)| end);
452
453 // Track callee-saved usage.
454 if !is_fp && GP_CALLEE_SAVED.contains(&reg) {
455 callee_saved_used.insert(PhysReg::Gp(reg));
456 }
457 if is_fp && FP_CALLEE_SAVED.contains(&reg) {
458 callee_saved_used.insert(PhysReg::Fp(reg));
459 }
460 } else {
461 // No register available — spill. Find the active
462 // interval ending furthest. `active` is kept sorted by
463 // end (see the `active.sort_by_key(...)` calls above
464 // and below), so the last entry is always the furthest.
465 // The active list carries the current vreg directly so
466 // we don't have to scan the (non-deterministic,
467 // stale-entry-laden) assignments map for the victim.
468 if let Some(&(spill_reg, spill_end, victim)) = active.last() {
469 let last_idx = active.len() - 1;
470 if spill_end > interval.end {
471 // Spill the furthest active interval, give its register to us.
472 // The victim's class is what determines its slot size — not
473 // ours, since we're taking the victim's physreg.
474 let victim_class = mf
475 .vregs
476 .iter()
477 .find(|v| v.id == victim)
478 .map(|v| v.class)
479 .unwrap_or(RegClass::Gp64);
480 let offset = mf.alloc_local(spill_slot_size(victim_class));
481 spills.insert(victim, offset);
482 assignments.remove(&victim);
483
484 let phys = if is_fp {
485 match interval.class {
486 RegClass::Fp32 => PhysReg::Fp32(spill_reg),
487 _ => PhysReg::Fp(spill_reg),
488 }
489 } else {
490 match interval.class {
491 RegClass::Gp32 => PhysReg::Gp32(spill_reg),
492 _ => PhysReg::Gp(spill_reg),
493 }
494 };
495 assignments.insert(interval.vreg, phys);
496 active.remove(last_idx);
497 active.push((spill_reg, interval.end, interval.vreg));
498 active.sort_by_key(|&(_, end, _)| end);
499 } else {
500 // Current interval ends later — spill it. Use the
501 // current interval's class for slot sizing.
502 let offset = mf.alloc_local(spill_slot_size(interval.class));
503 spills.insert(interval.vreg, offset);
504 }
505 } else {
506 // No active intervals — shouldn't happen but spill to be safe.
507 let offset = mf.alloc_local(spill_slot_size(interval.class));
508 spills.insert(interval.vreg, offset);
509 }
510 }
511 idx += 1;
512 }
513
514 // Materialize SplitRecords from the in-progress map. Each
515 // synthetic post-half vreg either landed on a phys (the win
516 // case) or was spilled (the fall-back). Either way we have a
517 // bridge slot that links the two halves across the call.
518 //
519 // Build a position→call-index map from the BL/BLR walk so
520 // we can attach a stable ordinal to each split. `liveness`
521 // already enumerated calls in walk order, so its
522 // `call_positions` would do — but liveness exposes only
523 // intervals. Re-derive locally; the list of call
524 // positions is small and cheap to rebuild.
525 let call_positions_walk: Vec<u32> = {
526 let mut p: u32 = 0;
527 let mut cps = Vec::new();
528 for block in &mf.blocks {
529 for inst in &block.insts {
530 if matches!(inst.opcode, ArmOpcode::Bl | ArmOpcode::Blr) {
531 cps.push(p);
532 }
533 p += 2;
534 }
535 }
536 cps
537 };
538 let mut split_records: Vec<SplitRecord> = splits_in_progress
539 .into_iter()
540 .filter_map(|(synthetic, (orig, call_pos, bridge_slot))| {
541 let pre_phys = *assignments.get(&orig)?;
542 let post = if let Some(&p) = assignments.get(&synthetic) {
543 PostHalf::Allocated(p)
544 } else if spills.contains_key(&synthetic) {
545 PostHalf::Spilled
546 } else {
547 // No allocation either way — shouldn't happen, but
548 // skip the record rather than emit garbage.
549 return None;
550 };
551 assignments.remove(&synthetic);
552 spills.remove(&synthetic);
553 let call_index = call_positions_walk.binary_search(&call_pos).ok()? as u32;
554 Some(SplitRecord {
555 vreg: orig,
556 call_position: call_pos,
557 call_index,
558 pre_phys,
559 post,
560 bridge_slot,
561 })
562 })
563 .collect();
564 split_records.sort_by_key(|r| (r.vreg.0, r.call_position));
565
566 // Sort callee-saved for consistent prologue/epilogue ordering.
567 let mut callee_saved: Vec<PhysReg> = callee_saved_used.into_iter().collect();
568 callee_saved.sort_by_key(|r| match r {
569 PhysReg::Gp(n) | PhysReg::Gp32(n) => *n as u16,
570 PhysReg::Fp(n) | PhysReg::Fp32(n) => 100 + *n as u16,
571 _ => 200,
572 });
573
574 AllocResult {
575 assignments,
576 spills,
577 split_records,
578 callee_saved_used: callee_saved,
579 }
580 }
581
582 /// Position-resolved assignment for a vreg. A vreg that wasn't
583 /// split has the same assignment everywhere; a split vreg's
584 /// assignment depends on whether the use is before or after the
585 /// call boundary, and the post-call assignment may itself be a
586 /// memory slot (the spilled-post fall-back).
587 enum LogicalAssignment {
588 Reg(PhysReg),
589 Slot(i32),
590 }
591
592 /// Apply allocation result: rewrite VReg operands to PhysReg, insert spill code.
593 /// For spilled operands, borrows a register that is provably free at this
594 /// instruction's position (drawn from the allocator's own assignments), falling
595 /// back to the dedicated spill scratches (x9-x11 / d29-d31) when none is free.
596 pub fn apply_allocation(
597 mf: &mut MachineFunction,
598 result: &AllocResult,
599 liveness: &super::liveness::LivenessResult,
600 ) {
601 let vreg_classes: HashMap<VRegId, RegClass> =
602 mf.vregs.iter().map(|v| (v.id, v.class)).collect();
603
604 // Build interval lookup: for each vreg, its live range.
605 let intervals: HashMap<VRegId, (u32, u32)> = liveness
606 .intervals
607 .iter()
608 .map(|i| (i.vreg, (i.start, i.end)))
609 .collect();
610
611 // Compute instruction positions for each block/instruction.
612 let mut inst_pos: HashMap<(usize, usize), u32> = HashMap::new();
613 let mut pos: u32 = 0;
614 for (bi, block) in mf.blocks.iter().enumerate() {
615 for (ii, _) in block.insts.iter().enumerate() {
616 inst_pos.insert((bi, ii), pos);
617 pos += 2;
618 }
619 }
620
621 // Position-aware lookup: returns the logical assignment of
622 // `vid` at `cur_pos`, consulting split_records first so that
623 // post-call uses pick up the post-half's location instead of
624 // the pre-half's (now-clobbered) caller-saved register.
625 let logical_assignment = |vid: VRegId, cur_pos: u32| -> Option<LogicalAssignment> {
626 for r in &result.split_records {
627 if r.vreg != vid {
628 continue;
629 }
630 if cur_pos > r.call_position {
631 return Some(match r.post {
632 PostHalf::Allocated(p) => LogicalAssignment::Reg(p),
633 PostHalf::Spilled => LogicalAssignment::Slot(r.bridge_slot),
634 });
635 }
636 return Some(LogicalAssignment::Reg(r.pre_phys));
637 }
638 if let Some(p) = result.assignments.get(&vid) {
639 Some(LogicalAssignment::Reg(*p))
640 } else {
641 result.spills.get(&vid).copied().map(LogicalAssignment::Slot)
642 }
643 };
644
645 for block_idx in 0..mf.blocks.len() {
646 let mut new_insts = Vec::new();
647 let insts = std::mem::take(&mut mf.blocks[block_idx].insts);
648
649 for (inst_idx, inst) in insts.iter().enumerate() {
650 let mut rewritten = inst.clone();
651 let cur_pos = inst_pos.get(&(block_idx, inst_idx)).copied().unwrap_or(0);
652
653 // Find GP registers NOT in use at this position. A
654 // vreg's register is in use during its full live range
655 // unless the vreg was split — in which case the
656 // pre-half occupies pre_phys during [start,
657 // call_position] and the post-half occupies post_phys
658 // during (call_position, end].
659 //
660 // Iterate `result.assignments` in **deterministic vreg
661 // order**, not raw HashMap iteration order — the latter
662 // varies between runs and produces different temp-reg
663 // assignments and therefore non-reproducible builds. The
664 // resulting `gp_temps`/`fp_temps` lists are consumed by
665 // index, so their order is load-bearing.
666 let mut gp_temps: Vec<u8> = Vec::new();
667 let mut fp_temps: Vec<u8> = Vec::new();
668 let mut sorted_assignments: Vec<(VRegId, PhysReg)> =
669 result.assignments.iter().map(|(&v, &p)| (v, p)).collect();
670 sorted_assignments.sort_by_key(|(v, _)| v.0);
671 // Build the set of phys regs in use at cur_pos. For
672 // split vregs, pre_phys is in use up to call_position,
673 // post_phys (if Allocated) takes over after.
674 let mut used_gp: HashSet<u8> = HashSet::new();
675 let mut used_fp: HashSet<u8> = HashSet::new();
676 let mark_used = |phys: PhysReg, gp: &mut HashSet<u8>, fp: &mut HashSet<u8>| match phys {
677 PhysReg::Gp(n) | PhysReg::Gp32(n) => {
678 gp.insert(n);
679 }
680 PhysReg::Fp(n) | PhysReg::Fp32(n) => {
681 fp.insert(n);
682 }
683 _ => {}
684 };
685 for (vreg, phys) in &sorted_assignments {
686 if let Some(&(start, end)) = intervals.get(vreg) {
687 if cur_pos < start || cur_pos > end {
688 continue;
689 }
690 // Split vregs: pre_phys covers [start,
691 // call_position]; post_phys covers
692 // (call_position, end]. assignments[vreg] is
693 // the pre_phys (we removed the synthetic).
694 let split = result.split_records.iter().find(|r| r.vreg == *vreg);
695 if let Some(rec) = split {
696 if cur_pos <= rec.call_position {
697 mark_used(rec.pre_phys, &mut used_gp, &mut used_fp);
698 }
699 if let PostHalf::Allocated(p) = rec.post {
700 if cur_pos > rec.call_position {
701 mark_used(p, &mut used_gp, &mut used_fp);
702 }
703 }
704 } else {
705 mark_used(*phys, &mut used_gp, &mut used_fp);
706 }
707 }
708 }
709 for (_, phys) in &sorted_assignments {
710 match phys {
711 PhysReg::Gp(n) | PhysReg::Gp32(n)
712 if !used_gp.contains(n) && !gp_temps.contains(n) =>
713 {
714 gp_temps.push(*n);
715 }
716 PhysReg::Fp(n) | PhysReg::Fp32(n)
717 if !used_fp.contains(n) && !fp_temps.contains(n) =>
718 {
719 fp_temps.push(*n);
720 }
721 _ => {}
722 }
723 }
724 // post_phys registers from split records are also
725 // candidates for temp use when *they're* not in use.
726 for r in &result.split_records {
727 if let PostHalf::Allocated(p) = r.post {
728 let in_use = match p {
729 PhysReg::Gp(n) | PhysReg::Gp32(n) => used_gp.contains(&n),
730 PhysReg::Fp(n) | PhysReg::Fp32(n) => used_fp.contains(&n),
731 _ => true,
732 };
733 if !in_use {
734 match p {
735 PhysReg::Gp(n) | PhysReg::Gp32(n) if !gp_temps.contains(&n) => {
736 gp_temps.push(n);
737 }
738 PhysReg::Fp(n) | PhysReg::Fp32(n) if !fp_temps.contains(&n) => {
739 fp_temps.push(n);
740 }
741 _ => {}
742 }
743 }
744 }
745 }
746 // Also append the dedicated fallback scratches in case no free regs are found.
747 for &s in &GP_SPILL_SCRATCH {
748 if !gp_temps.contains(&s) {
749 gp_temps.push(s);
750 }
751 }
752 for &s in &FP_SPILL_SCRATCH {
753 if !fp_temps.contains(&s) {
754 fp_temps.push(s);
755 }
756 }
757
758 // For spilled (or post-spilled-after-split) vreg uses:
759 // borrow a temp register and load from the slot.
760 let mut loads = Vec::new();
761 let mut gp_temp_idx = 0usize;
762 let mut fp_temp_idx = 0usize;
763 for (i, op) in inst.operands.iter().enumerate() {
764 if let MachineOperand::VReg(vid) = op {
765 if let Some(LogicalAssignment::Slot(offset)) =
766 logical_assignment(*vid, cur_pos)
767 {
768 let class = vreg_classes.get(vid).copied().unwrap_or(RegClass::Gp64);
769 let (temp_reg, load_op) = match class {
770 RegClass::Fp32 => {
771 let r = fp_temps
772 .get(fp_temp_idx)
773 .copied()
774 .unwrap_or(FP_SPILL_SCRATCH[0]);
775 fp_temp_idx += 1;
776 (PhysReg::Fp32(r), ArmOpcode::LdrFpImm)
777 }
778 RegClass::Fp64 => {
779 let r = fp_temps
780 .get(fp_temp_idx)
781 .copied()
782 .unwrap_or(FP_SPILL_SCRATCH[0]);
783 fp_temp_idx += 1;
784 (PhysReg::Fp(r), ArmOpcode::LdrFpImm)
785 }
786 RegClass::V128 => {
787 let r = fp_temps
788 .get(fp_temp_idx)
789 .copied()
790 .unwrap_or(FP_SPILL_SCRATCH[0]);
791 fp_temp_idx += 1;
792 (PhysReg::Fp(r), ArmOpcode::LdrQ)
793 }
794 RegClass::Gp32 => {
795 let r = gp_temps
796 .get(gp_temp_idx)
797 .copied()
798 .unwrap_or(GP_SPILL_SCRATCH[0]);
799 gp_temp_idx += 1;
800 (PhysReg::Gp32(r), ArmOpcode::LdrImm)
801 }
802 RegClass::Gp64 => {
803 let r = gp_temps
804 .get(gp_temp_idx)
805 .copied()
806 .unwrap_or(GP_SPILL_SCRATCH[0]);
807 gp_temp_idx += 1;
808 (PhysReg::Gp(r), ArmOpcode::LdrImm)
809 }
810 };
811 loads.push((i, temp_reg, load_op, offset));
812 }
813 }
814 }
815
816 for (op_idx, scratch, load_op, offset) in &loads {
817 if *op_idx == 0
818 && inst
819 .def
820 .as_ref()
821 .map(|d| {
822 matches!(
823 logical_assignment(*d, cur_pos),
824 Some(LogicalAssignment::Slot(_))
825 )
826 })
827 .unwrap_or(false)
828 {
829 continue;
830 }
831 emit_spill_access(&mut new_insts, *load_op, *scratch, *offset as i64);
832 rewritten.operands[*op_idx] = MachineOperand::PhysReg(*scratch);
833 }
834
835 // Rewrite assigned vregs to physical registers using
836 // the position-aware lookup so split vregs pick up the
837 // correct half.
838 for op in &mut rewritten.operands {
839 if let MachineOperand::VReg(vid) = op {
840 if let Some(LogicalAssignment::Reg(phys)) =
841 logical_assignment(*vid, cur_pos)
842 {
843 *op = MachineOperand::PhysReg(phys);
844 }
845 }
846 }
847
848 // Handle def — use a temp that doesn't alias any input temp.
849 let def_temp_idx = gp_temp_idx.max(fp_temp_idx);
850 let def_spill = if let Some(def_vid) = &inst.def {
851 if let Some(LogicalAssignment::Slot(offset)) =
852 logical_assignment(*def_vid, cur_pos)
853 {
854 let class = vreg_classes.get(def_vid).copied().unwrap_or(RegClass::Gp64);
855 let temp_reg = match class {
856 RegClass::Fp32 => {
857 let r = fp_temps
858 .get(def_temp_idx)
859 .copied()
860 .unwrap_or(FP_SPILL_SCRATCH[0]);
861 PhysReg::Fp32(r)
862 }
863 RegClass::Fp64 | RegClass::V128 => {
864 let r = fp_temps
865 .get(def_temp_idx)
866 .copied()
867 .unwrap_or(FP_SPILL_SCRATCH[0]);
868 PhysReg::Fp(r)
869 }
870 RegClass::Gp32 => {
871 let r = gp_temps
872 .get(def_temp_idx)
873 .copied()
874 .unwrap_or(GP_SPILL_SCRATCH[0]);
875 PhysReg::Gp32(r)
876 }
877 RegClass::Gp64 => {
878 let r = gp_temps
879 .get(def_temp_idx)
880 .copied()
881 .unwrap_or(GP_SPILL_SCRATCH[0]);
882 PhysReg::Gp(r)
883 }
884 };
885 let scratch = temp_reg;
886 // Replace def operand with scratch.
887 if let Some(MachineOperand::VReg(vid)) = rewritten.operands.first() {
888 if vid == def_vid {
889 rewritten.operands[0] = MachineOperand::PhysReg(scratch);
890 }
891 }
892 Some((scratch, offset, class))
893 } else {
894 None
895 }
896 } else {
897 None
898 };
899
900 // Bridge instructions for live-range splits are
901 // emitted by `insert_split_bridges`, which runs *after*
902 // `parallelize_call_arg_moves` so the canonical
903 // arg-setup mov sequence isn't fragmented by an
904 // interposed store and the parallel-copy resolver can
905 // see all the arg-setup movs as one block.
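// Operands (and any spilled def) have been rewritten to physical
// registers above, so the vreg `def` marker no longer applies;
// drop it rather than leave a stale VRegId behind.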
906 rewritten.def = None;
907 new_insts.push(rewritten);
908
909 // Store after def for spilled vregs.
910 if let Some((scratch, offset, class)) = def_spill {
911 let store_op = match class {
912 RegClass::Fp32 | RegClass::Fp64 => ArmOpcode::StrFpImm,
913 RegClass::V128 => ArmOpcode::StrQ,
914 _ => ArmOpcode::StrImm,
915 };
916 emit_spill_access(&mut new_insts, store_op, scratch, offset as i64);
917 }
918 }
919
920 mf.blocks[block_idx].insts = new_insts;
921 }
922 }
923
924 /// Emit a spill load or store, materializing the FP-relative
925 /// address through `x8` when the offset is out of the LDR/STR
926 /// immediate's reach. ARM64's LDUR / STUR (used as a fallback for
927 /// negative immediates) only encode signed 9-bit offsets in
928 /// `[-256, 255]`. For wider frames we substitute
929 /// `sub x8, x29, #|offset|; ldr/str rt, [x8, #0]`. Positive
930 /// offsets outside that range get the symmetric `add` treatment.
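///
/// For example (register and offset chosen purely for illustration), a GP
/// reload from offset -392 falls outside the LDUR range and becomes:
///
/// ```text
/// sub x8, x29, #392
/// ldr x12, [x8, #0]
/// ```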
931 fn emit_spill_access(
932 insts: &mut Vec<MachineInst>,
933 op: ArmOpcode,
934 rt: PhysReg,
935 offset: i64,
936 ) {
937 // Most spill ops encode comfortably with FP-relative immediates.
938 // Only fall back to address materialization when out of LDUR
939 // range. (Positive offsets up to 65520 step 16 are handled by
940 // LDR's scaled form; negatives must use LDUR; we materialize
941 // anything outside the union.)
942 let in_range = (-256..=255).contains(&offset);
943 if in_range {
944 insts.push(MachineInst {
945 opcode: op,
946 operands: vec![
947 MachineOperand::PhysReg(rt),
948 MachineOperand::PhysReg(PhysReg::FP),
949 MachineOperand::Imm(offset),
950 ],
951 def: None,
952 });
953 return;
954 }
955 let addr = PhysReg::Gp(8);
956 let (mat_op, abs) = if offset >= 0 {
957 (ArmOpcode::AddImm, offset)
958 } else {
959 (ArmOpcode::SubImm, -offset)
960 };
961 insts.push(MachineInst {
962 opcode: mat_op,
963 operands: vec![
964 MachineOperand::PhysReg(addr),
965 MachineOperand::PhysReg(PhysReg::FP),
966 MachineOperand::Imm(abs),
967 ],
968 def: None,
969 });
970 insts.push(MachineInst {
971 opcode: op,
972 operands: vec![
973 MachineOperand::PhysReg(rt),
974 MachineOperand::PhysReg(addr),
975 MachineOperand::Imm(0),
976 ],
977 def: None,
978 });
979 }
980
981 /// Insert callee-saved register saves in prologue and restores in epilogue.
982 /// Must be called after apply_allocation so we know which callee-saved regs were used.
983 pub fn insert_callee_saves(mf: &mut MachineFunction, callee_saved: &[PhysReg]) {
984 if callee_saved.is_empty() {
985 return;
986 }
987
988 // Allocate stack slots for callee-saved registers.
989 let mut save_slots: Vec<(PhysReg, i32)> = Vec::new();
990 for &reg in callee_saved {
991 let offset = mf.frame.alloc_local(8);
992 save_slots.push((reg, offset));
993 }
994
995 // Insert saves at the start of the entry block (after prologue setup).
996 // Find the insertion point: after the STP + ADD (prologue) instructions.
997 let prologue_end = mf.blocks[0]
998 .insts
999 .iter()
1000 .position(|i| {
1001 // The prologue is StpPre followed by AddImm. Insert after those.
1002 !matches!(i.opcode, ArmOpcode::StpPre | ArmOpcode::AddImm)
1003 })
1004 .unwrap_or(mf.blocks[0].insts.len());
1005
1006 // Emit saves, pairing consecutive same-class registers into STP.
1007 // save_slots are ordered by increasing register number; slots are
1008 // allocated at decreasing offsets (-8, -16, -24, ...). Adjacent
1009 // slots differ by 8 bytes — exactly the STP stride for GP64/FP64.
1010 //
1011 // STP Xt1, Xt2, [Xn, #off]: stores Xt1 at Xn+off, Xt2 at Xn+off+8.
1012 // So to store reg_low (at lower_offset) and reg_high (at lower_offset+8):
1013 // STP reg_low, reg_high, [FP, #lower_offset]
1014 let saves = emit_callee_store_pairs(&save_slots, false);
1015 for (i, save) in saves.into_iter().enumerate() {
1016 mf.blocks[0].insts.insert(prologue_end + i, save);
1017 }
1018
1019 // Insert restores before every epilogue sequence (LdpPost).
1020 for block in &mut mf.blocks {
1021 let mut insertions = Vec::new();
1022 for (i, inst) in block.insts.iter().enumerate() {
1023 if inst.opcode == ArmOpcode::LdpPost {
1024 insertions.push(i);
1025 }
1026 }
1027 for &ldp_idx in insertions.iter().rev() {
1028 // Restores in reverse order (mirror of saves).
1029 let restores = emit_callee_store_pairs(&save_slots, true);
1030 for restore in restores.into_iter().rev() {
1031 block.insts.insert(ldp_idx, restore);
1032 }
1033 }
1034 }
1035 }
1036
1037 /// Build the save (or restore) instruction list for a set of callee-saved
1038 /// slots, pairing consecutive adjacent GP/FP slots into STP/LDP.
1039 ///
1040 /// `restore` = true → emit LDP/LDR (load); false → STP/STR (store).
1041 ///
1042 /// Pairing rule: slots[i] and slots[i+1] are paired when:
1043 /// * Both are GP or both are FP (same register class)
1044 /// * slots[i+1].offset == slots[i].offset - 8 (adjacent 8-byte slots)
1045 ///
1046 /// STP Xt1, Xt2, [Xn, #off] → Xt1 at off, Xt2 at off+8.
1047 /// We pair as: STP slots[i+1].reg, slots[i].reg, [FP, #slots[i+1].offset]
1048 /// because slots[i+1] has the lower (more negative) offset.
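///
/// For example (a hypothetical layout), `save_slots = [(x19, -8), (x20, -16)]`
/// pairs into `stp x20, x19, [x29, #-16]` on the save path and the matching
/// `ldp x20, x19, [x29, #-16]` on restore.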
1049 fn emit_callee_store_pairs(save_slots: &[(PhysReg, i32)], restore: bool) -> Vec<MachineInst> {
1050 let mut result = Vec::new();
1051 let mut i = 0;
1052 while i < save_slots.len() {
1053 let (reg1, off1) = save_slots[i];
1054 // Try to pair with next slot.
1055 if i + 1 < save_slots.len() {
1056 let (reg2, off2) = save_slots[i + 1];
1057 let is_gp1 = !matches!(reg1, PhysReg::Fp(_) | PhysReg::Fp32(_));
1058 let is_gp2 = !matches!(reg2, PhysReg::Fp(_) | PhysReg::Fp32(_));
1059 let same_class = is_gp1 == is_gp2;
1060 // off1 is the higher slot (less negative); off2 = off1 - 8.
1061 let adjacent = off2 == off1 - 8;
1062 // STP offset must fit in 7-bit signed × 8: inclusive range [-512, 504].
1063 let in_range = (-512..=504).contains(&off2);
1064 if same_class && adjacent && in_range {
1065 let opcode = if restore {
1066 ArmOpcode::LdpOffset
1067 } else {
1068 ArmOpcode::StpOffset
1069 };
1070 let (low_reg, high_reg) = (reg2, reg1);
1071 result.push(MachineInst {
1072 opcode,
1073 operands: vec![
1074 MachineOperand::PhysReg(low_reg),
1075 MachineOperand::PhysReg(high_reg),
1076 MachineOperand::PhysReg(PhysReg::FP),
1077 MachineOperand::Imm(off2 as i64),
1078 ],
1079 def: None,
1080 });
1081 i += 2;
1082 continue;
1083 }
1084 }
1085 // Emit individual STR/LDR.
1086 let op = match reg1 {
1087 PhysReg::Fp(_) | PhysReg::Fp32(_) => {
1088 if restore {
1089 ArmOpcode::LdrFpImm
1090 } else {
1091 ArmOpcode::StrFpImm
1092 }
1093 }
1094 _ => {
1095 if restore {
1096 ArmOpcode::LdrImm
1097 } else {
1098 ArmOpcode::StrImm
1099 }
1100 }
1101 };
1103 result.push(MachineInst {
1104 opcode: op,
1105 operands: vec![
1106 MachineOperand::PhysReg(reg1),
1107 MachineOperand::PhysReg(PhysReg::FP),
1108 MachineOperand::Imm(off1 as i64),
1109 ],
1110 def: if restore {
1111 Some(match reg1 {
1112 PhysReg::Gp(n) | PhysReg::Gp32(n) => crate::codegen::mir::VRegId(n.into()),
1113 PhysReg::Fp(n) | PhysReg::Fp32(n) => crate::codegen::mir::VRegId(n.into()),
1114 _ => crate::codegen::mir::VRegId(0),
1115 })
1116 } else {
1117 None
1118 },
1119 });
1120 i += 1;
1121 }
1122 result
1123 }
1124
1125 /// Basic move coalescing: eliminate mov instructions where src == dest.
1126 pub fn coalesce_moves(mf: &mut MachineFunction) {
1127 for block in &mut mf.blocks {
1128 block.insts.retain(|inst| {
1129 if matches!(inst.opcode, ArmOpcode::MovReg | ArmOpcode::FmovReg)
1130 && inst.operands.len() == 2
1131 && inst.operands[0] == inst.operands[1]
1132 {
1133 return false; // eliminate self-move
1134 }
1135 true
1136 });
1137 }
1138 }
1139
1140 /// Emit the str/ldr pairs that bridge live-range splits across
1141 /// each call boundary. Runs *after* `parallelize_call_arg_moves`
1142 /// so the parallel-copy resolver sees an unfragmented arg-setup
1143 /// sequence (an interposed `str` between an arg-copy and the BL
1144 /// would otherwise stop the scan-back at the wrong point and the
1145 /// resolver would miss arg-copies that need cycle-breaking).
1146 ///
1147 /// For each `SplitRecord` whose `call_position` equals the
1148 /// position of a `Bl`/`Blr` in the function: insert a `str
1149 /// pre_phys, [fp, bridge_slot]` immediately before the call and,
1150 /// if the post-half got its own register, an `ldr post_phys, [fp,
1151 /// bridge_slot]` immediately after. Spilled post-halves rely on
1152 /// the existing per-use spill-load path, which already loads from
1153 /// `bridge_slot` thanks to `apply_allocation`'s position-aware
1154 /// operand rewrite.
1155 pub fn insert_split_bridges(mf: &mut MachineFunction, splits: &[SplitRecord]) {
1156 if splits.is_empty() {
1157 return;
1158 }
1159 let mut call_seen: u32 = 0;
1160 for block_idx in 0..mf.blocks.len() {
1161 let mut new_insts = Vec::with_capacity(mf.blocks[block_idx].insts.len());
1162 let original = std::mem::take(&mut mf.blocks[block_idx].insts);
1163 for inst in original.into_iter() {
1164 let is_call = matches!(inst.opcode, ArmOpcode::Bl | ArmOpcode::Blr);
1165 if is_call {
1166 let idx = call_seen;
1167 // Place bridge `str pre_phys, [fp, slot]` *above* the
1168 // canonical arg-setup mov sequence. Arg-setup movs
1169 // can write to caller-saved physregs that are also
1170 // some split's `pre_phys` — by the time we reach the
1171 // BL, those registers no longer hold the pre-half's
1172 // value. Scan back over arg-setup movs and insert
1173 // bridges at the top of that block.
1174 let mut insert_at = new_insts.len();
1175 while insert_at > 0 && is_call_arg_copy(&new_insts[insert_at - 1]) {
1176 insert_at -= 1;
1177 }
1178 let mut bridges: Vec<MachineInst> = Vec::new();
1179 for r in splits {
1180 if r.call_index != idx {
1181 continue;
1182 }
1183 let store_op = match r.pre_phys {
1184 PhysReg::Fp(_) | PhysReg::Fp32(_) => ArmOpcode::StrFpImm,
1185 _ => ArmOpcode::StrImm,
1186 };
1187 bridges.push(MachineInst {
1188 opcode: store_op,
1189 operands: vec![
1190 MachineOperand::PhysReg(r.pre_phys),
1191 MachineOperand::PhysReg(PhysReg::FP),
1192 MachineOperand::Imm(r.bridge_slot as i64),
1193 ],
1194 def: None,
1195 });
1196 }
1197 for (offset, b) in bridges.into_iter().enumerate() {
1198 new_insts.insert(insert_at + offset, b);
1199 }
1200 }
1201 new_insts.push(inst);
1202 if is_call {
1203 let idx = call_seen;
1204 for r in splits {
1205 if r.call_index != idx {
1206 continue;
1207 }
1208 if let PostHalf::Allocated(p) = r.post {
1209 let load_op = match p {
1210 PhysReg::Fp(_) | PhysReg::Fp32(_) => ArmOpcode::LdrFpImm,
1211 _ => ArmOpcode::LdrImm,
1212 };
1213 new_insts.push(MachineInst {
1214 opcode: load_op,
1215 operands: vec![
1216 MachineOperand::PhysReg(p),
1217 MachineOperand::PhysReg(PhysReg::FP),
1218 MachineOperand::Imm(r.bridge_slot as i64),
1219 ],
1220 def: None,
1221 });
1222 }
1223 }
1224 call_seen += 1;
1225 }
1226 }
1227 mf.blocks[block_idx].insts = new_insts;
1228 }
1229 }
1230
1231 /// Resolve the entry-block argument-receipt parallel copy. The
1232 /// function prologue copies incoming arg registers (x0..x7,
1233 /// d0..d7) into the registers that the allocator chose for the
1234 /// parameter vregs. Emitted naively this is a sequential `mov`
1235 /// chain, which corrupts values whenever a destination aliases a
1236 /// later source — e.g. `mov x4, x3; mov x3, x4` clobbers x4 before
1237 /// the second move can read it. Historically this was hidden
1238 /// because parameter vregs landed in callee-saved registers
1239 /// (x19..x28) that never alias x0..x7; once the allocator started
1240 /// preferring caller-saved destinations, the latent hazard
1241 /// surfaced.
1242 ///
1243 /// Find the leading run of phys-to-phys movs in the entry block
1244 /// (after the frame setup) whose source is an incoming arg
1245 /// register, and run the same parallel-copy resolver
1246 /// (`rewrite_call_arg_copies`) used for pre-call argument setup.
1247 pub fn parallelize_entry_arg_moves(mf: &mut MachineFunction) {
1248 if mf.blocks.is_empty() {
1249 return;
1250 }
1251 let original = std::mem::take(&mut mf.blocks[0].insts);
1252 let mut rebuilt: Vec<MachineInst> = Vec::with_capacity(original.len());
1253 let mut iter = original.into_iter().peekable();
1254
1255 // Pass through the prologue (StpPre/AddImm/SubImm).
1256 while let Some(inst) = iter.peek() {
1257 if matches!(
1258 inst.opcode,
1259 ArmOpcode::StpPre | ArmOpcode::AddImm | ArmOpcode::SubImm
1260 ) {
1261 rebuilt.push(iter.next().unwrap());
1262 } else {
1263 break;
1264 }
1265 }
1266
1267 // Process arg-receipt parallel groups. Tolerate intervening
1268 // store instructions (which read but don't write registers in
1269 // the parallel-copy graph) so a result-address spill or
1270 // partial-receipt store doesn't fragment the receipts. Stop
1271 // once any instruction overwrites an incoming arg register —
1272 // beyond that point those values are no longer the original
1273 // arg values and parallel-copy resolution would be unsound.
1274 let mut pending: Vec<MachineInst> = Vec::new();
1275 loop {
1276 match iter.peek() {
1277 None => break,
1278 Some(inst) if is_arg_receipt_copy(inst) => {
1279 pending.push(iter.next().unwrap());
1280 }
1281 Some(inst) if is_transparent_to_arg_receipts(inst) => {
1282 if !pending.is_empty() {
1283 rebuilt.extend(rewrite_call_arg_copies(std::mem::take(&mut pending)));
1284 }
1285 rebuilt.push(iter.next().unwrap());
1286 }
1287 Some(_) => break,
1288 }
1289 }
1290 if !pending.is_empty() {
1291 rebuilt.extend(rewrite_call_arg_copies(pending));
1292 }
1293 rebuilt.extend(iter);
1294 mf.blocks[0].insts = rebuilt;
1295 }
1296
1297 /// An instruction is *transparent* to the arg-receipt parallel
1298 /// copy if executing it cannot change the value held in any
1299 /// incoming arg register (x0..x7 / d0..d7). Stores qualify (they
1300 /// only read), as does any compute whose destination lies outside
1301 /// the arg-reg range. Anything that writes into x0..x7 forms a
1302 /// barrier — the receipts that come after it are reading
1303 /// post-clobber values, so reordering across the barrier is
1304 /// unsound.
1305 fn is_transparent_to_arg_receipts(inst: &MachineInst) -> bool {
1306 if is_store_opcode(inst.opcode) {
1307 return true;
1308 }
1309 if is_branch_or_call_opcode(inst.opcode) {
1310 return false;
1311 }
1312 match inst.operands.first() {
1313 Some(MachineOperand::PhysReg(dst)) => !is_call_arg_reg(*dst),
1314 _ => true,
1315 }
1316 }
1317
1318 fn is_store_opcode(op: ArmOpcode) -> bool {
1319 matches!(
1320 op,
1321 ArmOpcode::StrImm
1322 | ArmOpcode::StrhImm
1323 | ArmOpcode::StrbImm
1324 | ArmOpcode::StrFpImm
1325 | ArmOpcode::StrReg
1326 | ArmOpcode::StrFpReg
1327 | ArmOpcode::StrQ
1328 | ArmOpcode::StpPre
1329 | ArmOpcode::StpOffset
1330 )
1331 }
1332
1333 fn is_branch_or_call_opcode(op: ArmOpcode) -> bool {
1334 matches!(
1335 op,
1336 ArmOpcode::B
1337 | ArmOpcode::BCond
1338 | ArmOpcode::Bl
1339 | ArmOpcode::Blr
1340 | ArmOpcode::Ret
1341 | ArmOpcode::Cbz
1342 | ArmOpcode::Cbnz
1343 | ArmOpcode::Tbz
1344 | ArmOpcode::Tbnz
1345 )
1346 }
1347
1348 /// Reorder physical argument-register copies immediately before calls so later
1349 /// sources are not clobbered by earlier destination writes.
1350 ///
1351 /// For an indirect call (`BLR Xn`), the target register `Xn` is a
1352 /// live use of the call. If `Xn` lies in the argument-register
1353 /// range (x0..x7) and one of the pending arg-copy moves writes to
1354 /// it, the function pointer would be clobbered before the branch.
1355 /// Save it to a non-arg scratch register (`x10`, distinct from
1356 /// `x9` that `rewrite_call_arg_copies` uses for cycle-breaking) and
1357 /// rewrite the BLR's operand. This was hidden as long as
1358 /// indirect-call targets landed in callee-saved registers (which
1359 /// can never alias x0..x7) but surfaces once non-call-crossing
1360 /// values prefer caller-saved.
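///
/// Sketch of the BLR-target rescue (registers invented for illustration),
/// where the target sits in `x2` and an arg-setup mov also writes `x2`:
///
/// ```text
/// // before:              // after:
/// mov x2, x5              mov x10, x2
/// blr x2                  mov x2, x5
///                         blr x10
/// ```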
1361 pub fn parallelize_call_arg_moves(mf: &mut MachineFunction) {
1362 for block in &mut mf.blocks {
1363 let mut rebuilt: Vec<MachineInst> = Vec::with_capacity(block.insts.len());
1364 for inst in std::mem::take(&mut block.insts) {
1365 if matches!(inst.opcode, ArmOpcode::Bl | ArmOpcode::Blr) {
1366 let mut start = rebuilt.len();
1367 while start > 0 && is_call_arg_copy(&rebuilt[start - 1]) {
1368 start -= 1;
1369 }
1370 let mut adjusted = inst;
1371 if matches!(adjusted.opcode, ArmOpcode::Blr) {
1372 let target_phys = match adjusted.operands.first() {
1373 Some(MachineOperand::PhysReg(t)) => Some(*t),
1374 _ => None,
1375 };
1376 if let Some(target) = target_phys {
1377 if is_call_arg_reg(target) {
1378 let target_alias = phys_reg_alias(target);
1379 let writes_target = rebuilt[start..].iter().any(|i| {
1380 matches!(i.operands.first(),
1381 Some(MachineOperand::PhysReg(d))
1382 if phys_reg_alias(*d) == target_alias)
1383 });
1384 if writes_target {
1385 let scratch = blr_target_scratch(target);
1386 let save = MachineInst {
1387 opcode: move_opcode_for_phys(target),
1388 operands: vec![
1389 MachineOperand::PhysReg(scratch),
1390 MachineOperand::PhysReg(target),
1391 ],
1392 def: None,
1393 };
1394 rebuilt.insert(start, save);
1395 adjusted.operands[0] = MachineOperand::PhysReg(scratch);
1396 start += 1;
1397 }
1398 }
1399 }
1400 }
1401 if start < rebuilt.len() {
1402 let pending = rebuilt.split_off(start);
1403 rebuilt.extend(rewrite_call_arg_copies(pending));
1404 }
1405 rebuilt.push(adjusted);
1406 } else {
1407 rebuilt.push(inst);
1408 }
1409 }
1410 block.insts = rebuilt;
1411 }
1412 }
1413
1414 /// Scratch used to preserve a BLR target across parallel arg
1415 /// setup. Distinct from the `x9` / `d29` used by
1416 /// `rewrite_call_arg_copies` for cycle-breaking so the two scratch
1417 /// uses never collide.
1418 fn blr_target_scratch(target: PhysReg) -> PhysReg {
1419 match target {
1420 PhysReg::Gp(_) => PhysReg::Gp(10),
1421 PhysReg::Gp32(_) => PhysReg::Gp32(10),
1422 _ => panic!("blr_target_scratch: BLR target is always a 64/32-bit GP register"),
1423 }
1424 }
1425
1426 // ---- Helpers ----
1427
1428 fn is_call_arg_copy(inst: &MachineInst) -> bool {
1429 matches!(inst.opcode, ArmOpcode::MovReg | ArmOpcode::FmovReg)
1430 && matches!(
1431 inst.operands.as_slice(),
1432 [MachineOperand::PhysReg(dst), MachineOperand::PhysReg(src)]
1433 if is_call_arg_reg(*dst) && !matches!(src, PhysReg::Xzr | PhysReg::Wzr)
1434 )
1435 }
1436
1437 /// Mirror of `is_call_arg_copy` for the function-entry direction:
1438 /// a phys-to-phys mov whose *source* is an incoming arg register.
1439 fn is_arg_receipt_copy(inst: &MachineInst) -> bool {
1440 matches!(inst.opcode, ArmOpcode::MovReg | ArmOpcode::FmovReg)
1441 && matches!(
1442 inst.operands.as_slice(),
1443 [MachineOperand::PhysReg(_), MachineOperand::PhysReg(src)]
1444 if is_call_arg_reg(*src)
1445 )
1446 }
1447
1448 fn is_call_arg_reg(reg: PhysReg) -> bool {
1449 match reg {
1450 PhysReg::Gp(n) | PhysReg::Gp32(n) => n < 8,
1451 PhysReg::Fp(n) | PhysReg::Fp32(n) => n < 8,
1452 _ => false,
1453 }
1454 }
1455
1456 #[derive(Clone, Copy, PartialEq, Eq)]
1457 enum PhysRegAlias {
1458 Gp(u8),
1459 Fp(u8),
1460 }
1461
1462 fn phys_reg_alias(reg: PhysReg) -> Option<PhysRegAlias> {
1463 match reg {
1464 PhysReg::Gp(n) | PhysReg::Gp32(n) => Some(PhysRegAlias::Gp(n)),
1465 PhysReg::Fp(n) | PhysReg::Fp32(n) => Some(PhysRegAlias::Fp(n)),
1466 _ => None,
1467 }
1468 }
1469
1470 fn scratch_phys_for(reg: PhysReg) -> PhysReg {
1471 match reg {
1472 PhysReg::Gp(_) => PhysReg::Gp(9),
1473 PhysReg::Gp32(_) => PhysReg::Gp32(9),
1474 PhysReg::Fp(_) => PhysReg::Fp(29),
1475 PhysReg::Fp32(_) => PhysReg::Fp32(29),
1476 _ => panic!("call-arg scratch requested for non-register operand"),
1477 }
1478 }
1479
1480 fn move_opcode_for_phys(reg: PhysReg) -> ArmOpcode {
1481 match reg {
1482 PhysReg::Fp(_) | PhysReg::Fp32(_) => ArmOpcode::FmovReg,
1483 PhysReg::Gp(_) | PhysReg::Gp32(_) => ArmOpcode::MovReg,
1484 _ => panic!("move opcode requested for non-register operand"),
1485 }
1486 }
1487
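/// Resolve a pending run of phys-to-phys arg copies as a parallel copy:
/// repeatedly emit any move whose destination is not another pending move's
/// source, and break cycles by routing one source through the per-class
/// scratch (x9 / d29). A two-register swap (registers invented for
/// illustration) resolves to:
///
/// ```text
/// // pending: x0 <- x1, x1 <- x0
/// mov x9, x1
/// mov x1, x0
/// mov x0, x9
/// ```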
1488 fn rewrite_call_arg_copies(pending_moves: Vec<MachineInst>) -> Vec<MachineInst> {
1489 let mut pending: Vec<(ArmOpcode, PhysReg, PhysReg)> = pending_moves
1490 .into_iter()
1491 .map(|inst| match inst.operands.as_slice() {
1492 [MachineOperand::PhysReg(dst), MachineOperand::PhysReg(src)] => {
1493 (inst.opcode, *dst, *src)
1494 }
1495 _ => panic!("call-arg copy rewrite saw unexpected operand shape"),
1496 })
1497 .collect();
1498 let mut rewritten = Vec::with_capacity(pending.len() + 1);
1499
1500 while !pending.is_empty() {
1501 let safe_idx = (0..pending.len()).find(|&i| {
1502 let (_, dst, _) = pending[i];
1503 let dst_alias = phys_reg_alias(dst).expect("call-arg copy dst should alias");
1504 !pending
1505 .iter()
1506 .enumerate()
1507 .any(|(j, &(_, _, src))| j != i && phys_reg_alias(src) == Some(dst_alias))
1508 });
1509
1510 if let Some(idx) = safe_idx {
1511 let (opcode, dst, src) = pending.remove(idx);
1512 rewritten.push(MachineInst {
1513 opcode,
1514 operands: vec![MachineOperand::PhysReg(dst), MachineOperand::PhysReg(src)],
1515 def: None,
1516 });
1517 continue;
1518 }
1519
1520 let (_, _, src) = pending[0];
1521 let scratch = scratch_phys_for(src);
1522 rewritten.push(MachineInst {
1523 opcode: move_opcode_for_phys(src),
1524 operands: vec![
1525 MachineOperand::PhysReg(scratch),
1526 MachineOperand::PhysReg(src),
1527 ],
1528 def: None,
1529 });
1530 pending[0].2 = scratch;
1531 }
1532
1533 rewritten
1534 }
1535
1536 fn expire_intervals(
1537 active: &mut Vec<(u8, u32, VRegId)>,
1538 free_caller: &mut Vec<u8>,
1539 free_callee: &mut Vec<u8>,
1540 callee_range: &std::ops::RangeInclusive<u8>,
1541 pos: u32,
1542 ) {
1543 let mut i = 0;
1544 while i < active.len() {
1545 if active[i].1 < pos {
1546 let (reg, _, _) = active.remove(i);
1547 if callee_range.contains(&reg) {
1548 free_callee.push(reg);
1549 } else {
1550 free_caller.push(reg);
1551 }
1552 } else {
1553 i += 1;
1554 }
1555 }
1556 }
1557
1558 /// Scratch registers for spill code. Multiple scratches needed when an
1559 /// instruction has multiple spilled operands.
1560 const GP_SPILL_SCRATCH: [u8; 3] = [9, 10, 11];
1561 const FP_SPILL_SCRATCH: [u8; 3] = [29, 30, 31];
1562
1563 fn spill_scratch(class: RegClass, idx: usize) -> (PhysReg, ArmOpcode) {
1564 match class {
1565 RegClass::Fp64 => (PhysReg::Fp(FP_SPILL_SCRATCH[idx % 3]), ArmOpcode::LdrFpImm),
1566 RegClass::Fp32 => (
1567 PhysReg::Fp32(FP_SPILL_SCRATCH[idx % 3]),
1568 ArmOpcode::LdrFpImm,
1569 ),
1570 // 128-bit vector spills/fills via LdrQ — same FP scratch
1571 // bank (the V registers ARE the 128-bit form of the same
1572 // physical D/S registers).
1573 RegClass::V128 => (PhysReg::Fp(FP_SPILL_SCRATCH[idx % 3]), ArmOpcode::LdrQ),
1574 RegClass::Gp32 => (PhysReg::Gp32(GP_SPILL_SCRATCH[idx % 3]), ArmOpcode::LdrImm),
1575 RegClass::Gp64 => (PhysReg::Gp(GP_SPILL_SCRATCH[idx % 3]), ArmOpcode::LdrImm),
1576 }
1577 }
1578
1579 #[cfg(test)]
1580 mod tests {
1581 use super::*;
1582 use crate::codegen::isel::select_function;
1583 use crate::ir::builder::FuncBuilder;
1584 use crate::ir::inst::*;
1585 use crate::ir::types::*;
1586
1587 #[test]
1588 fn linear_scan_assigns_registers() {
1589 let mut func = Function::new("test".into(), vec![], IrType::Void);
1590 {
1591 let mut b = FuncBuilder::new(&mut func);
1592 let x = b.const_i32(10);
1593 let y = b.const_i32(20);
1594 let _z = b.iadd(x, y);
1595 b.ret_void();
1596 }
1597 let mut mf = select_function(&func);
1598 let result = linear_scan(&mut mf);
1599 // Should have assignments for the vregs.
1600 assert!(
1601 !result.assignments.is_empty(),
1602 "should have register assignments"
1603 );
1604 // No spills needed for 3 vregs.
1605 assert!(
1606 result.spills.is_empty(),
1607 "should not spill with only 3 vregs"
1608 );
1609 }
1610
1611 #[test]
1612 fn linear_scan_no_x18() {
1613 let mut func = Function::new("test".into(), vec![], IrType::Void);
1614 {
1615 let mut b = FuncBuilder::new(&mut func);
1616 // Create many vregs to exercise the allocator.
1617 let mut vals = Vec::new();
1618 for i in 0..20 {
1619 vals.push(b.const_i32(i));
1620 }
1621 // Use them all.
1622 let mut sum = vals[0];
1623 for &v in &vals[1..] {
1624 sum = b.iadd(sum, v);
1625 }
1626 b.ret_void();
1627 }
1628 let mut mf = select_function(&func);
1629 let result = linear_scan(&mut mf);
1630 // x18 must never be assigned.
1631 for phys in result.assignments.values() {
1632 match phys {
1633 PhysReg::Gp(18) | PhysReg::Gp32(18) => {
1634 panic!("x18 assigned — platform reserved!");
1635 }
1636 _ => {}
1637 }
1638 }
1639 }
1640
1641 #[test]
1642 fn coalesce_eliminates_self_moves() {
1643 let mut mf = MachineFunction::new("test".into());
1644 let reg = PhysReg::Gp(9);
1645 mf.blocks[0].insts.push(MachineInst {
1646 opcode: ArmOpcode::MovReg,
1647 operands: vec![MachineOperand::PhysReg(reg), MachineOperand::PhysReg(reg)],
1648 def: None,
1649 });
1650 mf.blocks[0].insts.push(MachineInst {
1651 opcode: ArmOpcode::Ret,
1652 operands: vec![],
1653 def: None,
1654 });
1655 assert_eq!(mf.blocks[0].insts.len(), 2);
1656 coalesce_moves(&mut mf);
1657 assert_eq!(
1658 mf.blocks[0].insts.len(),
1659 1,
1660 "self-move should be eliminated"
1661 );
1662 }
1663
1664 #[test]
1665 fn parallelize_call_arg_moves_preserves_later_source_registers() {
1666 let mut mf = MachineFunction::new("test".into());
1667 mf.blocks[0].insts.push(MachineInst {
1668 opcode: ArmOpcode::MovReg,
1669 operands: vec![
1670 MachineOperand::PhysReg(PhysReg::Gp(0)),
1671 MachineOperand::PhysReg(PhysReg::Gp(28)),
1672 ],
1673 def: None,
1674 });
1675 mf.blocks[0].insts.push(MachineInst {
1676 opcode: ArmOpcode::MovReg,
1677 operands: vec![
1678 MachineOperand::PhysReg(PhysReg::Gp(1)),
1679 MachineOperand::PhysReg(PhysReg::Gp(0)),
1680 ],
1681 def: None,
1682 });
1683 mf.blocks[0].insts.push(MachineInst {
1684 opcode: ArmOpcode::Blr,
1685 operands: vec![MachineOperand::PhysReg(PhysReg::Gp(12))],
1686 def: None,
1687 });
1688
1689 parallelize_call_arg_moves(&mut mf);
1690
1691 assert_eq!(mf.blocks[0].insts.len(), 3);
1692 assert_eq!(
1693 mf.blocks[0].insts[0].operands,
1694 vec![
1695 MachineOperand::PhysReg(PhysReg::Gp(1)),
1696 MachineOperand::PhysReg(PhysReg::Gp(0)),
1697 ]
1698 );
1699 assert_eq!(
1700 mf.blocks[0].insts[1].operands,
1701 vec![
1702 MachineOperand::PhysReg(PhysReg::Gp(0)),
1703 MachineOperand::PhysReg(PhysReg::Gp(28)),
1704 ]
1705 );
1706 }
1707
1708 #[test]
1709 fn parallelize_call_arg_moves_breaks_cycles_with_scratch() {
1710 let mut mf = MachineFunction::new("test".into());
1711 mf.blocks[0].insts.push(MachineInst {
1712 opcode: ArmOpcode::MovReg,
1713 operands: vec![
1714 MachineOperand::PhysReg(PhysReg::Gp(0)),
1715 MachineOperand::PhysReg(PhysReg::Gp(1)),
1716 ],
1717 def: None,
1718 });
1719 mf.blocks[0].insts.push(MachineInst {
1720 opcode: ArmOpcode::MovReg,
1721 operands: vec![
1722 MachineOperand::PhysReg(PhysReg::Gp(1)),
1723 MachineOperand::PhysReg(PhysReg::Gp(0)),
1724 ],
1725 def: None,
1726 });
1727 mf.blocks[0].insts.push(MachineInst {
1728 opcode: ArmOpcode::Bl,
1729 operands: vec![MachineOperand::Extern("_callee".into())],
1730 def: None,
1731 });
1732
1733 parallelize_call_arg_moves(&mut mf);
1734
1735 assert_eq!(mf.blocks[0].insts.len(), 4);
1736 assert_eq!(
1737 mf.blocks[0].insts[0].operands,
1738 vec![
1739 MachineOperand::PhysReg(PhysReg::Gp(9)),
1740 MachineOperand::PhysReg(PhysReg::Gp(1)),
1741 ]
1742 );
1743 assert_eq!(
1744 mf.blocks[0].insts[1].operands,
1745 vec![
1746 MachineOperand::PhysReg(PhysReg::Gp(1)),
1747 MachineOperand::PhysReg(PhysReg::Gp(0)),
1748 ]
1749 );
1750 assert_eq!(
1751 mf.blocks[0].insts[2].operands,
1752 vec![
1753 MachineOperand::PhysReg(PhysReg::Gp(0)),
1754 MachineOperand::PhysReg(PhysReg::Gp(9)),
1755 ]
1756 );
1757 }
1758
1759 #[test]
1760 fn parallelize_call_arg_moves_ignores_zero_materialization_into_arg_regs() {
1761 let mut mf = MachineFunction::new("test".into());
1762 mf.blocks[0].insts.push(MachineInst {
1763 opcode: ArmOpcode::MovReg,
1764 operands: vec![
1765 MachineOperand::PhysReg(PhysReg::Gp(4)),
1766 MachineOperand::PhysReg(PhysReg::Xzr),
1767 ],
1768 def: None,
1769 });
1770 mf.blocks[0].insts.push(MachineInst {
1771 opcode: ArmOpcode::MovReg,
1772 operands: vec![
1773 MachineOperand::PhysReg(PhysReg::Gp(0)),
1774 MachineOperand::PhysReg(PhysReg::Gp(4)),
1775 ],
1776 def: None,
1777 });
1778 mf.blocks[0].insts.push(MachineInst {
1779 opcode: ArmOpcode::Blr,
1780 operands: vec![MachineOperand::PhysReg(PhysReg::Gp(19))],
1781 def: None,
1782 });
1783
1784 parallelize_call_arg_moves(&mut mf);
1785
1786 assert_eq!(mf.blocks[0].insts.len(), 3);
1787 assert_eq!(
1788 mf.blocks[0].insts[0].operands,
1789 vec![
1790 MachineOperand::PhysReg(PhysReg::Gp(4)),
1791 MachineOperand::PhysReg(PhysReg::Xzr),
1792 ]
1793 );
1794 assert_eq!(
1795 mf.blocks[0].insts[1].operands,
1796 vec![
1797 MachineOperand::PhysReg(PhysReg::Gp(0)),
1798 MachineOperand::PhysReg(PhysReg::Gp(4)),
1799 ]
1800 );
1801 }
1802
1803 #[test]
1804 fn linear_scan_prefers_caller_saved_when_no_calls_cross() {
1805 // No call-crossing intervals — every assigned vreg should
1806 // land on a caller-saved register and callee_saved_used
1807 // should stay empty (no prologue STP/LDP overhead).
1808 let mut func = Function::new("test".into(), vec![], IrType::Void);
1809 {
1810 let mut b = FuncBuilder::new(&mut func);
1811 let x = b.const_i32(1);
1812 let y = b.const_i32(2);
1813 let z = b.iadd(x, y);
1814 let _w = b.iadd(z, x);
1815 b.ret_void();
1816 }
1817 let mut mf = select_function(&func);
1818 let result = linear_scan(&mut mf);
1819 assert!(
1820 result.callee_saved_used.is_empty(),
1821 "no callee-saved expected for call-free function, got {:?}",
1822 result.callee_saved_used
1823 );
1824 for phys in result.assignments.values() {
1825 if let PhysReg::Gp(n) | PhysReg::Gp32(n) = phys {
1826 assert!(
1827 !GP_CALLEE_SAVED.contains(n),
1828 "vreg landed on callee-saved x{} despite caller-saved availability",
1829 n
1830 );
1831 }
1832 }
1833 }
1834
1835 #[test]
1836 fn parallelize_entry_arg_moves_resolves_swap_in_receipts() {
1837 let mut mf = MachineFunction::new("test".into());
1838 // Frame setup.
1839 mf.blocks[0].insts.push(MachineInst {
1840 opcode: ArmOpcode::StpPre,
1841 operands: vec![],
1842 def: None,
1843 });
1844 mf.blocks[0].insts.push(MachineInst {
1845 opcode: ArmOpcode::AddImm,
1846 operands: vec![],
1847 def: None,
1848 });
1849 // Arg-receipt swap: mov x4, x3; mov x3, x4. A naive
1850 // sequential emit would lose the original x4; the
1851 // parallelizer must route one side through scratch x9.
1852 mf.blocks[0].insts.push(MachineInst {
1853 opcode: ArmOpcode::MovReg,
1854 operands: vec![
1855 MachineOperand::PhysReg(PhysReg::Gp(4)),
1856 MachineOperand::PhysReg(PhysReg::Gp(3)),
1857 ],
1858 def: None,
1859 });
1860 mf.blocks[0].insts.push(MachineInst {
1861 opcode: ArmOpcode::MovReg,
1862 operands: vec![
1863 MachineOperand::PhysReg(PhysReg::Gp(3)),
1864 MachineOperand::PhysReg(PhysReg::Gp(4)),
1865 ],
1866 def: None,
1867 });
1868 parallelize_entry_arg_moves(&mut mf);
1869 let body: Vec<&[MachineOperand]> = mf.blocks[0].insts[2..]
1870 .iter()
1871 .map(|i| i.operands.as_slice())
1872 .collect();
1873 // Expect the parallelizer to insert a scratch save then
1874 // emit the two receipts without clobbering.
1875 assert_eq!(body.len(), 3, "swap pair should expand to 3 insts");
1876 assert_eq!(
1877 body[0],
1878 &[
1879 MachineOperand::PhysReg(PhysReg::Gp(9)),
1880 MachineOperand::PhysReg(PhysReg::Gp(3)),
1881 ]
1882 );
1883 }
1884
1885 #[test]
1886 fn parallelize_entry_arg_moves_tolerates_intervening_store() {
1887 let mut mf = MachineFunction::new("test".into());
1888 mf.blocks[0].insts.push(MachineInst {
1889 opcode: ArmOpcode::StpPre,
1890 operands: vec![],
1891 def: None,
1892 });
1893 mf.blocks[0].insts.push(MachineInst {
1894 opcode: ArmOpcode::AddImm,
1895 operands: vec![],
1896 def: None,
1897 });
1898 // mov x10, x0 (transparent: dst x10 outside arg-reg range)
1899 mf.blocks[0].insts.push(MachineInst {
1900 opcode: ArmOpcode::MovReg,
1901 operands: vec![
1902 MachineOperand::PhysReg(PhysReg::Gp(10)),
1903 MachineOperand::PhysReg(PhysReg::Gp(0)),
1904 ],
1905 def: None,
1906 });
1907 // str x10, [fp, #-56] (store, transparent — reads x10)
1908 mf.blocks[0].insts.push(MachineInst {
1909 opcode: ArmOpcode::StrImm,
1910 operands: vec![
1911 MachineOperand::PhysReg(PhysReg::Gp(10)),
1912 MachineOperand::PhysReg(PhysReg::FP),
1913 MachineOperand::Imm(-56),
1914 ],
1915 def: None,
1916 });
1917 // Swap pair after the store — must still be parallelized.
1918 mf.blocks[0].insts.push(MachineInst {
1919 opcode: ArmOpcode::MovReg,
1920 operands: vec![
1921 MachineOperand::PhysReg(PhysReg::Gp(4)),
1922 MachineOperand::PhysReg(PhysReg::Gp(3)),
1923 ],
1924 def: None,
1925 });
1926 mf.blocks[0].insts.push(MachineInst {
1927 opcode: ArmOpcode::MovReg,
1928 operands: vec![
1929 MachineOperand::PhysReg(PhysReg::Gp(3)),
1930 MachineOperand::PhysReg(PhysReg::Gp(4)),
1931 ],
1932 def: None,
1933 });
1934 parallelize_entry_arg_moves(&mut mf);
1935 // The store should still appear (kept in place); the swap
1936 // pair should have expanded to 3 movs via scratch.
1937 let opcodes: Vec<ArmOpcode> = mf.blocks[0].insts.iter().map(|i| i.opcode).collect();
1938 assert!(
1939 opcodes.contains(&ArmOpcode::StrImm),
1940 "store should be preserved across parallelization"
1941 );
1942 let mov_count = opcodes
1943 .iter()
1944 .filter(|&&op| op == ArmOpcode::MovReg)
1945 .count();
1946 assert_eq!(
1947 mov_count, 4,
1948 "expected 4 movs (1 receipt + 3 from swap expansion), got {}",
1949 mov_count
1950 );
1951 }
1952
1953 #[test]
1954 fn parallelize_call_arg_moves_routes_blr_target_through_scratch() {
1955 let mut mf = MachineFunction::new("test".into());
1956 // ldr x0, [...] — function pointer loaded into x0
1957 mf.blocks[0].insts.push(MachineInst {
1958 opcode: ArmOpcode::LdrImm,
1959 operands: vec![
1960 MachineOperand::PhysReg(PhysReg::Gp(0)),
1961 MachineOperand::PhysReg(PhysReg::FP),
1962 MachineOperand::Imm(-8),
1963 ],
1964 def: None,
1965 });
1966 // mov x0, x12 — arg-setup also targets x0 (clobbers fn ptr).
1967 mf.blocks[0].insts.push(MachineInst {
1968 opcode: ArmOpcode::MovReg,
1969 operands: vec![
1970 MachineOperand::PhysReg(PhysReg::Gp(0)),
1971 MachineOperand::PhysReg(PhysReg::Gp(12)),
1972 ],
1973 def: None,
1974 });
1975 mf.blocks[0].insts.push(MachineInst {
1976 opcode: ArmOpcode::Blr,
1977 operands: vec![MachineOperand::PhysReg(PhysReg::Gp(0))],
1978 def: None,
1979 });
1980 parallelize_call_arg_moves(&mut mf);
1981 // The ldr stays first; a `mov x10, x0` follows it to preserve the
1982 // fn pointer in scratch, and the BLR's operand becomes x10.
1983 assert_eq!(mf.blocks[0].insts[0].opcode, ArmOpcode::LdrImm);
1984 assert_eq!(mf.blocks[0].insts[1].opcode, ArmOpcode::MovReg);
1985 assert_eq!(
1986 mf.blocks[0].insts[1].operands,
1987 vec![
1988 MachineOperand::PhysReg(PhysReg::Gp(10)),
1989 MachineOperand::PhysReg(PhysReg::Gp(0)),
1990 ]
1991 );
1992 let blr = mf.blocks[0].insts.last().unwrap();
1993 assert_eq!(blr.opcode, ArmOpcode::Blr);
1994 assert_eq!(
1995 blr.operands,
1996 vec![MachineOperand::PhysReg(PhysReg::Gp(10))]
1997 );
1998 }
1999
2000 #[test]
2001 fn linear_scan_splits_call_crossing_when_callee_saved_exhausted() {
2002 // 12 vregs all live across a single BL force the allocator past
2003 // the 10-deep callee-saved pool. Without splitting, the overflow
2004 // vregs are fully spilled; with splitting, each gets a caller-saved
2005 // register before the call, an str/ldr bridge across it, and a
2006 // caller-saved register after it, so both halves stay in registers
2007 // and the only memory traffic is the bridge.
2008 let mut mf = MachineFunction::new("test".into());
2009 let vs: Vec<VRegId> = (0..12).map(|_| mf.new_vreg(RegClass::Gp64)).collect();
2010 for (i, &v) in vs.iter().enumerate() {
2011 mf.blocks[0].insts.push(MachineInst {
2012 opcode: ArmOpcode::Movz,
2013 operands: vec![MachineOperand::VReg(v), MachineOperand::Imm(i as i64 + 1)],
2014 def: Some(v),
2015 });
2016 }
2017 mf.blocks[0].insts.push(MachineInst {
2018 opcode: ArmOpcode::Bl,
2019 operands: vec![MachineOperand::Extern("_foo".into())],
2020 def: None,
2021 });
2022 for &v in &vs {
2023 mf.blocks[0].insts.push(MachineInst {
2024 opcode: ArmOpcode::MovReg,
2025 operands: vec![
2026 MachineOperand::PhysReg(PhysReg::Gp(0)),
2027 MachineOperand::VReg(v),
2028 ],
2029 def: None,
2030 });
2031 }
2032 mf.blocks[0].insts.push(MachineInst {
2033 opcode: ArmOpcode::Ret,
2034 operands: vec![],
2035 def: None,
2036 });
2037 let result = linear_scan(&mut mf);
2038 assert!(
2039 !result.split_records.is_empty(),
2040 "expected splits when 12 call-crossing vregs collide with the 10-deep callee-saved pool"
2041 );
2042 for r in &result.split_records {
2043 // pre-half must be on a caller-saved register; that's
2044 // the entire point of splitting (the callee-saved tier
2045 // is exhausted at split time).
2046 if let PhysReg::Gp(n) | PhysReg::Gp32(n) = r.pre_phys {
2047 assert!(
2048 !GP_CALLEE_SAVED.contains(&n),
2049 "split pre_phys should be caller-saved, got x{}",
2050 n
2051 );
2052 }
2053 // bridge_slot must be a real frame offset.
2054 assert!(r.bridge_slot < 0, "bridge_slot should be FP-negative");
2055 }
2056 }
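
    // Sketch tests for the two helpers above. Both exercise only behaviour
    // that is visible in this file (the (reg, end, vreg) layout of the
    // active-interval tuples and the reserved scratch banks), nothing beyond.
    #[test]
    fn expire_intervals_returns_registers_to_matching_pool() {
        let mut mf = MachineFunction::new("test".into());
        let v0 = mf.new_vreg(RegClass::Gp64);
        let v1 = mf.new_vreg(RegClass::Gp64);
        let v2 = mf.new_vreg(RegClass::Gp64);
        // x12 ends at 3 (caller-saved), x19 ends at 4 (callee-saved),
        // x13 is still live at position 10.
        let mut active = vec![(12u8, 3u32, v0), (19u8, 4u32, v1), (13u8, 10u32, v2)];
        let mut free_caller: Vec<u8> = Vec::new();
        let mut free_callee: Vec<u8> = Vec::new();
        expire_intervals(
            &mut active,
            &mut free_caller,
            &mut free_callee,
            &GP_CALLEE_SAVED,
            5,
        );
        assert_eq!(active.len(), 1, "only the interval live past pos 5 remains");
        assert_eq!(active[0].0, 13);
        assert_eq!(free_caller, vec![12u8], "x12 returns to the caller-saved pool");
        assert_eq!(free_callee, vec![19u8], "x19 returns to the callee-saved pool");
    }

    #[test]
    fn spill_scratch_cycles_through_reserved_registers() {
        // GP reloads wrap around x9-x11 and never touch the allocation order.
        for idx in 0..6 {
            let (reg, op) = spill_scratch(RegClass::Gp64, idx);
            assert_eq!(op, ArmOpcode::LdrImm);
            match reg {
                PhysReg::Gp(n) => {
                    assert_eq!(n, GP_SPILL_SCRATCH[idx % 3]);
                    assert!(
                        !GP_ALLOC_ORDER.contains(&n),
                        "scratch must stay out of the allocation pool"
                    );
                }
                _ => panic!("GP64 spill scratch should be a 64-bit GP register"),
            }
        }
        // V128 reloads use LdrQ on the reserved FP scratch bank.
        let (reg, op) = spill_scratch(RegClass::V128, 0);
        assert_eq!(op, ArmOpcode::LdrQ);
        match reg {
            PhysReg::Fp(n) => assert!(FP_SPILL_SCRATCH.contains(&n)),
            _ => panic!("V128 spill scratch should use the FP bank"),
        }
    }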
2057 }
2058