1 //! Linear scan register allocator.
2 //!
3 //! Replaces the naive spill-everything strategy with proper register assignment.
4 //! Algorithm: sort live intervals by start, walk in order, assign physical registers.
5 //! When no register is available, spill the active interval ending furthest (or the
6 //! current one if it ends even later). Insert ldr/str spill code.
7 //!
8 //! Callee-saved registers (x19-x28, d8-d15) are tracked — if used, the function
9 //! prologue/epilogue must save/restore them.
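//!
//! A rough driver sequence, as a sketch only (the real pass manager may order
//! things differently; `func` and `select_function` stand in for whatever
//! produced the `MachineFunction`):
//!
//! ```ignore
//! let mut mf = select_function(&func);
//! let liveness = compute_liveness(&mut mf);
//! let alloc = linear_scan(&mut mf);
//! apply_allocation(&mut mf, &alloc, &liveness);
//! parallelize_entry_arg_moves(&mut mf);
//! parallelize_call_arg_moves(&mut mf);
//! insert_split_bridges(&mut mf, &alloc.split_records);
//! insert_callee_saves(&mut mf, &alloc.callee_saved_used);
//! coalesce_moves(&mut mf);
//! ```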
10
11 use super::liveness::compute_liveness;
12 use super::mir::*;
13 use std::collections::{HashMap, HashSet};
14
15 /// GP registers available for allocation (excludes x18, x29, x30, x31/sp).
16 /// Ordered: caller-saved first (prefer these to avoid save/restore overhead),
17 /// then callee-saved.
18 ///
19 /// x8 is reserved for large-offset addressing during emission.
20 /// x16, x17 are the linker scratch registers (clobbered by dynamic
21 /// dispatch stubs).
22 ///
23 /// x9, x10, x11 are reserved **exclusively** for spill reloads
24 /// (see GP_SPILL_SCRATCH). They used to be in the allocation pool
25 /// under the theory that spill code could "borrow" them when
26 /// they're temporarily free, but that assumption is only valid
27 /// when the reload is inserted at a point where the borrowed
28 /// register is dead — and the spill code had no reliable way to
29 /// determine that. The result was a reload clobbering a
30 /// freshly-computed live value (the slice-print crash at -O1+).
31 /// Pinning x9-x11 out of the allocation pool costs 3 vregs worth
32 /// of pressure and guarantees reloads never clash.
33 const GP_ALLOC_ORDER: [u8; 22] = [
34 // Caller-saved (temporary, no save needed)
35 12, 13, 14, 15, // x12-x15
36 0, 1, 2, 3, 4, 5, 6, 7, // x0-x7 (args, but available between calls)
37 // Callee-saved (must save/restore if used)
38 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
39 ];
40
41 /// FP registers available for allocation. Same rationale as GP:
42 /// d29, d30, d31 are reserved exclusively as spill reload scratch.
43 const FP_ALLOC_ORDER: [u8; 29] = [
44 // Caller-saved
45 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 0, 1, 2, 3, 4, 5, 6, 7,
46 // Callee-saved
47 8, 9, 10, 11, 12, 13, 14, 15,
48 ];
49
50 /// Callee-saved GP register range.
51 const GP_CALLEE_SAVED: std::ops::RangeInclusive<u8> = 19..=28;
52 /// Callee-saved FP register range.
53 const FP_CALLEE_SAVED: std::ops::RangeInclusive<u8> = 8..=15;
54
55 /// Bytes a single spill of a vreg of class `class` occupies. NEON
56 /// vectors need 16; everything else fits in 8. Used to size frame
57 /// slots so the LdrQ/StrQ pair on a V128 spill operates on a slot
58 /// that's actually 16 bytes wide and 16-byte aligned.
59 fn spill_slot_size(class: RegClass) -> u32 {
60 match class {
61 RegClass::V128 => 16,
62 _ => 8,
63 }
64 }
65
66 /// Where a split interval's *post-call* half lives. `Allocated`
67 /// is the win — the post-half got a register, so the only memory
68 /// traffic is the str/ldr that bridges the call. `Spilled` is the
69 /// fall-back: the post-half couldn't be allocated to a register
70 /// either, so each post-call use loads directly from `bridge_slot`
71 /// (which is the same slot the pre-half wrote into before the
72 /// call), and we omit the redundant ldr.
73 #[derive(Debug, Clone, Copy)]
74 pub enum PostHalf {
75 Allocated(PhysReg),
76 Spilled,
77 }
78
79 /// One record per call boundary at which a live range was split.
80 /// Drives `apply_allocation` to (a) rewrite operands of `vreg`
81 /// based on whether they precede or follow the call, and the
82 /// post-pass `insert_split_bridges` to (b) emit `str pre_phys,
83 /// [bridge_slot]` before the call plus, if the post-half got a
84 /// register, `ldr post_phys, [bridge_slot]` after.
85 ///
86 /// `call_position` is the BL/BLR's instruction position in the
87 /// liveness numbering — used by apply_allocation to compare with
88 /// each operand's cur_pos. `call_index` is the same call's
89 /// 0-based walk-order ordinal among all calls in the function —
90 /// used by insert_split_bridges to match the BL/BLR by ordinal,
91 /// since intervening passes (apply_allocation's spill code) shift
92 /// numerical positions but preserve walk order.
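///
/// As a concrete sketch (register numbers and the slot offset are invented),
/// a record with `pre_phys = x13`, `post = Allocated(x14)` and
/// `bridge_slot = -40` makes `insert_split_bridges` emit roughly:
///
/// ```text
/// str x13, [x29, #-40]   // above the call's arg-setup movs
/// bl  callee
/// ldr x14, [x29, #-40]   // immediately after the call
/// ```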
93 #[derive(Debug, Clone)]
94 pub struct SplitRecord {
95 pub vreg: VRegId,
96 pub call_position: u32,
97 pub call_index: u32,
98 pub pre_phys: PhysReg,
99 pub post: PostHalf,
100 pub bridge_slot: i32,
101 }
102
103 /// Result of register allocation.
104 pub struct AllocResult {
105 /// VRegId → assigned PhysReg. For a split vreg, this holds the
106 /// pre-call (first-half) physreg; the post-half lives in the
107 /// matching `SplitRecord`.
108 pub assignments: HashMap<VRegId, PhysReg>,
109 /// Spilled vregs → stack frame offset.
110 pub spills: HashMap<VRegId, i32>,
111 /// Callee-saved registers that were used and need save/restore.
112 pub callee_saved_used: Vec<PhysReg>,
113 /// Live-range splits performed at call boundaries. Sorted by
114 /// `(vreg, call_position)` for deterministic lookup.
115 pub split_records: Vec<SplitRecord>,
116 }
117
118 /// Run linear scan register allocation on a machine function.
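///
/// Minimal use, mirroring the unit tests at the bottom of this file:
///
/// ```ignore
/// let mut mf = select_function(&func);
/// let alloc = linear_scan(&mut mf);
/// assert!(!alloc.assignments.is_empty());
/// ```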
119 pub fn linear_scan(mf: &mut MachineFunction) -> AllocResult {
120 let liveness = compute_liveness(mf);
121
122 let mut assignments: HashMap<VRegId, PhysReg> = HashMap::new();
123 let mut spills: HashMap<VRegId, i32> = HashMap::new();
124 // Active intervals: (reg_num, interval_end, current_vreg). The
125 // vreg is tracked here so spill victim selection can identify
126 // the *current* holder of a physical register without iterating
127 // the `assignments` HashMap (which accumulates stale historical
128 // entries when registers are reused after expiry, and whose
129 // iteration order is non-deterministic). This fix came out of an audit
130 // during the mem2reg work: the old approach produced flaky
131 // `select_type.f90` failures and non-reproducible builds.
132 let mut active_gp: Vec<(u8, u32, VRegId)> = Vec::new();
133 let mut active_fp: Vec<(u8, u32, VRegId)> = Vec::new();
134 // Two free pools per class. Non-call-crossing intervals draw
135 // from caller-saved first (no save/restore overhead in the
136 // prologue/epilogue); the callee-saved tier is reserved for
137 // call-crossing intervals, with caller-tier exhaustion as a
138 // fall-through. Previously this was a single Vec consumed via
139 // pop(), which returned the *last* element first — quietly
140 // inverting the policy documented above on GP_ALLOC_ORDER and
141 // forcing every function with even a single allocated vreg to
142 // mark x28/x27/... as callee_saved_used.
143 let split_pool = |order: &[u8], callee: &std::ops::RangeInclusive<u8>| -> (Vec<u8>, Vec<u8>) {
144 let mut caller = Vec::new();
145 let mut callees = Vec::new();
146 for &r in order {
147 if callee.contains(&r) {
148 callees.push(r);
149 } else {
150 caller.push(r);
151 }
152 }
153 (caller, callees)
154 };
155 let (mut free_gp_caller, mut free_gp_callee) =
156 split_pool(&GP_ALLOC_ORDER, &GP_CALLEE_SAVED);
157 let (mut free_fp_caller, mut free_fp_callee) =
158 split_pool(&FP_ALLOC_ORDER, &FP_CALLEE_SAVED);
159 let mut callee_saved_used: HashSet<PhysReg> = HashSet::new();
160
161 // Hard-coded PhysReg writes inside instructions: positions
162 // where operand 0 is a PhysReg, e.g. arg-setup
163 // `mov PhysReg(x_i), VReg(value)` immediately before each BL.
164 // The splitter consults this map to avoid pre_phys candidates
165 // that will be clobbered by an arg-setup inside the pre-half's
166 // range — picking such a register would corrupt the bridge str
167 // because by the time the bridge runs the register no longer
168 // holds the pre-half value. We only constrain the splitter
169 // here (not the general allocator) to keep the inflated-
170 // liveness compensations elsewhere intact.
171 let mut phys_writes_gp: HashMap<u8, Vec<u32>> = HashMap::new();
172 let mut phys_writes_fp: HashMap<u8, Vec<u32>> = HashMap::new();
173 {
174 let mut p: u32 = 0;
175 for block in &mf.blocks {
176 for inst in &block.insts {
177 if let Some(MachineOperand::PhysReg(phys)) = inst.operands.first() {
178 match phys {
179 PhysReg::Gp(n) | PhysReg::Gp32(n) => {
180 phys_writes_gp.entry(*n).or_default().push(p);
181 }
182 PhysReg::Fp(n) | PhysReg::Fp32(n) => {
183 phys_writes_fp.entry(*n).or_default().push(p);
184 }
185 _ => {}
186 }
187 }
188 p += 2;
189 }
190 }
191 }
192 let phys_written_in = |phys_n: u8, is_fp: bool, start: u32, end: u32| -> bool {
193 let map = if is_fp {
194 &phys_writes_fp
195 } else {
196 &phys_writes_gp
197 };
198 if let Some(positions) = map.get(&phys_n) {
199 positions.iter().any(|&pp| pp > start && pp <= end)
200 } else {
201 false
202 }
203 };
204
205 // Tracks live-range splits in progress: each entry maps a
206 // synthetic post-half vreg to (original vreg, call position,
207 // bridge slot). After allocation we walk this map to
208 // materialize SplitRecords using whatever physreg/slot the
209 // post-half ended up with.
210 let mut splits_in_progress: HashMap<VRegId, (VRegId, u32, i32)> = HashMap::new();
211
212 // Per-vreg actual def/use position range, computed by direct
213 // walk of the MIR. Used by the splitter to sanity-check
214 // crosses_call before deciding to split: the global liveness
215 // dataflow has a quirk that extends every dest vreg's live
216 // range backward over its own def (op0 of `mov vreg_dst,
217 // vreg_src` is treated as both def and use), inflating short
218 // vreg-to-vreg copy chains to span entire blocks. The
219 // splitter would then see a "call crossing" for a vreg whose
220 // real def/use range is well before any call. Splitting such a
221 // vreg corrupts the chain. We don't fix the dataflow globally
222 // — other regalloc decisions depend on the inflated ranges in
223 // ways that surface latent bugs in arg-setup emission — but we
224 // do consult this tighter map before committing to a split.
225 let mut vreg_actual_range: HashMap<VRegId, (u32, u32)> = HashMap::new();
226 // Vregs defined in more than one MIR block — these are
227 // phi-like (block params receiving values via the parallel-copy
228 // sequence emit_branch_arg_copies inserts at the end of every
229 // predecessor). Linear-position liveness can't represent the
230 // true CFG-aware live set for these, so the splitter must not
231 // act on them: a call block lexically wedged between the def
232 // edge and the use block fakes a crossing point that no actual
233 // control-flow path traverses, and the resulting split assigns
234 // the post-half a different physreg than the pre-half — every
235 // predecessor's parallel-copy lands in pre_phys but every use
236 // inside the loop reads post_phys. (`realworld_affine_shift.f90`
237 // at -O2+ exhibits this: V_iv defined in `if_end_1`, used in
238 // `do_check_3`/`do_body_4`, but `if_then_2` carrying the
239 // recursive call is lexically between if_end_1 and do_check_3.)
240 let mut vreg_def_blocks: HashMap<VRegId, std::collections::HashSet<usize>> =
241 HashMap::new();
242 {
243 let mut p: u32 = 0;
244 for (block_idx, block) in mf.blocks.iter().enumerate() {
245 for inst in &block.insts {
246 let mut touched: Vec<VRegId> = Vec::new();
247 if let Some(d) = inst.def {
248 touched.push(d);
249 vreg_def_blocks.entry(d).or_default().insert(block_idx);
250 }
251 for op in &inst.operands {
252 if let MachineOperand::VReg(v) = op {
253 touched.push(*v);
254 }
255 }
256 for v in touched {
257 let entry = vreg_actual_range.entry(v).or_insert((p, p));
258 if p < entry.0 {
259 entry.0 = p;
260 }
261 if p > entry.1 {
262 entry.1 = p;
263 }
264 }
265 p += 2;
266 }
267 }
268 }
269 let multi_def_vregs: std::collections::HashSet<VRegId> = vreg_def_blocks
270 .iter()
271 .filter(|(_, blocks)| blocks.len() > 1)
272 .map(|(v, _)| *v)
273 .collect();
274
275 // Worklist of intervals to allocate. Starts as a clone of the
276 // pre-sorted liveness intervals; splitting inserts post-half
277 // intervals at their sorted position so the linear-scan
278 // invariant (process in start-position order) is preserved.
279 let mut worklist: Vec<super::liveness::LiveInterval> = liveness.intervals.clone();
280 let mut idx = 0;
281 while idx < worklist.len() {
282 let interval = worklist[idx].clone();
283 // V128 lives in the same physical V/Q register file as
284 // Fp32/Fp64; the regalloc must draw from the FP pool for
285 // any of these classes or the vector vreg ends up in a Gp
286 // register and the emit pass prints `ldr qx21` (invalid asm).
287 let is_fp = matches!(
288 interval.class,
289 RegClass::Fp32 | RegClass::Fp64 | RegClass::V128
290 );
291
292 if is_fp {
293 expire_intervals(
294 &mut active_fp,
295 &mut free_fp_caller,
296 &mut free_fp_callee,
297 &FP_CALLEE_SAVED,
298 interval.start,
299 );
300 } else {
301 expire_intervals(
302 &mut active_gp,
303 &mut free_gp_caller,
304 &mut free_gp_callee,
305 &GP_CALLEE_SAVED,
306 interval.start,
307 );
308 }
309
310 let (active, free_caller, free_callee) = if is_fp {
311 (&mut active_fp, &mut free_fp_caller, &mut free_fp_callee)
312 } else {
313 (&mut active_gp, &mut free_gp_caller, &mut free_gp_callee)
314 };
315
316 let reg_opt = if interval.crosses_call {
317 // Must use callee-saved (caller-saved would be clobbered
318 // by the call).
319 free_callee.pop()
320 } else if let Some(hint) = interval.hint {
321 // Try the hinted register first across both tiers.
322 if let Some(i) = free_caller.iter().position(|&r| r == hint) {
323 Some(free_caller.remove(i))
324 } else if let Some(i) = free_callee.iter().position(|&r| r == hint) {
325 Some(free_callee.remove(i))
326 } else {
327 free_caller.pop().or_else(|| free_callee.pop())
328 }
329 } else {
330 free_caller.pop().or_else(|| free_callee.pop())
331 };
332
333 // Live-range splitting: a call-crossing interval that
334 // can't grab a callee-saved register would otherwise be
335 // full-spilled. If it has a single call crossing AND a
336 // caller-saved register is available right now, split at
337 // the call boundary instead — the pre-half goes to the
338 // caller-saved register, the post-half is allocated as a
339 // fresh interval, and we bridge the call with str/ldr to
340 // a frame slot. Net win: the longer-lived half between
341 // the def and the call avoids spill loads on every use.
342 // Cross-check `crosses_call` against the actual def/use
343 // range from the MIR walk: a vreg whose real range doesn't
344 // straddle the supposed crossing point is a false positive
345 // from inflated liveness, and must NOT be split.
346 let real_crosses = if multi_def_vregs.contains(&interval.vreg) {
347 // Phi-like vreg: positions can't represent the true
348 // live set (see comment above where multi_def_vregs is
349 // built). Refuse to split — the parallel-copy at every
350 // predecessor must land in the same physreg as every
351 // use, and splitting fundamentally breaks that.
352 false
353 } else if let Some(&(real_start, real_end)) =
354 vreg_actual_range.get(&interval.vreg)
355 {
356 interval.call_crossings.len() == 1
357 && interval.call_crossings[0] > real_start
358 && interval.call_crossings[0] < real_end
359 } else {
360 false
361 };
362 // Find a caller-saved that survives the pre-half range
363 // unclobbered. Reject any register P such that an
364 // instruction inside (interval.start, call_pos - 1] writes
365 // P — by the time the bridge str fires (right above the
366 // arg-setup block), P would no longer hold the pre-half's
367 // value and the bridge would capture garbage.
368 let safe_pre_phys = if !free_caller.is_empty() && interval.call_crossings.len() == 1 {
369 let cp = interval.call_crossings[0];
370 let pre_end = cp.saturating_sub(1);
371 free_caller.iter().rposition(|&r| {
372 !phys_written_in(r, is_fp, interval.start, pre_end)
373 })
374 } else {
375 None
376 };
377 if let (None, true, true, Some(safe_idx)) =
378 (reg_opt, interval.crosses_call, real_crosses, safe_pre_phys)
379 {
380 let call_pos = interval.call_crossings[0];
381 let bridge_slot = mf.alloc_local(spill_slot_size(interval.class));
382 let synthetic = mf.new_vreg(interval.class);
383 splits_in_progress.insert(synthetic, (interval.vreg, call_pos, bridge_slot));
384 // Pull the safe register out of the caller-saved pool
385 // and assign it directly: re-running through the normal
386 // assignment path could pick a different register that
387 // *isn't* phys-write-safe (free_caller.pop() doesn't
388 // filter), which would defeat the whole point.
389 let safe_reg = free_caller.remove(safe_idx);
390 let pre_phys = if is_fp {
391 match interval.class {
392 RegClass::Fp32 => PhysReg::Fp32(safe_reg),
393 _ => PhysReg::Fp(safe_reg),
394 }
395 } else {
396 match interval.class {
397 RegClass::Gp32 => PhysReg::Gp32(safe_reg),
398 _ => PhysReg::Gp(safe_reg),
399 }
400 };
401 assignments.insert(interval.vreg, pre_phys);
402 // Pre-half ends before the call — track so this slot
403 // becomes free for any post-call interval that needs a
404 // caller-saved.
405 active.push((safe_reg, call_pos.saturating_sub(1), interval.vreg));
406 active.sort_by_key(|&(_, end, _)| end);
407
408 // Post-half: a fresh synthetic vreg covering the
409 // post-call range. Single original crossing → no
410 // crossings remain; cannot recurse-split.
411 let post_half = super::liveness::LiveInterval {
412 vreg: synthetic,
413 class: interval.class,
414 start: call_pos.saturating_add(1),
415 end: interval.end,
416 crosses_call: false,
417 call_crossings: Vec::new(),
418 hint: None,
419 };
420 // Insert post-half at its sorted position.
421 let post_start = post_half.start;
422 let post_vreg = post_half.vreg.0;
423 let mut insert_at = idx + 1;
424 while insert_at < worklist.len() {
425 let w = &worklist[insert_at];
426 if (w.start, w.vreg.0) > (post_start, post_vreg) {
427 break;
428 }
429 insert_at += 1;
430 }
431 worklist.insert(insert_at, post_half);
432 idx += 1;
433 continue;
434 }
435
436 if let Some(reg) = reg_opt {
437 // Register available.
438 let phys = if is_fp {
439 match interval.class {
440 RegClass::Fp32 => PhysReg::Fp32(reg),
441 _ => PhysReg::Fp(reg),
442 }
443 } else {
444 match interval.class {
445 RegClass::Gp32 => PhysReg::Gp32(reg),
446 _ => PhysReg::Gp(reg),
447 }
448 };
449 assignments.insert(interval.vreg, phys);
450 active.push((reg, interval.end, interval.vreg));
451 active.sort_by_key(|&(_, end, _)| end);
452
453 // Track callee-saved usage.
454 if !is_fp && GP_CALLEE_SAVED.contains(&reg) {
455 callee_saved_used.insert(PhysReg::Gp(reg));
456 }
457 if is_fp && FP_CALLEE_SAVED.contains(&reg) {
458 callee_saved_used.insert(PhysReg::Fp(reg));
459 }
460 } else {
461 // No register available — spill. Find the active
462 // interval ending furthest. `active` is kept sorted by
463 // end (see the `active.sort_by_key(...)` calls above
464 // and below), so the last entry is always the furthest.
465 // The active list carries the current vreg directly so
466 // we don't have to scan the (non-deterministic,
467 // stale-entry-laden) assignments map for the victim.
468 if let Some(&(spill_reg, spill_end, victim)) = active.last() {
469 let last_idx = active.len() - 1;
470 if spill_end > interval.end {
471 // Spill the furthest active interval, give its register to us.
472 // The victim's class is what determines its slot size — not
473 // ours, since we're taking the victim's physreg.
474 let victim_class = mf
475 .vregs
476 .iter()
477 .find(|v| v.id == victim)
478 .map(|v| v.class)
479 .unwrap_or(RegClass::Gp64);
480 let offset = mf.alloc_local(spill_slot_size(victim_class));
481 spills.insert(victim, offset);
482 assignments.remove(&victim);
483
484 let phys = if is_fp {
485 match interval.class {
486 RegClass::Fp32 => PhysReg::Fp32(spill_reg),
487 _ => PhysReg::Fp(spill_reg),
488 }
489 } else {
490 match interval.class {
491 RegClass::Gp32 => PhysReg::Gp32(spill_reg),
492 _ => PhysReg::Gp(spill_reg),
493 }
494 };
495 assignments.insert(interval.vreg, phys);
496 active.remove(last_idx);
497 active.push((spill_reg, interval.end, interval.vreg));
498 active.sort_by_key(|&(_, end, _)| end);
499 } else {
500 // Current interval ends later — spill it. Use the
501 // current interval's class for slot sizing.
502 let offset = mf.alloc_local(spill_slot_size(interval.class));
503 spills.insert(interval.vreg, offset);
504 }
505 } else {
506 // No active intervals — shouldn't happen but spill to be safe.
507 let offset = mf.alloc_local(spill_slot_size(interval.class));
508 spills.insert(interval.vreg, offset);
509 }
510 }
511 idx += 1;
512 }
513
514 // Materialize SplitRecords from the in-progress map. Each
515 // synthetic post-half vreg either landed on a phys (the win
516 // case) or was spilled (the fall-back). Either way we have a
517 // bridge slot that links the two halves across the call.
518 //
519 // Build a position→call-index map from the BL/BLR walk so
520 // we can attach a stable ordinal to each split. `liveness`
521 // already enumerated calls in walk order, so its
522 // `call_positions` would do — but liveness exposes only
523 // intervals. Re-derive locally; the list of call
524 // positions is small and cheap to rebuild.
525 let call_positions_walk: Vec<u32> = {
526 let mut p: u32 = 0;
527 let mut cps = Vec::new();
528 for block in &mf.blocks {
529 for inst in &block.insts {
530 if matches!(inst.opcode, ArmOpcode::Bl | ArmOpcode::Blr) {
531 cps.push(p);
532 }
533 p += 2;
534 }
535 }
536 cps
537 };
538 let mut split_records: Vec<SplitRecord> = splits_in_progress
539 .into_iter()
540 .filter_map(|(synthetic, (orig, call_pos, bridge_slot))| {
541 let pre_phys = *assignments.get(&orig)?;
542 let post = if let Some(&p) = assignments.get(&synthetic) {
543 PostHalf::Allocated(p)
544 } else if spills.contains_key(&synthetic) {
545 PostHalf::Spilled
546 } else {
547 // No allocation either way — shouldn't happen, but
548 // skip the record rather than emit garbage.
549 return None;
550 };
551 assignments.remove(&synthetic);
552 spills.remove(&synthetic);
553 let call_index = call_positions_walk.binary_search(&call_pos).ok()? as u32;
554 Some(SplitRecord {
555 vreg: orig,
556 call_position: call_pos,
557 call_index,
558 pre_phys,
559 post,
560 bridge_slot,
561 })
562 })
563 .collect();
564 split_records.sort_by_key(|r| (r.vreg.0, r.call_position));
565
566 // Sort callee-saved for consistent prologue/epilogue ordering.
567 let mut callee_saved: Vec<PhysReg> = callee_saved_used.into_iter().collect();
568 callee_saved.sort_by_key(|r| match r {
569 PhysReg::Gp(n) | PhysReg::Gp32(n) => *n as u16,
570 PhysReg::Fp(n) | PhysReg::Fp32(n) => 100 + *n as u16,
571 _ => 200,
572 });
573
574 AllocResult {
575 assignments,
576 spills,
577 split_records,
578 callee_saved_used: callee_saved,
579 }
580 }
581
582 /// Position-resolved assignment for a vreg. A vreg that wasn't
583 /// split has the same assignment everywhere; a split vreg's
584 /// assignment depends on whether the use is before or after the
585 /// call boundary, and the post-call assignment may itself be a
586 /// memory slot (the spilled-post fall-back).
587 enum LogicalAssignment {
588 Reg(PhysReg),
589 Slot(i32),
590 }
591
592 /// Apply allocation result: rewrite VReg operands to PhysReg, insert spill code.
593 /// For spilled operands, borrows a register that is provably free at this
594 /// instruction's position (drawn from the allocator's own assignments), falling
595 /// back to the dedicated spill scratches (x9-x11 / d29-d31) when none is free.
596 pub fn apply_allocation(
597 mf: &mut MachineFunction,
598 result: &AllocResult,
599 liveness: &super::liveness::LivenessResult,
600 ) {
601 let vreg_classes: HashMap<VRegId, RegClass> =
602 mf.vregs.iter().map(|v| (v.id, v.class)).collect();
603
604 // Build interval lookup: for each vreg, its live range.
605 let intervals: HashMap<VRegId, (u32, u32)> = liveness
606 .intervals
607 .iter()
608 .map(|i| (i.vreg, (i.start, i.end)))
609 .collect();
610
611 // Compute instruction positions for each block/instruction.
612 let mut inst_pos: HashMap<(usize, usize), u32> = HashMap::new();
613 let mut pos: u32 = 0;
614 for (bi, block) in mf.blocks.iter().enumerate() {
615 for (ii, _) in block.insts.iter().enumerate() {
616 inst_pos.insert((bi, ii), pos);
617 pos += 2;
618 }
619 }
620
621 // Position-aware lookup: returns the logical assignment of
622 // `vid` at `cur_pos`, consulting split_records first so that
623 // post-call uses pick up the post-half's location instead of
624 // the pre-half's (now-clobbered) caller-saved register.
625 let logical_assignment = |vid: VRegId, cur_pos: u32| -> Option<LogicalAssignment> {
626 for r in &result.split_records {
627 if r.vreg != vid {
628 continue;
629 }
630 if cur_pos > r.call_position {
631 return Some(match r.post {
632 PostHalf::Allocated(p) => LogicalAssignment::Reg(p),
633 PostHalf::Spilled => LogicalAssignment::Slot(r.bridge_slot),
634 });
635 }
636 return Some(LogicalAssignment::Reg(r.pre_phys));
637 }
638 if let Some(p) = result.assignments.get(&vid) {
639 Some(LogicalAssignment::Reg(*p))
640 } else {
641 result.spills.get(&vid).copied().map(LogicalAssignment::Slot)
642 }
643 };
644
645 for block_idx in 0..mf.blocks.len() {
646 let mut new_insts = Vec::new();
647 let insts = std::mem::take(&mut mf.blocks[block_idx].insts);
648
649 for (inst_idx, inst) in insts.iter().enumerate() {
650 let mut rewritten = inst.clone();
651 let cur_pos = inst_pos.get(&(block_idx, inst_idx)).copied().unwrap_or(0);
652
653 // Find GP registers NOT in use at this position. A
654 // vreg's register is in use during its full live range
655 // unless the vreg was split — in which case the
656 // pre-half occupies pre_phys during [start,
657 // call_position] and the post-half occupies post_phys
658 // during (call_position, end].
659 //
660 // Iterate `result.assignments` in **deterministic vreg
661 // order**, not raw HashMap iteration order — the latter
662 // varies between runs and produces different temp-reg
663 // assignments and therefore non-reproducible builds. The
664 // resulting `gp_temps`/`fp_temps` lists are consumed by
665 // index, so their order is load-bearing.
666 let mut gp_temps: Vec<u8> = Vec::new();
667 let mut fp_temps: Vec<u8> = Vec::new();
668 let mut sorted_assignments: Vec<(VRegId, PhysReg)> =
669 result.assignments.iter().map(|(&v, &p)| (v, p)).collect();
670 sorted_assignments.sort_by_key(|(v, _)| v.0);
671 // Build the set of phys regs in use at cur_pos. For
672 // split vregs, pre_phys is in use up to call_position,
673 // post_phys (if Allocated) takes over after.
674 let mut used_gp: HashSet<u8> = HashSet::new();
675 let mut used_fp: HashSet<u8> = HashSet::new();
676 let mark_used = |phys: PhysReg, gp: &mut HashSet<u8>, fp: &mut HashSet<u8>| match phys {
677 PhysReg::Gp(n) | PhysReg::Gp32(n) => {
678 gp.insert(n);
679 }
680 PhysReg::Fp(n) | PhysReg::Fp32(n) => {
681 fp.insert(n);
682 }
683 _ => {}
684 };
685 for (vreg, phys) in &sorted_assignments {
686 if let Some(&(start, end)) = intervals.get(vreg) {
687 if cur_pos < start || cur_pos > end {
688 continue;
689 }
690 // Split vregs: pre_phys covers [start,
691 // call_position]; post_phys covers
692 // (call_position, end]. assignments[vreg] is
693 // the pre_phys (we removed the synthetic).
694 let split = result.split_records.iter().find(|r| r.vreg == *vreg);
695 if let Some(rec) = split {
696 if cur_pos <= rec.call_position {
697 mark_used(rec.pre_phys, &mut used_gp, &mut used_fp);
698 }
699 if let PostHalf::Allocated(p) = rec.post {
700 if cur_pos > rec.call_position {
701 mark_used(p, &mut used_gp, &mut used_fp);
702 }
703 }
704 } else {
705 mark_used(*phys, &mut used_gp, &mut used_fp);
706 }
707 }
708 }
709 for (_, phys) in &sorted_assignments {
710 match phys {
711 PhysReg::Gp(n) | PhysReg::Gp32(n)
712 if !used_gp.contains(n) && !gp_temps.contains(n) =>
713 {
714 gp_temps.push(*n);
715 }
716 PhysReg::Fp(n) | PhysReg::Fp32(n)
717 if !used_fp.contains(n) && !fp_temps.contains(n) =>
718 {
719 fp_temps.push(*n);
720 }
721 _ => {}
722 }
723 }
724 // post_phys registers from split records are also
725 // candidates for temp use when *they're* not in use.
726 for r in &result.split_records {
727 if let PostHalf::Allocated(p) = r.post {
728 let in_use = match p {
729 PhysReg::Gp(n) | PhysReg::Gp32(n) => used_gp.contains(&n),
730 PhysReg::Fp(n) | PhysReg::Fp32(n) => used_fp.contains(&n),
731 _ => true,
732 };
733 if !in_use {
734 match p {
735 PhysReg::Gp(n) | PhysReg::Gp32(n) if !gp_temps.contains(&n) => {
736 gp_temps.push(n);
737 }
738 PhysReg::Fp(n) | PhysReg::Fp32(n) if !fp_temps.contains(&n) => {
739 fp_temps.push(n);
740 }
741 _ => {}
742 }
743 }
744 }
745 }
746 // Also append the dedicated fallback scratches in case no free regs are found.
747 for &s in &GP_SPILL_SCRATCH {
748 if !gp_temps.contains(&s) {
749 gp_temps.push(s);
750 }
751 }
752 for &s in &FP_SPILL_SCRATCH {
753 if !fp_temps.contains(&s) {
754 fp_temps.push(s);
755 }
756 }
757
758 // For spilled (or post-spilled-after-split) vreg uses:
759 // borrow a temp register and load from the slot.
760 let mut loads = Vec::new();
761 let mut gp_temp_idx = 0usize;
762 let mut fp_temp_idx = 0usize;
763 for (i, op) in inst.operands.iter().enumerate() {
764 if let MachineOperand::VReg(vid) = op {
765 if let Some(LogicalAssignment::Slot(offset)) =
766 logical_assignment(*vid, cur_pos)
767 {
768 let class = vreg_classes.get(vid).copied().unwrap_or(RegClass::Gp64);
769 let (temp_reg, load_op) = match class {
770 RegClass::Fp32 => {
771 let r = fp_temps
772 .get(fp_temp_idx)
773 .copied()
774 .unwrap_or(FP_SPILL_SCRATCH[0]);
775 fp_temp_idx += 1;
776 (PhysReg::Fp32(r), ArmOpcode::LdrFpImm)
777 }
778 RegClass::Fp64 => {
779 let r = fp_temps
780 .get(fp_temp_idx)
781 .copied()
782 .unwrap_or(FP_SPILL_SCRATCH[0]);
783 fp_temp_idx += 1;
784 (PhysReg::Fp(r), ArmOpcode::LdrFpImm)
785 }
786 RegClass::V128 => {
787 let r = fp_temps
788 .get(fp_temp_idx)
789 .copied()
790 .unwrap_or(FP_SPILL_SCRATCH[0]);
791 fp_temp_idx += 1;
792 (PhysReg::Fp(r), ArmOpcode::LdrQ)
793 }
794 RegClass::Gp32 => {
795 let r = gp_temps
796 .get(gp_temp_idx)
797 .copied()
798 .unwrap_or(GP_SPILL_SCRATCH[0]);
799 gp_temp_idx += 1;
800 (PhysReg::Gp32(r), ArmOpcode::LdrImm)
801 }
802 RegClass::Gp64 => {
803 let r = gp_temps
804 .get(gp_temp_idx)
805 .copied()
806 .unwrap_or(GP_SPILL_SCRATCH[0]);
807 gp_temp_idx += 1;
808 (PhysReg::Gp(r), ArmOpcode::LdrImm)
809 }
810 };
811 loads.push((i, temp_reg, load_op, offset));
812 }
813 }
814 }
815
816 for (op_idx, scratch, load_op, offset) in &loads {
817 if *op_idx == 0
818 && inst
819 .def
820 .as_ref()
821 .map(|d| {
822 matches!(
823 logical_assignment(*d, cur_pos),
824 Some(LogicalAssignment::Slot(_))
825 )
826 })
827 .unwrap_or(false)
828 {
829 continue;
830 }
831 emit_spill_access(&mut new_insts, *load_op, *scratch, *offset as i64);
832 rewritten.operands[*op_idx] = MachineOperand::PhysReg(*scratch);
833 }
834
835 // Rewrite assigned vregs to physical registers using
836 // the position-aware lookup so split vregs pick up the
837 // correct half.
838 for op in &mut rewritten.operands {
839 if let MachineOperand::VReg(vid) = op {
840 if let Some(LogicalAssignment::Reg(phys)) =
841 logical_assignment(*vid, cur_pos)
842 {
843 *op = MachineOperand::PhysReg(phys);
844 }
845 }
846 }
847
848 // Handle def — use a temp that doesn't alias any input temp.
849 let def_temp_idx = gp_temp_idx.max(fp_temp_idx);
850 let def_spill = if let Some(def_vid) = &inst.def {
851 if let Some(LogicalAssignment::Slot(offset)) =
852 logical_assignment(*def_vid, cur_pos)
853 {
854 let class = vreg_classes.get(def_vid).copied().unwrap_or(RegClass::Gp64);
855 let temp_reg = match class {
856 RegClass::Fp32 => {
857 let r = fp_temps
858 .get(def_temp_idx)
859 .copied()
860 .unwrap_or(FP_SPILL_SCRATCH[0]);
861 PhysReg::Fp32(r)
862 }
863 RegClass::Fp64 | RegClass::V128 => {
864 let r = fp_temps
865 .get(def_temp_idx)
866 .copied()
867 .unwrap_or(FP_SPILL_SCRATCH[0]);
868 PhysReg::Fp(r)
869 }
870 RegClass::Gp32 => {
871 let r = gp_temps
872 .get(def_temp_idx)
873 .copied()
874 .unwrap_or(GP_SPILL_SCRATCH[0]);
875 PhysReg::Gp32(r)
876 }
877 RegClass::Gp64 => {
878 let r = gp_temps
879 .get(def_temp_idx)
880 .copied()
881 .unwrap_or(GP_SPILL_SCRATCH[0]);
882 PhysReg::Gp(r)
883 }
884 };
885 let scratch = temp_reg;
886 // Replace def operand with scratch.
887 if let Some(MachineOperand::VReg(vid)) = rewritten.operands.first() {
888 if vid == def_vid {
889 rewritten.operands[0] = MachineOperand::PhysReg(scratch);
890 }
891 }
892 Some((scratch, offset, class))
893 } else {
894 None
895 }
896 } else {
897 None
898 };
899
900 // Bridge instructions for live-range splits are
901 // emitted by `insert_split_bridges`, which runs *after*
902 // `parallelize_call_arg_moves` so the canonical
903 // arg-setup mov sequence isn't fragmented by an
904 // interposed store and the parallel-copy resolver can
905 // see all the arg-setup movs as one block.
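// Operands (and any spilled def) have been rewritten to physical
// registers above, so the vreg `def` marker no longer applies;
// drop it rather than leave a stale VRegId behind.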
906 rewritten.def = None;
907 new_insts.push(rewritten);
908
909 // Store after def for spilled vregs.
910 if let Some((scratch, offset, class)) = def_spill {
911 let store_op = match class {
912 RegClass::Fp32 | RegClass::Fp64 => ArmOpcode::StrFpImm,
913 RegClass::V128 => ArmOpcode::StrQ,
914 _ => ArmOpcode::StrImm,
915 };
916 emit_spill_access(&mut new_insts, store_op, scratch, offset as i64);
917 }
918 }
919
920 mf.blocks[block_idx].insts = new_insts;
921 }
922 }
923
924 /// Emit a spill load or store, materializing the FP-relative
925 /// address through `x8` when the offset is out of the LDR/STR
926 /// immediate's reach. ARM64's LDUR / STUR (used as a fallback for
927 /// negative immediates) only encode signed 9-bit offsets in
928 /// `[-256, 255]`. For wider frames we substitute
929 /// `sub x8, x29, #|offset|; ldr/str rt, [x8, #0]`. Positive
930 /// offsets outside that range get the symmetric `add` treatment.
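///
/// For example (register and offset chosen purely for illustration), a GP
/// reload from offset -392 falls outside the LDUR range and becomes:
///
/// ```text
/// sub x8, x29, #392
/// ldr x12, [x8, #0]
/// ```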
931 fn emit_spill_access(
932 insts: &mut Vec<MachineInst>,
933 op: ArmOpcode,
934 rt: PhysReg,
935 offset: i64,
936 ) {
937 // Most spill ops encode comfortably with FP-relative immediates.
938 // Only fall back to address materialization when out of LDUR
939 // range. (Positive offsets up to 65520 step 16 are handled by
940 // LDR's scaled form; negatives must use LDUR; we materialize
941 // anything outside the union.)
942 let in_range = (-256..=255).contains(&offset);
943 if in_range {
944 insts.push(MachineInst {
945 opcode: op,
946 operands: vec![
947 MachineOperand::PhysReg(rt),
948 MachineOperand::PhysReg(PhysReg::FP),
949 MachineOperand::Imm(offset),
950 ],
951 def: None,
952 });
953 return;
954 }
955 let addr = PhysReg::Gp(8);
956 let (mat_op, abs) = if offset >= 0 {
957 (ArmOpcode::AddImm, offset)
958 } else {
959 (ArmOpcode::SubImm, -offset)
960 };
961 insts.push(MachineInst {
962 opcode: mat_op,
963 operands: vec![
964 MachineOperand::PhysReg(addr),
965 MachineOperand::PhysReg(PhysReg::FP),
966 MachineOperand::Imm(abs),
967 ],
968 def: None,
969 });
970 insts.push(MachineInst {
971 opcode: op,
972 operands: vec![
973 MachineOperand::PhysReg(rt),
974 MachineOperand::PhysReg(addr),
975 MachineOperand::Imm(0),
976 ],
977 def: None,
978 });
979 }
980
981 /// Insert callee-saved register saves in prologue and restores in epilogue.
982 /// Must be called after apply_allocation so we know which callee-saved regs were used.
983 pub fn insert_callee_saves(mf: &mut MachineFunction, callee_saved: &[PhysReg]) {
984 if callee_saved.is_empty() {
985 return;
986 }
987
988 // Allocate stack slots for callee-saved registers.
989 let mut save_slots: Vec<(PhysReg, i32)> = Vec::new();
990 for &reg in callee_saved {
991 let offset = mf.frame.alloc_local(8);
992 save_slots.push((reg, offset));
993 }
994
995 // Insert saves at the start of the entry block (after prologue setup).
996 // Find the insertion point: after the STP + ADD (prologue) instructions.
997 let prologue_end = mf.blocks[0]
998 .insts
999 .iter()
1000 .position(|i| {
1001 // The prologue is StpPre followed by AddImm. Insert after those.
1002 !matches!(i.opcode, ArmOpcode::StpPre | ArmOpcode::AddImm)
1003 })
1004 .unwrap_or(mf.blocks[0].insts.len());
1005
1006 // Emit saves, pairing consecutive same-class registers into STP.
1007 // save_slots are ordered by increasing register number; slots are
1008 // allocated at decreasing offsets (-8, -16, -24, ...). Adjacent
1009 // slots differ by 8 bytes — exactly the STP stride for GP64/FP64.
1010 //
1011 // STP Xt1, Xt2, [Xn, #off]: stores Xt1 at Xn+off, Xt2 at Xn+off+8.
1012 // So to store reg_low (at lower_offset) and reg_high (at lower_offset+8):
1013 // STP reg_low, reg_high, [FP, #lower_offset]
1014 let saves = emit_callee_store_pairs(&save_slots, false);
1015 for (i, save) in saves.into_iter().enumerate() {
1016 mf.blocks[0].insts.insert(prologue_end + i, save);
1017 }
1018
1019 // Insert restores before every epilogue sequence (LdpPost).
1020 for block in &mut mf.blocks {
1021 let mut insertions = Vec::new();
1022 for (i, inst) in block.insts.iter().enumerate() {
1023 if inst.opcode == ArmOpcode::LdpPost {
1024 insertions.push(i);
1025 }
1026 }
1027 for &ldp_idx in insertions.iter().rev() {
1028 // Restores in reverse order (mirror of saves).
1029 let restores = emit_callee_store_pairs(&save_slots, true);
1030 for restore in restores.into_iter().rev() {
1031 block.insts.insert(ldp_idx, restore);
1032 }
1033 }
1034 }
1035 }
1036
1037 /// Build the save (or restore) instruction list for a set of callee-saved
1038 /// slots, pairing consecutive adjacent GP/FP slots into STP/LDP.
1039 ///
1040 /// `restore` = true → emit LDP/LDR (load); false → STP/STR (store).
1041 ///
1042 /// Pairing rule: slots[i] and slots[i+1] are paired when:
1043 /// * Both are GP or both are FP (same register class)
1044 /// * slots[i+1].offset == slots[i].offset - 8 (adjacent 8-byte slots)
1045 ///
1046 /// STP Xt1, Xt2, [Xn, #off] → Xt1 at off, Xt2 at off+8.
1047 /// We pair as: STP slots[i+1].reg, slots[i].reg, [FP, #slots[i+1].offset]
1048 /// because slots[i+1] has the lower (more negative) offset.
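///
/// For example (a hypothetical layout), `save_slots = [(x19, -8), (x20, -16)]`
/// pairs into `stp x20, x19, [x29, #-16]` on the save path and the matching
/// `ldp x20, x19, [x29, #-16]` on restore.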
1049 fn emit_callee_store_pairs(save_slots: &[(PhysReg, i32)], restore: bool) -> Vec<MachineInst> {
1050 let mut result = Vec::new();
1051 let mut i = 0;
1052 while i < save_slots.len() {
1053 let (reg1, off1) = save_slots[i];
1054 // Try to pair with next slot.
1055 if i + 1 < save_slots.len() {
1056 let (reg2, off2) = save_slots[i + 1];
1057 let is_gp1 = !matches!(reg1, PhysReg::Fp(_) | PhysReg::Fp32(_));
1058 let is_gp2 = !matches!(reg2, PhysReg::Fp(_) | PhysReg::Fp32(_));
1059 let same_class = is_gp1 == is_gp2;
1060 // off1 is the higher slot (less negative); off2 = off1 - 8.
1061 let adjacent = off2 == off1 - 8;
1062 // STP offset must fit in 7-bit signed × 8: inclusive range [-512, 504].
1063 let in_range = (-512..=504).contains(&off2);
1064 if same_class && adjacent && in_range {
1065 let opcode = if restore {
1066 ArmOpcode::LdpOffset
1067 } else {
1068 ArmOpcode::StpOffset
1069 };
1070 let (low_reg, high_reg) = (reg2, reg1);
1071 result.push(MachineInst {
1072 opcode,
1073 operands: vec![
1074 MachineOperand::PhysReg(low_reg),
1075 MachineOperand::PhysReg(high_reg),
1076 MachineOperand::PhysReg(PhysReg::FP),
1077 MachineOperand::Imm(off2 as i64),
1078 ],
1079 def: None,
1080 });
1081 i += 2;
1082 continue;
1083 }
1084 }
1085 // Emit individual STR/LDR.
1086 let op = match reg1 {
1087 PhysReg::Fp(_) | PhysReg::Fp32(_) => {
1088 if restore {
1089 ArmOpcode::LdrFpImm
1090 } else {
1091 ArmOpcode::StrFpImm
1092 }
1093 }
1094 _ => {
1095 if restore {
1096 ArmOpcode::LdrImm
1097 } else {
1098 ArmOpcode::StrImm
1099 }
1100 }
1101 };
1103 result.push(MachineInst {
1104 opcode: op,
1105 operands: vec![
1106 MachineOperand::PhysReg(reg1),
1107 MachineOperand::PhysReg(PhysReg::FP),
1108 MachineOperand::Imm(off1 as i64),
1109 ],
1110 def: if restore {
1111 Some(match reg1 {
1112 PhysReg::Gp(n) | PhysReg::Gp32(n) => crate::codegen::mir::VRegId(n.into()),
1113 PhysReg::Fp(n) | PhysReg::Fp32(n) => crate::codegen::mir::VRegId(n.into()),
1114 _ => crate::codegen::mir::VRegId(0),
1115 })
1116 } else {
1117 None
1118 },
1119 });
1120 i += 1;
1121 }
1122 result
1123 }
1124
1125 /// Basic move coalescing: eliminate mov instructions where src == dest.
1126 pub fn coalesce_moves(mf: &mut MachineFunction) {
1127 for block in &mut mf.blocks {
1128 block.insts.retain(|inst| {
1129 if matches!(inst.opcode, ArmOpcode::MovReg | ArmOpcode::FmovReg)
1130 && inst.operands.len() == 2
1131 && inst.operands[0] == inst.operands[1]
1132 {
1133 return false; // eliminate self-move
1134 }
1135 true
1136 });
1137 }
1138 }
1139
1140 /// Emit the str/ldr pairs that bridge live-range splits across
1141 /// each call boundary. Runs *after* `parallelize_call_arg_moves`
1142 /// so the parallel-copy resolver sees an unfragmented arg-setup
1143 /// sequence (an interposed `str` between an arg-copy and the BL
1144 /// would otherwise stop the scan-back at the wrong point and the
1145 /// resolver would miss arg-copies that need cycle-breaking).
1146 ///
1147 /// For each `SplitRecord` whose `call_position` equals the
1148 /// position of a `Bl`/`Blr` in the function: insert a `str
1149 /// pre_phys, [fp, bridge_slot]` immediately before the call and,
1150 /// if the post-half got its own register, an `ldr post_phys, [fp,
1151 /// bridge_slot]` immediately after. Spilled post-halves rely on
1152 /// the existing per-use spill-load path, which already loads from
1153 /// `bridge_slot` thanks to `apply_allocation`'s position-aware
1154 /// operand rewrite.
1155 pub fn insert_split_bridges(mf: &mut MachineFunction, splits: &[SplitRecord]) {
1156 if splits.is_empty() {
1157 return;
1158 }
1159 let mut call_seen: u32 = 0;
1160 for block_idx in 0..mf.blocks.len() {
1161 let mut new_insts = Vec::with_capacity(mf.blocks[block_idx].insts.len());
1162 let original = std::mem::take(&mut mf.blocks[block_idx].insts);
1163 for inst in original.into_iter() {
1164 let is_call = matches!(inst.opcode, ArmOpcode::Bl | ArmOpcode::Blr);
1165 if is_call {
1166 let idx = call_seen;
1167 // Place bridge `str pre_phys, [fp, slot]` *above* the
1168 // canonical arg-setup mov sequence. Arg-setup movs
1169 // can write to caller-saved physregs that are also
1170 // some split's `pre_phys` — by the time we reach the
1171 // BL, those registers no longer hold the pre-half's
1172 // value. Scan back over arg-setup movs and insert
1173 // bridges at the top of that block.
1174 let mut insert_at = new_insts.len();
1175 while insert_at > 0 && is_call_arg_copy(&new_insts[insert_at - 1]) {
1176 insert_at -= 1;
1177 }
1178 let mut bridges: Vec<MachineInst> = Vec::new();
1179 for r in splits {
1180 if r.call_index != idx {
1181 continue;
1182 }
1183 let store_op = match r.pre_phys {
1184 PhysReg::Fp(_) | PhysReg::Fp32(_) => ArmOpcode::StrFpImm,
1185 _ => ArmOpcode::StrImm,
1186 };
1187 bridges.push(MachineInst {
1188 opcode: store_op,
1189 operands: vec![
1190 MachineOperand::PhysReg(r.pre_phys),
1191 MachineOperand::PhysReg(PhysReg::FP),
1192 MachineOperand::Imm(r.bridge_slot as i64),
1193 ],
1194 def: None,
1195 });
1196 }
1197 for (offset, b) in bridges.into_iter().enumerate() {
1198 new_insts.insert(insert_at + offset, b);
1199 }
1200 }
1201 new_insts.push(inst);
1202 if is_call {
1203 let idx = call_seen;
1204 for r in splits {
1205 if r.call_index != idx {
1206 continue;
1207 }
1208 if let PostHalf::Allocated(p) = r.post {
1209 let load_op = match p {
1210 PhysReg::Fp(_) | PhysReg::Fp32(_) => ArmOpcode::LdrFpImm,
1211 _ => ArmOpcode::LdrImm,
1212 };
1213 new_insts.push(MachineInst {
1214 opcode: load_op,
1215 operands: vec![
1216 MachineOperand::PhysReg(p),
1217 MachineOperand::PhysReg(PhysReg::FP),
1218 MachineOperand::Imm(r.bridge_slot as i64),
1219 ],
1220 def: None,
1221 });
1222 }
1223 }
1224 call_seen += 1;
1225 }
1226 }
1227 mf.blocks[block_idx].insts = new_insts;
1228 }
1229 }
1230
1231 /// Resolve the entry-block argument-receipt parallel copy. The
1232 /// function prologue copies incoming arg registers (x0..x7,
1233 /// d0..d7) into the registers that the allocator chose for the
1234 /// parameter vregs. Emitted naively this is a sequential `mov`
1235 /// chain, which corrupts values whenever a destination aliases a
1236 /// later source — e.g. `mov x4, x3; mov x3, x4` clobbers x4 before
1237 /// the second move can read it. Historically this was hidden
1238 /// because parameter vregs landed in callee-saved registers
1239 /// (x19..x28) that never alias x0..x7; once the allocator started
1240 /// preferring caller-saved destinations, the latent hazard
1241 /// surfaced.
1242 ///
1243 /// Find the leading run of phys-to-phys movs in the entry block
1244 /// (after the frame setup) whose source is an incoming arg
1245 /// register, and run the same parallel-copy resolver
1246 /// (`rewrite_call_arg_copies`) used for pre-call argument setup.
1247 pub fn parallelize_entry_arg_moves(mf: &mut MachineFunction) {
1248 if mf.blocks.is_empty() {
1249 return;
1250 }
1251 let original = std::mem::take(&mut mf.blocks[0].insts);
1252 let mut rebuilt: Vec<MachineInst> = Vec::with_capacity(original.len());
1253 let mut iter = original.into_iter().peekable();
1254
1255 // Pass through the prologue (StpPre/AddImm/SubImm).
1256 while let Some(inst) = iter.peek() {
1257 if matches!(
1258 inst.opcode,
1259 ArmOpcode::StpPre | ArmOpcode::AddImm | ArmOpcode::SubImm
1260 ) {
1261 rebuilt.push(iter.next().unwrap());
1262 } else {
1263 break;
1264 }
1265 }
1266
1267 // Process arg-receipt parallel groups. Tolerate intervening
1268 // store instructions (which read but don't write registers in
1269 // the parallel-copy graph) so a result-address spill or
1270 // partial-receipt store doesn't fragment the receipts. Stop
1271 // once any instruction overwrites an incoming arg register —
1272 // beyond that point those values are no longer the original
1273 // arg values and parallel-copy resolution would be unsound.
1274 let mut pending: Vec<MachineInst> = Vec::new();
1275 loop {
1276 match iter.peek() {
1277 None => break,
1278 Some(inst) if is_arg_receipt_copy(inst) => {
1279 pending.push(iter.next().unwrap());
1280 }
1281 Some(inst) if is_transparent_to_arg_receipts(inst) => {
1282 if !pending.is_empty() {
1283 rebuilt.extend(rewrite_call_arg_copies(std::mem::take(&mut pending)));
1284 }
1285 rebuilt.push(iter.next().unwrap());
1286 }
1287 Some(_) => break,
1288 }
1289 }
1290 if !pending.is_empty() {
1291 rebuilt.extend(rewrite_call_arg_copies(pending));
1292 }
1293 rebuilt.extend(iter);
1294 mf.blocks[0].insts = rebuilt;
1295 }
1296
1297 /// An instruction is *transparent* to the arg-receipt parallel
1298 /// copy if executing it cannot change the value held in any
1299 /// incoming arg register (x0..x7 / d0..d7). Stores qualify (they
1300 /// only read), as does any compute whose destination lies outside
1301 /// the arg-reg range. Anything that writes into x0..x7 forms a
1302 /// barrier — the receipts that come after it are reading
1303 /// post-clobber values, so reordering across the barrier is
1304 /// unsound.
1305 fn is_transparent_to_arg_receipts(inst: &MachineInst) -> bool {
1306 if is_store_opcode(inst.opcode) {
1307 return true;
1308 }
1309 if is_branch_or_call_opcode(inst.opcode) {
1310 return false;
1311 }
1312 match inst.operands.first() {
1313 Some(MachineOperand::PhysReg(dst)) => !is_call_arg_reg(*dst),
1314 _ => true,
1315 }
1316 }
1317
1318 fn is_store_opcode(op: ArmOpcode) -> bool {
1319 matches!(
1320 op,
1321 ArmOpcode::StrImm
1322 | ArmOpcode::StrhImm
1323 | ArmOpcode::StrbImm
1324 | ArmOpcode::StrFpImm
1325 | ArmOpcode::StrReg
1326 | ArmOpcode::StrFpReg
1327 | ArmOpcode::StrQ
1328 | ArmOpcode::StpPre
1329 | ArmOpcode::StpOffset
1330 )
1331 }
1332
1333 fn is_branch_or_call_opcode(op: ArmOpcode) -> bool {
1334 matches!(
1335 op,
1336 ArmOpcode::B
1337 | ArmOpcode::BCond
1338 | ArmOpcode::Bl
1339 | ArmOpcode::Blr
1340 | ArmOpcode::Ret
1341 | ArmOpcode::Cbz
1342 | ArmOpcode::Cbnz
1343 | ArmOpcode::Tbz
1344 | ArmOpcode::Tbnz
1345 )
1346 }
1347
1348 /// Reorder physical argument-register copies immediately before calls so later
1349 /// sources are not clobbered by earlier destination writes.
1350 ///
1351 /// For an indirect call (`BLR Xn`), the target register `Xn` is a
1352 /// live use of the call. If `Xn` lies in the argument-register
1353 /// range (x0..x7) and one of the pending arg-copy moves writes to
1354 /// it, the function pointer would be clobbered before the branch.
1355 /// Save it to a non-arg scratch register (`x10`, distinct from
1356 /// `x9` that `rewrite_call_arg_copies` uses for cycle-breaking) and
1357 /// rewrite the BLR's operand. This was hidden as long as
1358 /// indirect-call targets landed in callee-saved registers (which
1359 /// can never alias x0..x7) but surfaces once non-call-crossing
1360 /// values prefer caller-saved.
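///
/// Sketch of the BLR-target rescue (registers invented for illustration),
/// where the target sits in `x2` and an arg-setup mov also writes `x2`:
///
/// ```text
/// // before:              // after:
/// mov x2, x5              mov x10, x2
/// blr x2                  mov x2, x5
///                         blr x10
/// ```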
1361 pub fn parallelize_call_arg_moves(mf: &mut MachineFunction) {
1362 for block in &mut mf.blocks {
1363 let mut rebuilt: Vec<MachineInst> = Vec::with_capacity(block.insts.len());
1364 for inst in std::mem::take(&mut block.insts) {
1365 if matches!(inst.opcode, ArmOpcode::Bl | ArmOpcode::Blr) {
1366 let mut start = rebuilt.len();
1367 while start > 0 && is_call_arg_copy(&rebuilt[start - 1]) {
1368 start -= 1;
1369 }
1370 let mut adjusted = inst;
1371 if matches!(adjusted.opcode, ArmOpcode::Blr) {
1372 let target_phys = match adjusted.operands.first() {
1373 Some(MachineOperand::PhysReg(t)) => Some(*t),
1374 _ => None,
1375 };
1376 if let Some(target) = target_phys {
1377 if is_call_arg_reg(target) {
1378 let target_alias = phys_reg_alias(target);
1379 let writes_target = rebuilt[start..].iter().any(|i| {
1380 matches!(i.operands.first(),
1381 Some(MachineOperand::PhysReg(d))
1382 if phys_reg_alias(*d) == target_alias)
1383 });
1384 if writes_target {
1385 let scratch = blr_target_scratch(target);
1386 let save = MachineInst {
1387 opcode: move_opcode_for_phys(target),
1388 operands: vec![
1389 MachineOperand::PhysReg(scratch),
1390 MachineOperand::PhysReg(target),
1391 ],
1392 def: None,
1393 };
1394 rebuilt.insert(start, save);
1395 adjusted.operands[0] = MachineOperand::PhysReg(scratch);
1396 start += 1;
1397 }
1398 }
1399 }
1400 }
1401 if start < rebuilt.len() {
1402 let pending = rebuilt.split_off(start);
1403 rebuilt.extend(rewrite_call_arg_copies(pending));
1404 }
1405 rebuilt.push(adjusted);
1406 } else {
1407 rebuilt.push(inst);
1408 }
1409 }
1410 block.insts = rebuilt;
1411 }
1412 }
1413
1414 /// Scratch used to preserve a BLR target across parallel arg
1415 /// setup. Distinct from the `x9` / `d29` used by
1416 /// `rewrite_call_arg_copies` for cycle-breaking so the two scratch
1417 /// uses never collide.
1418 fn blr_target_scratch(target: PhysReg) -> PhysReg {
1419 match target {
1420 PhysReg::Gp(_) => PhysReg::Gp(10),
1421 PhysReg::Gp32(_) => PhysReg::Gp32(10),
1422 _ => panic!("blr_target_scratch: BLR target is always a 64/32-bit GP register"),
1423 }
1424 }
1425
1426 // ---- Helpers ----
1427
1428 fn is_call_arg_copy(inst: &MachineInst) -> bool {
1429 matches!(inst.opcode, ArmOpcode::MovReg | ArmOpcode::FmovReg)
1430 && matches!(
1431 inst.operands.as_slice(),
1432 [MachineOperand::PhysReg(dst), MachineOperand::PhysReg(src)]
1433 if is_call_arg_reg(*dst) && !matches!(src, PhysReg::Xzr | PhysReg::Wzr)
1434 )
1435 }
1436
1437 /// Mirror of `is_call_arg_copy` for the function-entry direction:
1438 /// a phys-to-phys mov whose *source* is an incoming arg register.
1439 fn is_arg_receipt_copy(inst: &MachineInst) -> bool {
1440 matches!(inst.opcode, ArmOpcode::MovReg | ArmOpcode::FmovReg)
1441 && matches!(
1442 inst.operands.as_slice(),
1443 [MachineOperand::PhysReg(_), MachineOperand::PhysReg(src)]
1444 if is_call_arg_reg(*src)
1445 )
1446 }
1447
1448 fn is_call_arg_reg(reg: PhysReg) -> bool {
1449 match reg {
1450 PhysReg::Gp(n) | PhysReg::Gp32(n) => n < 8,
1451 PhysReg::Fp(n) | PhysReg::Fp32(n) => n < 8,
1452 _ => false,
1453 }
1454 }
1455
1456 #[derive(Clone, Copy, PartialEq, Eq)]
1457 enum PhysRegAlias {
1458 Gp(u8),
1459 Fp(u8),
1460 }
1461
1462 fn phys_reg_alias(reg: PhysReg) -> Option<PhysRegAlias> {
1463 match reg {
1464 PhysReg::Gp(n) | PhysReg::Gp32(n) => Some(PhysRegAlias::Gp(n)),
1465 PhysReg::Fp(n) | PhysReg::Fp32(n) => Some(PhysRegAlias::Fp(n)),
1466 _ => None,
1467 }
1468 }
1469
1470 fn scratch_phys_for(reg: PhysReg) -> PhysReg {
1471 match reg {
1472 PhysReg::Gp(_) => PhysReg::Gp(9),
1473 PhysReg::Gp32(_) => PhysReg::Gp32(9),
1474 PhysReg::Fp(_) => PhysReg::Fp(29),
1475 PhysReg::Fp32(_) => PhysReg::Fp32(29),
1476 _ => panic!("call-arg scratch requested for non-register operand"),
1477 }
1478 }
1479
1480 fn move_opcode_for_phys(reg: PhysReg) -> ArmOpcode {
1481 match reg {
1482 PhysReg::Fp(_) | PhysReg::Fp32(_) => ArmOpcode::FmovReg,
1483 PhysReg::Gp(_) | PhysReg::Gp32(_) => ArmOpcode::MovReg,
1484 _ => panic!("move opcode requested for non-register operand"),
1485 }
1486 }
1487
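/// Resolve a pending run of phys-to-phys arg copies as a parallel copy:
/// repeatedly emit any move whose destination is not another pending move's
/// source, and break cycles by routing one source through the per-class
/// scratch (x9 / d29). A two-register swap (registers invented for
/// illustration) resolves to:
///
/// ```text
/// // pending: x0 <- x1, x1 <- x0
/// mov x9, x1
/// mov x1, x0
/// mov x0, x9
/// ```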
1488 fn rewrite_call_arg_copies(pending_moves: Vec<MachineInst>) -> Vec<MachineInst> {
1489 let mut pending: Vec<(ArmOpcode, PhysReg, PhysReg)> = pending_moves
1490 .into_iter()
1491 .map(|inst| match inst.operands.as_slice() {
1492 [MachineOperand::PhysReg(dst), MachineOperand::PhysReg(src)] => {
1493 (inst.opcode, *dst, *src)
1494 }
1495 _ => panic!("call-arg copy rewrite saw unexpected operand shape"),
1496 })
1497 .collect();
1498 let mut rewritten = Vec::with_capacity(pending.len() + 1);
1499
1500 while !pending.is_empty() {
1501 let safe_idx = (0..pending.len()).find(|&i| {
1502 let (_, dst, _) = pending[i];
1503 let dst_alias = phys_reg_alias(dst).expect("call-arg copy dst should alias");
1504 !pending
1505 .iter()
1506 .enumerate()
1507 .any(|(j, &(_, _, src))| j != i && phys_reg_alias(src) == Some(dst_alias))
1508 });
1509
1510 if let Some(idx) = safe_idx {
1511 let (opcode, dst, src) = pending.remove(idx);
1512 rewritten.push(MachineInst {
1513 opcode,
1514 operands: vec![MachineOperand::PhysReg(dst), MachineOperand::PhysReg(src)],
1515 def: None,
1516 });
1517 continue;
1518 }
1519
1520 let (_, _, src) = pending[0];
1521 let scratch = scratch_phys_for(src);
1522 rewritten.push(MachineInst {
1523 opcode: move_opcode_for_phys(src),
1524 operands: vec![
1525 MachineOperand::PhysReg(scratch),
1526 MachineOperand::PhysReg(src),
1527 ],
1528 def: None,
1529 });
1530 pending[0].2 = scratch;
1531 }
1532
1533 rewritten
1534 }
1535
1536 fn expire_intervals(
1537 active: &mut Vec<(u8, u32, VRegId)>,
1538 free_caller: &mut Vec<u8>,
1539 free_callee: &mut Vec<u8>,
1540 callee_range: &std::ops::RangeInclusive<u8>,
1541 pos: u32,
1542 ) {
1543 let mut i = 0;
1544 while i < active.len() {
1545 if active[i].1 < pos {
1546 let (reg, _, _) = active.remove(i);
1547 if callee_range.contains(&reg) {
1548 free_callee.push(reg);
1549 } else {
1550 free_caller.push(reg);
1551 }
1552 } else {
1553 i += 1;
1554 }
1555 }
1556 }
1557
1558 /// Scratch registers for spill code. Multiple scratches needed when an
1559 /// instruction has multiple spilled operands.
1560 const GP_SPILL_SCRATCH: [u8; 3] = [9, 10, 11];
1561 const FP_SPILL_SCRATCH: [u8; 3] = [29, 30, 31];
1562
1563 fn spill_scratch(class: RegClass, idx: usize) -> (PhysReg, ArmOpcode) {
1564 match class {
1565 RegClass::Fp64 => (PhysReg::Fp(FP_SPILL_SCRATCH[idx % 3]), ArmOpcode::LdrFpImm),
1566 RegClass::Fp32 => (
1567 PhysReg::Fp32(FP_SPILL_SCRATCH[idx % 3]),
1568 ArmOpcode::LdrFpImm,
1569 ),
1570 // 128-bit vector spills/fills via LdrQ — same FP scratch
1571 // bank (the V registers ARE the 128-bit form of the same
1572 // physical D/S registers).
1573 RegClass::V128 => (PhysReg::Fp(FP_SPILL_SCRATCH[idx % 3]), ArmOpcode::LdrQ),
1574 RegClass::Gp32 => (PhysReg::Gp32(GP_SPILL_SCRATCH[idx % 3]), ArmOpcode::LdrImm),
1575 RegClass::Gp64 => (PhysReg::Gp(GP_SPILL_SCRATCH[idx % 3]), ArmOpcode::LdrImm),
1576 }
1577 }
1578
1579 #[cfg(test)]
1580 mod tests {
1581 use super::*;
1582 use crate::codegen::isel::select_function;
1583 use crate::ir::builder::FuncBuilder;
1584 use crate::ir::inst::*;
1585 use crate::ir::types::*;
1586
1587 #[test]
1588 fn linear_scan_assigns_registers() {
1589 let mut func = Function::new("test".into(), vec![], IrType::Void);
1590 {
1591 let mut b = FuncBuilder::new(&mut func);
1592 let x = b.const_i32(10);
1593 let y = b.const_i32(20);
1594 let _z = b.iadd(x, y);
1595 b.ret_void();
1596 }
1597 let mut mf = select_function(&func);
1598 let result = linear_scan(&mut mf);
1599 // Should have assignments for the vregs.
1600 assert!(
1601 !result.assignments.is_empty(),
1602 "should have register assignments"
1603 );
1604 // No spills needed for 3 vregs.
1605 assert!(
1606 result.spills.is_empty(),
1607 "should not spill with only 3 vregs"
1608 );
1609 }
1610
1611 #[test]
1612 fn linear_scan_no_x18() {
1613 let mut func = Function::new("test".into(), vec![], IrType::Void);
1614 {
1615 let mut b = FuncBuilder::new(&mut func);
1616 // Create many vregs to exercise the allocator.
1617 let mut vals = Vec::new();
1618 for i in 0..20 {
1619 vals.push(b.const_i32(i));
1620 }
1621 // Use them all.
1622 let mut sum = vals[0];
1623 for &v in &vals[1..] {
1624 sum = b.iadd(sum, v);
1625 }
1626 b.ret_void();
1627 }
1628 let mut mf = select_function(&func);
1629 let result = linear_scan(&mut mf);
1630 // x18 must never be assigned.
1631 for phys in result.assignments.values() {
1632 match phys {
1633 PhysReg::Gp(18) | PhysReg::Gp32(18) => {
1634 panic!("x18 assigned — platform reserved!");
1635 }
1636 _ => {}
1637 }
1638 }
1639 }
1640
1641 #[test]
1642 fn coalesce_eliminates_self_moves() {
1643 let mut mf = MachineFunction::new("test".into());
1644 let reg = PhysReg::Gp(9);
1645 mf.blocks[0].insts.push(MachineInst {
1646 opcode: ArmOpcode::MovReg,
1647 operands: vec![MachineOperand::PhysReg(reg), MachineOperand::PhysReg(reg)],
1648 def: None,
1649 });
1650 mf.blocks[0].insts.push(MachineInst {
1651 opcode: ArmOpcode::Ret,
1652 operands: vec![],
1653 def: None,
1654 });
1655 assert_eq!(mf.blocks[0].insts.len(), 2);
1656 coalesce_moves(&mut mf);
1657 assert_eq!(
1658 mf.blocks[0].insts.len(),
1659 1,
1660 "self-move should be eliminated"
1661 );
1662 }
1663
1664 #[test]
1665 fn parallelize_call_arg_moves_preserves_later_source_registers() {
1666 let mut mf = MachineFunction::new("test".into());
1667 mf.blocks[0].insts.push(MachineInst {
1668 opcode: ArmOpcode::MovReg,
1669 operands: vec![
1670 MachineOperand::PhysReg(PhysReg::Gp(0)),
1671 MachineOperand::PhysReg(PhysReg::Gp(28)),
1672 ],
1673 def: None,
1674 });
1675 mf.blocks[0].insts.push(MachineInst {
1676 opcode: ArmOpcode::MovReg,
1677 operands: vec![
1678 MachineOperand::PhysReg(PhysReg::Gp(1)),
1679 MachineOperand::PhysReg(PhysReg::Gp(0)),
1680 ],
1681 def: None,
1682 });
1683 mf.blocks[0].insts.push(MachineInst {
1684 opcode: ArmOpcode::Blr,
1685 operands: vec![MachineOperand::PhysReg(PhysReg::Gp(12))],
1686 def: None,
1687 });
1688
1689 parallelize_call_arg_moves(&mut mf);
1690
1691 assert_eq!(mf.blocks[0].insts.len(), 3);
1692 assert_eq!(
1693 mf.blocks[0].insts[0].operands,
1694 vec![
1695 MachineOperand::PhysReg(PhysReg::Gp(1)),
1696 MachineOperand::PhysReg(PhysReg::Gp(0)),
1697 ]
1698 );
1699 assert_eq!(
1700 mf.blocks[0].insts[1].operands,
1701 vec![
1702 MachineOperand::PhysReg(PhysReg::Gp(0)),
1703 MachineOperand::PhysReg(PhysReg::Gp(28)),
1704 ]
1705 );
1706 }
1707
1708 #[test]
1709 fn parallelize_call_arg_moves_breaks_cycles_with_scratch() {
1710 let mut mf = MachineFunction::new("test".into());
1711 mf.blocks[0].insts.push(MachineInst {
1712 opcode: ArmOpcode::MovReg,
1713 operands: vec![
1714 MachineOperand::PhysReg(PhysReg::Gp(0)),
1715 MachineOperand::PhysReg(PhysReg::Gp(1)),
1716 ],
1717 def: None,
1718 });
1719 mf.blocks[0].insts.push(MachineInst {
1720 opcode: ArmOpcode::MovReg,
1721 operands: vec![
1722 MachineOperand::PhysReg(PhysReg::Gp(1)),
1723 MachineOperand::PhysReg(PhysReg::Gp(0)),
1724 ],
1725 def: None,
1726 });
1727 mf.blocks[0].insts.push(MachineInst {
1728 opcode: ArmOpcode::Bl,
1729 operands: vec![MachineOperand::Extern("_callee".into())],
1730 def: None,
1731 });
1732
1733 parallelize_call_arg_moves(&mut mf);
1734
1735 assert_eq!(mf.blocks[0].insts.len(), 4);
1736 assert_eq!(
1737 mf.blocks[0].insts[0].operands,
1738 vec![
1739 MachineOperand::PhysReg(PhysReg::Gp(9)),
1740 MachineOperand::PhysReg(PhysReg::Gp(1)),
1741 ]
1742 );
1743 assert_eq!(
1744 mf.blocks[0].insts[1].operands,
1745 vec![
1746 MachineOperand::PhysReg(PhysReg::Gp(1)),
1747 MachineOperand::PhysReg(PhysReg::Gp(0)),
1748 ]
1749 );
1750 assert_eq!(
1751 mf.blocks[0].insts[2].operands,
1752 vec![
1753 MachineOperand::PhysReg(PhysReg::Gp(0)),
1754 MachineOperand::PhysReg(PhysReg::Gp(9)),
1755 ]
1756 );
1757 }
1758
1759 #[test]
1760 fn parallelize_call_arg_moves_ignores_zero_materialization_into_arg_regs() {
1761 let mut mf = MachineFunction::new("test".into());
1762 mf.blocks[0].insts.push(MachineInst {
1763 opcode: ArmOpcode::MovReg,
1764 operands: vec![
1765 MachineOperand::PhysReg(PhysReg::Gp(4)),
1766 MachineOperand::PhysReg(PhysReg::Xzr),
1767 ],
1768 def: None,
1769 });
1770 mf.blocks[0].insts.push(MachineInst {
1771 opcode: ArmOpcode::MovReg,
1772 operands: vec![
1773 MachineOperand::PhysReg(PhysReg::Gp(0)),
1774 MachineOperand::PhysReg(PhysReg::Gp(4)),
1775 ],
1776 def: None,
1777 });
1778 mf.blocks[0].insts.push(MachineInst {
1779 opcode: ArmOpcode::Blr,
1780 operands: vec![MachineOperand::PhysReg(PhysReg::Gp(19))],
1781 def: None,
1782 });
1783
1784 parallelize_call_arg_moves(&mut mf);
1785
1786 assert_eq!(mf.blocks[0].insts.len(), 3);
1787 assert_eq!(
1788 mf.blocks[0].insts[0].operands,
1789 vec![
1790 MachineOperand::PhysReg(PhysReg::Gp(4)),
1791 MachineOperand::PhysReg(PhysReg::Xzr),
1792 ]
1793 );
1794 assert_eq!(
1795 mf.blocks[0].insts[1].operands,
1796 vec![
1797 MachineOperand::PhysReg(PhysReg::Gp(0)),
1798 MachineOperand::PhysReg(PhysReg::Gp(4)),
1799 ]
1800 );
1801 }
1802
1803 #[test]
1804 fn linear_scan_prefers_caller_saved_when_no_calls_cross() {
1805 // No call-crossing intervals — every assigned vreg should
1806 // land on a caller-saved register and callee_saved_used
1807 // should stay empty (no prologue STP/LDP overhead).
1808 let mut func = Function::new("test".into(), vec![], IrType::Void);
1809 {
1810 let mut b = FuncBuilder::new(&mut func);
1811 let x = b.const_i32(1);
1812 let y = b.const_i32(2);
1813 let z = b.iadd(x, y);
1814 let _w = b.iadd(z, x);
1815 b.ret_void();
1816 }
1817 let mut mf = select_function(&func);
1818 let result = linear_scan(&mut mf);
1819 assert!(
1820 result.callee_saved_used.is_empty(),
1821 "no callee-saved expected for call-free function, got {:?}",
1822 result.callee_saved_used
1823 );
1824 for phys in result.assignments.values() {
1825 if let PhysReg::Gp(n) | PhysReg::Gp32(n) = phys {
1826 assert!(
1827 !GP_CALLEE_SAVED.contains(n),
1828 "vreg landed on callee-saved x{} despite caller-saved availability",
1829 n
1830 );
1831 }
1832 }
1833 }
1834
1835 #[test]
1836 fn parallelize_entry_arg_moves_resolves_swap_in_receipts() {
1837 let mut mf = MachineFunction::new("test".into());
1838 // Frame setup.
1839 mf.blocks[0].insts.push(MachineInst {
1840 opcode: ArmOpcode::StpPre,
1841 operands: vec![],
1842 def: None,
1843 });
1844 mf.blocks[0].insts.push(MachineInst {
1845 opcode: ArmOpcode::AddImm,
1846 operands: vec![],
1847 def: None,
1848 });
1849 // Arg-receipt swap: mov x4, x3; mov x3, x4. A naive
1850 // sequential emit would lose the original x4; the
1851 // parallelizer must route one side through scratch x9.
1852 mf.blocks[0].insts.push(MachineInst {
1853 opcode: ArmOpcode::MovReg,
1854 operands: vec![
1855 MachineOperand::PhysReg(PhysReg::Gp(4)),
1856 MachineOperand::PhysReg(PhysReg::Gp(3)),
1857 ],
1858 def: None,
1859 });
1860 mf.blocks[0].insts.push(MachineInst {
1861 opcode: ArmOpcode::MovReg,
1862 operands: vec![
1863 MachineOperand::PhysReg(PhysReg::Gp(3)),
1864 MachineOperand::PhysReg(PhysReg::Gp(4)),
1865 ],
1866 def: None,
1867 });
1868 parallelize_entry_arg_moves(&mut mf);
1869 let body: Vec<&[MachineOperand]> = mf.blocks[0].insts[2..]
1870 .iter()
1871 .map(|i| i.operands.as_slice())
1872 .collect();
1873 // Expect the parallelizer to insert a scratch save then
1874 // emit the two receipts without clobbering.
1875 assert_eq!(body.len(), 3, "swap pair should expand to 3 insts");
1876 assert_eq!(
1877 body[0],
1878 &[
1879 MachineOperand::PhysReg(PhysReg::Gp(9)),
1880 MachineOperand::PhysReg(PhysReg::Gp(3)),
1881 ]
1882 );
1883 }
1884
1885 #[test]
1886 fn parallelize_entry_arg_moves_tolerates_intervening_store() {
1887 let mut mf = MachineFunction::new("test".into());
1888 mf.blocks[0].insts.push(MachineInst {
1889 opcode: ArmOpcode::StpPre,
1890 operands: vec![],
1891 def: None,
1892 });
1893 mf.blocks[0].insts.push(MachineInst {
1894 opcode: ArmOpcode::AddImm,
1895 operands: vec![],
1896 def: None,
1897 });
1898 // mov x10, x0 (transparent: dst x10 outside arg-reg range)
1899 mf.blocks[0].insts.push(MachineInst {
1900 opcode: ArmOpcode::MovReg,
1901 operands: vec![
1902 MachineOperand::PhysReg(PhysReg::Gp(10)),
1903 MachineOperand::PhysReg(PhysReg::Gp(0)),
1904 ],
1905 def: None,
1906 });
1907 // str x10, [fp, #-56] (store, transparent — reads x10)
1908 mf.blocks[0].insts.push(MachineInst {
1909 opcode: ArmOpcode::StrImm,
1910 operands: vec![
1911 MachineOperand::PhysReg(PhysReg::Gp(10)),
1912 MachineOperand::PhysReg(PhysReg::FP),
1913 MachineOperand::Imm(-56),
1914 ],
1915 def: None,
1916 });
1917 // Swap pair after the store — must still be parallelized.
1918 mf.blocks[0].insts.push(MachineInst {
1919 opcode: ArmOpcode::MovReg,
1920 operands: vec![
1921 MachineOperand::PhysReg(PhysReg::Gp(4)),
1922 MachineOperand::PhysReg(PhysReg::Gp(3)),
1923 ],
1924 def: None,
1925 });
1926 mf.blocks[0].insts.push(MachineInst {
1927 opcode: ArmOpcode::MovReg,
1928 operands: vec![
1929 MachineOperand::PhysReg(PhysReg::Gp(3)),
1930 MachineOperand::PhysReg(PhysReg::Gp(4)),
1931 ],
1932 def: None,
1933 });
1934 parallelize_entry_arg_moves(&mut mf);
1935 // The store should still appear (kept in place); the swap
1936 // pair should have expanded to 3 movs via scratch.
1937 let opcodes: Vec<ArmOpcode> = mf.blocks[0].insts.iter().map(|i| i.opcode).collect();
1938 assert!(
1939 opcodes.contains(&ArmOpcode::StrImm),
1940 "store should be preserved across parallelization"
1941 );
1942 let mov_count = opcodes
1943 .iter()
1944 .filter(|&&op| op == ArmOpcode::MovReg)
1945 .count();
1946 assert_eq!(
1947 mov_count, 4,
1948 "expected 4 movs (1 receipt + 3 from swap expansion), got {}",
1949 mov_count
1950 );
1951 }
1952
1953 #[test]
1954 fn parallelize_call_arg_moves_routes_blr_target_through_scratch() {
1955 let mut mf = MachineFunction::new("test".into());
1956 // ldr x0, [...] — function pointer loaded into x0
1957 mf.blocks[0].insts.push(MachineInst {
1958 opcode: ArmOpcode::LdrImm,
1959 operands: vec![
1960 MachineOperand::PhysReg(PhysReg::Gp(0)),
1961 MachineOperand::PhysReg(PhysReg::FP),
1962 MachineOperand::Imm(-8),
1963 ],
1964 def: None,
1965 });
1966 // mov x0, x12 — arg-setup also targets x0 (clobbers fn ptr).
1967 mf.blocks[0].insts.push(MachineInst {
1968 opcode: ArmOpcode::MovReg,
1969 operands: vec![
1970 MachineOperand::PhysReg(PhysReg::Gp(0)),
1971 MachineOperand::PhysReg(PhysReg::Gp(12)),
1972 ],
1973 def: None,
1974 });
1975 mf.blocks[0].insts.push(MachineInst {
1976 opcode: ArmOpcode::Blr,
1977 operands: vec![MachineOperand::PhysReg(PhysReg::Gp(0))],
1978 def: None,
1979 });
1980 parallelize_call_arg_moves(&mut mf);
1981 // The ldr stays first; a `mov x10, x0` follows it to preserve the
1982 // fn pointer in scratch, and the BLR's operand becomes x10.
1983 assert_eq!(mf.blocks[0].insts[0].opcode, ArmOpcode::LdrImm);
1984 assert_eq!(mf.blocks[0].insts[1].opcode, ArmOpcode::MovReg);
1985 assert_eq!(
1986 mf.blocks[0].insts[1].operands,
1987 vec![
1988 MachineOperand::PhysReg(PhysReg::Gp(10)),
1989 MachineOperand::PhysReg(PhysReg::Gp(0)),
1990 ]
1991 );
1992 let blr = mf.blocks[0].insts.last().unwrap();
1993 assert_eq!(blr.opcode, ArmOpcode::Blr);
1994 assert_eq!(
1995 blr.operands,
1996 vec![MachineOperand::PhysReg(PhysReg::Gp(10))]
1997 );
1998 }
1999
2000 #[test]
2001 fn linear_scan_splits_call_crossing_when_callee_saved_exhausted() {
2002 // 12 vregs all live across a single BL force the allocator past
2003 // the 10-deep callee-saved pool. Without splitting, the overflow
2004 // vregs are fully spilled; with splitting, each gets a caller-saved
2005 // register before the call, an str/ldr bridge across it, and a
2006 // caller-saved register after it, so both halves stay in registers
2007 // and the only memory traffic is the bridge.
2008 let mut mf = MachineFunction::new("test".into());
2009 let vs: Vec<VRegId> = (0..12).map(|_| mf.new_vreg(RegClass::Gp64)).collect();
2010 for (i, &v) in vs.iter().enumerate() {
2011 mf.blocks[0].insts.push(MachineInst {
2012 opcode: ArmOpcode::Movz,
2013 operands: vec![MachineOperand::VReg(v), MachineOperand::Imm(i as i64 + 1)],
2014 def: Some(v),
2015 });
2016 }
2017 mf.blocks[0].insts.push(MachineInst {
2018 opcode: ArmOpcode::Bl,
2019 operands: vec![MachineOperand::Extern("_foo".into())],
2020 def: None,
2021 });
2022 for &v in &vs {
2023 mf.blocks[0].insts.push(MachineInst {
2024 opcode: ArmOpcode::MovReg,
2025 operands: vec![
2026 MachineOperand::PhysReg(PhysReg::Gp(0)),
2027 MachineOperand::VReg(v),
2028 ],
2029 def: None,
2030 });
2031 }
2032 mf.blocks[0].insts.push(MachineInst {
2033 opcode: ArmOpcode::Ret,
2034 operands: vec![],
2035 def: None,
2036 });
2037 let result = linear_scan(&mut mf);
2038 assert!(
2039 !result.split_records.is_empty(),
2040 "expected splits when 12 call-crossing vregs collide with the 10-deep callee-saved pool"
2041 );
2042 for r in &result.split_records {
2043 // pre-half must be on a caller-saved register; that's
2044 // the entire point of splitting (the callee-saved tier
2045 // is exhausted at split time).
2046 if let PhysReg::Gp(n) | PhysReg::Gp32(n) = r.pre_phys {
2047 assert!(
2048 !GP_CALLEE_SAVED.contains(&n),
2049 "split pre_phys should be caller-saved, got x{}",
2050 n
2051 );
2052 }
2053 // bridge_slot must be a real frame offset.
2054 assert!(r.bridge_slot < 0, "bridge_slot should be FP-negative");
2055 }
2056 }
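
    // Sketch tests for the two helpers above. Both exercise only behaviour
    // that is visible in this file (the (reg, end, vreg) layout of the
    // active-interval tuples and the reserved scratch banks), nothing beyond.
    #[test]
    fn expire_intervals_returns_registers_to_matching_pool() {
        let mut mf = MachineFunction::new("test".into());
        let v0 = mf.new_vreg(RegClass::Gp64);
        let v1 = mf.new_vreg(RegClass::Gp64);
        let v2 = mf.new_vreg(RegClass::Gp64);
        // x12 ends at 3 (caller-saved), x19 ends at 4 (callee-saved),
        // x13 is still live at position 10.
        let mut active = vec![(12u8, 3u32, v0), (19u8, 4u32, v1), (13u8, 10u32, v2)];
        let mut free_caller: Vec<u8> = Vec::new();
        let mut free_callee: Vec<u8> = Vec::new();
        expire_intervals(
            &mut active,
            &mut free_caller,
            &mut free_callee,
            &GP_CALLEE_SAVED,
            5,
        );
        assert_eq!(active.len(), 1, "only the interval live past pos 5 remains");
        assert_eq!(active[0].0, 13);
        assert_eq!(free_caller, vec![12u8], "x12 returns to the caller-saved pool");
        assert_eq!(free_callee, vec![19u8], "x19 returns to the callee-saved pool");
    }

    #[test]
    fn spill_scratch_cycles_through_reserved_registers() {
        // GP reloads wrap around x9-x11 and never touch the allocation order.
        for idx in 0..6 {
            let (reg, op) = spill_scratch(RegClass::Gp64, idx);
            assert_eq!(op, ArmOpcode::LdrImm);
            match reg {
                PhysReg::Gp(n) => {
                    assert_eq!(n, GP_SPILL_SCRATCH[idx % 3]);
                    assert!(
                        !GP_ALLOC_ORDER.contains(&n),
                        "scratch must stay out of the allocation pool"
                    );
                }
                _ => panic!("GP64 spill scratch should be a 64-bit GP register"),
            }
        }
        // V128 reloads use LdrQ on the reserved FP scratch bank.
        let (reg, op) = spill_scratch(RegClass::V128, 0);
        assert_eq!(op, ArmOpcode::LdrQ);
        match reg {
            PhysReg::Fp(n) => assert!(FP_SPILL_SCRATCH.contains(&n)),
            _ => panic!("V128 spill scratch should use the FP bank"),
        }
    }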
2057 }
2058