Rust · 56683 bytes Raw Blame History
1 //! Post-isel peephole optimizations.
2 //!
3 //! Runs on `MachineFunction` after instruction selection and before
4 //! register allocation. Virtual registers are still in use.
5 //!
6 //! ## FMADD / FMSUB fusion
7 //!
8 //! ARM64 has fused multiply-add instructions that are both faster and
9 //! more accurate (single rounding) than a separate fmul + fadd pair.
10 //!
11 //! Matched patterns and their replacement:
12 //!
13 //! | Pattern | Replacement | ARM64 opcode |
14 //! |---------------------------------------|-----------------------------|--------------|
15 //! | `fadd(fmul(a, b), c)` or commuted | `result = c + a*b` | FMADD |
16 //! | `fsub(c, fmul(a, b))` | `result = c - a*b` | FMSUB |
17 //! | `fsub(fmul(a, b), c)` | `result = a*b - c` | FNMSUB |
18 //!
19 //! ## MADD / MSUB fusion (integer)
20 //!
21 //! Same shape as the FP family but for integer ops. ARM64's `madd` /
22 //! `msub` are 3-source integer multiply-add/subtract that fuse `mul`
23 //! followed by `add`/`sub` in a single instruction. We never spelled
24 //! these from `isel.rs` directly because the IR exposes `IAdd(IMul, c)`
25 //! as two separate opcodes and isel emits them naively.
26 //!
27 //! Matched patterns:
28 //!
29 //! | Pattern | Replacement | ARM64 opcode |
30 //! |---------------------------------------|-----------------------------|--------------|
31 //! | `add(mul(a, b), c)` or commuted | `result = c + a*b` | MADD |
32 //! | `sub(c, mul(a, b))` | `result = c - a*b` | MSUB |
33 //!
34 //! Note: `sub(mul(a,b), c)` (i.e. `a*b - c`) has no single-instruction
35 //! ARM64 form for integer ops — `msub` computes `Xa - Xn*Xm`, not the
36 //! other order, and there is no integer analogue of `fnmsub`. We leave
37 //! that pattern as-is.
38 //!
39 //! ## Preconditions
40 //!
41 //! * The MUL/FMUL result is used exactly once (the subsequent ADD/SUB).
42 //! * Both instructions are in the same machine block.
43 //!
44 //! After fusion the multiply is removed and the add/sub is replaced
45 //! with the three-source instruction.
46
47 use super::mir::{ArmOpcode, MachineFunction, MachineInst, MachineOperand, RegClass, VRegId};
48 use std::collections::HashMap;
49
50 /// Run all peephole passes on a machine function. Iterates to fixpoint
51 /// so newly-exposed patterns (e.g., a strength-reduction that produces
52 /// an `add x, x, x` fed by a mul-add candidate) compose without
53 /// requiring callers to re-invoke us. Capped at a small number of
54 /// rounds — passes are monotone (only remove or simplify), so fixpoint
55 /// is bounded by instruction count and an 8-round ceiling is plenty in
56 /// practice while still surfacing pathological non-convergence as an
57 /// internal-error rather than a hang.
58 pub fn run_peephole(mf: &mut MachineFunction) {
59 const MAX_ROUNDS: u32 = 8;
60 for _ in 0..MAX_ROUNDS {
61 let before = total_inst_count(mf);
62 // Scaled-addressing fusion first so it can consume the
63 // Mul + AddReg pair before madd_fusion would collapse them
64 // into a Madd. A single load-with-shift beats a fused madd
65 // followed by a plain ldr.
66 scaled_addressing_fusion(mf);
67 fma_fusion(mf);
68 madd_fusion(mf);
69 ldp_stp_fusion(mf);
70 if total_inst_count(mf) == before {
71 return;
72 }
73 }
74 }
75
76 fn total_inst_count(mf: &MachineFunction) -> usize {
77 mf.blocks.iter().map(|b| b.insts.len()).sum()
78 }
79
80 /// FMADD/FMSUB/FNMSUB fusion.
81 fn fma_fusion(mf: &mut MachineFunction) {
82 for mb_idx in 0..mf.blocks.len() {
83 fma_fuse_block(mf, mb_idx);
84 }
85 }
86
87 fn fma_fuse_block(mf: &mut MachineFunction, mb_idx: usize) {
88 let block = &mf.blocks[mb_idx];
89
90 // Count uses of each defined VReg within this block.
91 // A fmul result is fusable only when its use count == 1 (the fadd/fsub).
92 let mut use_count: HashMap<VRegId, usize> = HashMap::new();
93 for inst in &block.insts {
94 for op in &inst.operands {
95 if let MachineOperand::VReg(v) = op {
96 *use_count.entry(*v).or_insert(0) += 1;
97 }
98 }
99 }
100 // Subtract the self-def (operands[0] is the dest, counted in use_count
101 // but it's a def not a use). For a 3-operand FmulS [dest, src0, src1]:
102 // dest appears once in operands but it's the definition, not a use.
103 // Re-compute: only operands[1..] are uses.
104 let mut use_count: HashMap<VRegId, usize> = HashMap::new();
105 for inst in &block.insts {
106 // Operands beyond index 0 are inputs (index 0 is the output dest).
107 for op in inst.operands.iter().skip(1) {
108 if let MachineOperand::VReg(v) = op {
109 *use_count.entry(*v).or_insert(0) += 1;
110 }
111 }
112 }
113
114 // Map: vreg defined by a fmul instruction → (block-instruction-index, precision)
115 #[derive(Clone, Copy)]
116 enum Prec {
117 S,
118 D,
119 }
120 let mut fmul_defs: HashMap<VRegId, (usize, Prec)> = HashMap::new();
121
122 for (i, inst) in block.insts.iter().enumerate() {
123 match inst.opcode {
124 ArmOpcode::FmulS => {
125 if let Some(d) = inst.def {
126 fmul_defs.insert(d, (i, Prec::S));
127 }
128 }
129 ArmOpcode::FmulD => {
130 if let Some(d) = inst.def {
131 fmul_defs.insert(d, (i, Prec::D));
132 }
133 }
134 _ => {}
135 }
136 }
137
138 // Collect fusions to apply (instruction index → replacement info).
139 // Each entry: (fadd/fsub_idx, fmul_idx, new_opcode, new_operands)
140 struct FusionPlan {
141 add_idx: usize,
142 mul_idx: usize,
143 new_opcode: ArmOpcode,
144 // operands for the fused inst: [dest, Fn, Fm, Fa]
145 n: VRegId, // multiply lhs
146 m: VRegId, // multiply rhs
147 a: VRegId, // addend / subtrahend (accumulate register)
148 }
149 let mut plans: Vec<FusionPlan> = Vec::new();
150
151 let block = &mf.blocks[mb_idx];
152 for (i, inst) in block.insts.iter().enumerate() {
153 let (is_add, is_sub, prec_s) = match inst.opcode {
154 ArmOpcode::FaddS => (true, false, true),
155 ArmOpcode::FaddD => (true, false, false),
156 ArmOpcode::FsubS => (false, true, true),
157 ArmOpcode::FsubD => (false, true, false),
158 _ => continue,
159 };
160 if inst.operands.len() < 3 {
161 continue;
162 }
163
164 // operands: [dest, src0, src1]
165 let src0 = match &inst.operands[1] {
166 MachineOperand::VReg(v) => *v,
167 _ => continue,
168 };
169 let src1 = match &inst.operands[2] {
170 MachineOperand::VReg(v) => *v,
171 _ => continue,
172 };
173
174 // Check if src0 is a single-use fmul result.
175 let try_fuse_src0 = fmul_defs
176 .get(&src0)
177 .filter(|_| use_count.get(&src0).copied().unwrap_or(0) == 1);
178 // Check if src1 is a single-use fmul result.
179 let try_fuse_src1 = if is_add {
180 fmul_defs
181 .get(&src1)
182 .filter(|_| use_count.get(&src1).copied().unwrap_or(0) == 1)
183 } else {
184 None // fsub(c, fmul(a,b)): src0=c, src1=fmul → handled separately
185 };
186
187 if is_add {
188 // fadd(fmul(a,b), c) → FMADD(a, b, c)
189 if let Some(&(mul_idx, _)) = try_fuse_src0 {
190 let mul_inst = &block.insts[mul_idx];
191 let n = match &mul_inst.operands[1] {
192 MachineOperand::VReg(v) => *v,
193 _ => continue,
194 };
195 let m = match &mul_inst.operands[2] {
196 MachineOperand::VReg(v) => *v,
197 _ => continue,
198 };
199 let opcode = if prec_s {
200 ArmOpcode::FmaddS
201 } else {
202 ArmOpcode::FmaddD
203 };
204 plans.push(FusionPlan {
205 add_idx: i,
206 mul_idx,
207 new_opcode: opcode,
208 n,
209 m,
210 a: src1,
211 });
212 } else if let Some(&(mul_idx, _)) = try_fuse_src1 {
213 // fadd(c, fmul(a,b)) → FMADD(a, b, c) [commuted]
214 let mul_inst = &block.insts[mul_idx];
215 let n = match &mul_inst.operands[1] {
216 MachineOperand::VReg(v) => *v,
217 _ => continue,
218 };
219 let m = match &mul_inst.operands[2] {
220 MachineOperand::VReg(v) => *v,
221 _ => continue,
222 };
223 let opcode = if prec_s {
224 ArmOpcode::FmaddS
225 } else {
226 ArmOpcode::FmaddD
227 };
228 plans.push(FusionPlan {
229 add_idx: i,
230 mul_idx,
231 new_opcode: opcode,
232 n,
233 m,
234 a: src0,
235 });
236 }
237 } else if is_sub {
238 // fsub(fmul(a,b), c) → FNMSUB(a, b, c) [result = a*b - c]
239 if let Some(&(mul_idx, _)) = try_fuse_src0 {
240 let mul_inst = &block.insts[mul_idx];
241 let n = match &mul_inst.operands[1] {
242 MachineOperand::VReg(v) => *v,
243 _ => continue,
244 };
245 let m = match &mul_inst.operands[2] {
246 MachineOperand::VReg(v) => *v,
247 _ => continue,
248 };
249 let opcode = if prec_s {
250 ArmOpcode::FnmsubS
251 } else {
252 ArmOpcode::FnmsubD
253 };
254 plans.push(FusionPlan {
255 add_idx: i,
256 mul_idx,
257 new_opcode: opcode,
258 n,
259 m,
260 a: src1,
261 });
262 }
263 // fsub(c, fmul(a,b)) → FMSUB(a, b, c) [result = c - a*b]
264 // src0=c, src1=fmul_result
265 let try_fuse_sub1 = fmul_defs
266 .get(&src1)
267 .filter(|_| use_count.get(&src1).copied().unwrap_or(0) == 1);
268 if let Some(&(mul_idx, _)) = try_fuse_sub1 {
269 let mul_inst = &block.insts[mul_idx];
270 let n = match &mul_inst.operands[1] {
271 MachineOperand::VReg(v) => *v,
272 _ => continue,
273 };
274 let m = match &mul_inst.operands[2] {
275 MachineOperand::VReg(v) => *v,
276 _ => continue,
277 };
278 let opcode = if prec_s {
279 ArmOpcode::FmsubS
280 } else {
281 ArmOpcode::FmsubD
282 };
283 plans.push(FusionPlan {
284 add_idx: i,
285 mul_idx,
286 new_opcode: opcode,
287 n,
288 m,
289 a: src0,
290 });
291 }
292 }
293 }
294
295 if plans.is_empty() {
296 return;
297 }
298
299 // Apply plans in reverse order of add_idx to keep indices stable.
300 plans.sort_by_key(|plan| std::cmp::Reverse(plan.add_idx));
301
302 // Collect mul_idxs to remove.
303 let mut remove_idxs: std::collections::HashSet<usize> =
304 plans.iter().map(|p| p.mul_idx).collect();
305
306 // Rewrite the block.
307 let block = &mut mf.blocks[mb_idx];
308 for plan in &plans {
309 let dest = block.insts[plan.add_idx].def;
310 let dest_op = block.insts[plan.add_idx].operands[0].clone();
311 block.insts[plan.add_idx] = MachineInst {
312 opcode: plan.new_opcode,
313 operands: vec![
314 dest_op,
315 MachineOperand::VReg(plan.n),
316 MachineOperand::VReg(plan.m),
317 MachineOperand::VReg(plan.a),
318 ],
319 def: dest,
320 };
321 }
322 let mut idx = 0usize;
323 block.insts.retain(|_| {
324 let keep = !remove_idxs.remove(&idx);
325 idx += 1;
326 keep
327 });
328 }
329
330 /// Integer MADD/MSUB fusion. Mirrors `fma_fusion` for `Mul + AddReg`
331 /// and `Mul + SubReg`. Matched patterns:
332 ///
333 /// * `AddReg(Mul(a, b), c)` → `Madd(a, b, c)`
334 /// * `AddReg(c, Mul(a, b))` → `Madd(a, b, c)` (commuted)
335 /// * `SubReg(c, Mul(a, b))` → `Msub(a, b, c)` (only this order — there
336 /// is no integer FNMSUB analogue)
337 ///
338 /// Preconditions match the FP version: same block, mul result has
339 /// exactly one use (the add/sub).
340 fn madd_fusion(mf: &mut MachineFunction) {
341 for mb_idx in 0..mf.blocks.len() {
342 madd_fuse_block(mf, mb_idx);
343 }
344 }
345
346 fn madd_fuse_block(mf: &mut MachineFunction, mb_idx: usize) {
347 let block = &mf.blocks[mb_idx];
348
349 // Per-block use counts of values, *as inputs only*. operands[0] is
350 // the def for AddReg/SubReg/Mul, so we skip index 0 to count true
351 // uses.
352 let mut use_count: HashMap<VRegId, usize> = HashMap::new();
353 for inst in &block.insts {
354 for op in inst.operands.iter().skip(1) {
355 if let MachineOperand::VReg(v) = op {
356 *use_count.entry(*v).or_insert(0) += 1;
357 }
358 }
359 }
360
361 // VReg defined by an integer Mul → block index of that mul.
362 let mut mul_defs: HashMap<VRegId, usize> = HashMap::new();
363 for (i, inst) in block.insts.iter().enumerate() {
364 if inst.opcode == ArmOpcode::Mul {
365 if let Some(d) = inst.def {
366 mul_defs.insert(d, i);
367 }
368 }
369 }
370
371 struct FusionPlan {
372 add_idx: usize,
373 mul_idx: usize,
374 new_opcode: ArmOpcode,
375 // [dest, n, m, a]
376 n: VRegId,
377 m: VRegId,
378 a: VRegId,
379 }
380 let mut plans: Vec<FusionPlan> = Vec::new();
381
382 for (i, inst) in block.insts.iter().enumerate() {
383 let (is_add, is_sub) = match inst.opcode {
384 ArmOpcode::AddReg => (true, false),
385 ArmOpcode::SubReg => (false, true),
386 _ => continue,
387 };
388 if inst.operands.len() < 3 {
389 continue;
390 }
391 let src0 = match &inst.operands[1] {
392 MachineOperand::VReg(v) => *v,
393 _ => continue,
394 };
395 let src1 = match &inst.operands[2] {
396 MachineOperand::VReg(v) => *v,
397 _ => continue,
398 };
399
400 let single_use = |v: VRegId| use_count.get(&v).copied().unwrap_or(0) == 1;
401
402 if is_add {
403 // AddReg(Mul(a, b), c): src0 is the mul result.
404 if let Some(&mul_idx) = mul_defs.get(&src0) {
405 if !single_use(src0) {
406 // mul result is read by something else too — can't fuse.
407 } else {
408 let mul_inst = &block.insts[mul_idx];
409 let n = match &mul_inst.operands[1] {
410 MachineOperand::VReg(v) => *v,
411 _ => continue,
412 };
413 let m = match &mul_inst.operands[2] {
414 MachineOperand::VReg(v) => *v,
415 _ => continue,
416 };
417 plans.push(FusionPlan {
418 add_idx: i,
419 mul_idx,
420 new_opcode: ArmOpcode::Madd,
421 n,
422 m,
423 a: src1,
424 });
425 continue;
426 }
427 }
428 // AddReg(c, Mul(a, b)) [commuted].
429 if let Some(&mul_idx) = mul_defs.get(&src1) {
430 if !single_use(src1) {
431 continue;
432 }
433 let mul_inst = &block.insts[mul_idx];
434 let n = match &mul_inst.operands[1] {
435 MachineOperand::VReg(v) => *v,
436 _ => continue,
437 };
438 let m = match &mul_inst.operands[2] {
439 MachineOperand::VReg(v) => *v,
440 _ => continue,
441 };
442 plans.push(FusionPlan {
443 add_idx: i,
444 mul_idx,
445 new_opcode: ArmOpcode::Madd,
446 n,
447 m,
448 a: src0,
449 });
450 }
451 } else if is_sub {
452 // SubReg(c, Mul(a, b)) → Msub. The other order
453 // (`a*b - c`) has no integer fused form on ARM64.
454 if let Some(&mul_idx) = mul_defs.get(&src1) {
455 if !single_use(src1) {
456 continue;
457 }
458 let mul_inst = &block.insts[mul_idx];
459 let n = match &mul_inst.operands[1] {
460 MachineOperand::VReg(v) => *v,
461 _ => continue,
462 };
463 let m = match &mul_inst.operands[2] {
464 MachineOperand::VReg(v) => *v,
465 _ => continue,
466 };
467 plans.push(FusionPlan {
468 add_idx: i,
469 mul_idx,
470 new_opcode: ArmOpcode::Msub,
471 n,
472 m,
473 a: src0,
474 });
475 }
476 }
477 }
478
479 if plans.is_empty() {
480 return;
481 }
482
483 // Apply plans in reverse add_idx order to keep indices stable.
484 plans.sort_by_key(|plan| std::cmp::Reverse(plan.add_idx));
485 let mut remove_idxs: std::collections::HashSet<usize> =
486 plans.iter().map(|p| p.mul_idx).collect();
487
488 let block = &mut mf.blocks[mb_idx];
489 for plan in &plans {
490 let dest = block.insts[plan.add_idx].def;
491 let dest_op = block.insts[plan.add_idx].operands[0].clone();
492 block.insts[plan.add_idx] = MachineInst {
493 opcode: plan.new_opcode,
494 operands: vec![
495 dest_op,
496 MachineOperand::VReg(plan.n),
497 MachineOperand::VReg(plan.m),
498 MachineOperand::VReg(plan.a),
499 ],
500 def: dest,
501 };
502 }
503 let mut idx = 0usize;
504 block.insts.retain(|_| {
505 let keep = !remove_idxs.remove(&idx);
506 idx += 1;
507 keep
508 });
509 }
510
511 /// ARM64 scaled-register addressing fusion.
512 ///
513 /// Rewrites the 4-instruction sequence emitted by GEP+Load/Store
514 /// lowering when `elem_size` is a power of two ≤ 8:
515 ///
516 /// ```text
517 /// movz tmp, #elem_size, 0 ; (or const-materializing equivalent)
518 /// mul scaled, idx, tmp
519 /// add addr, base, scaled
520 /// ldr dest, [addr] ; LdrImm/LdrFpImm/StrImm/StrFpImm, offset 0
521 /// ```
522 ///
523 /// into a single instruction:
524 ///
525 /// ```text
526 /// ldr dest, [base, idx, lsl #shift]
527 /// ```
528 ///
529 /// Preconditions checked per fusion site:
530 /// * `tmp` has exactly one use (the mul) and is defined by a single
531 /// `Movz` whose chunk value ∈ {1, 2, 4, 8} with shift 0.
532 /// * `scaled` (the mul result) has exactly one use (the add).
533 /// * `addr` (the add result) has exactly one use (the load/store).
534 /// * The load/store's offset operand is `Imm(0)`.
535 /// * All four instructions live in the same machine block.
536 ///
537 /// Multi-use intermediates (e.g. the same GEP feeds two loads) keep
538 /// the unfused form — duplicating the index calculation across each
539 /// consumer would increase code size, not decrease it.
540 fn scaled_addressing_fusion(mf: &mut MachineFunction) {
541 // Compute use counts function-wide. The fold removes 3
542 // instructions (Movz, Mul, AddReg) and rewrites a 4th. If any of
543 // those defs is consumed in *another* block (cross-block live
544 // value), we must not fold — the consumer would lose its
545 // operand. A per-block count understates true uses when the
546 // value is live-out, which is what the cross-block path needs to
547 // reject. Counting once per function is cheap and correct.
548 let use_count = compute_global_use_counts(mf);
549 for mb_idx in 0..mf.blocks.len() {
550 scaled_addressing_fuse_block(mf, mb_idx, &use_count);
551 }
552 }
553
554 fn compute_global_use_counts(mf: &MachineFunction) -> HashMap<VRegId, usize> {
555 let mut use_count: HashMap<VRegId, usize> = HashMap::new();
556 for block in &mf.blocks {
557 for inst in &block.insts {
558 // operand[0] is the def for instructions that have one.
559 let skip_first = inst.def.is_some();
560 for (i, op) in inst.operands.iter().enumerate() {
561 if skip_first && i == 0 {
562 continue;
563 }
564 if let MachineOperand::VReg(v) = op {
565 *use_count.entry(*v).or_insert(0) += 1;
566 }
567 }
568 }
569 }
570 use_count
571 }
572
fn scaled_addressing_fuse_block(
    mf: &mut MachineFunction,
    mb_idx: usize,
    use_count: &HashMap<VRegId, usize>,
) {
    let block = &mf.blocks[mb_idx];

    // Map vreg → (block-local index of defining instruction, opcode).
    // The pattern is matched by walking these def links backwards from
    // each load/store: base ← AddReg ← Mul ← Movz.
    let mut defs: HashMap<VRegId, (usize, ArmOpcode)> = HashMap::new();
    for (i, inst) in block.insts.iter().enumerate() {
        if let Some(d) = inst.def {
            defs.insert(d, (i, inst.opcode));
        }
    }

    // One planned rewrite per matched load/store site: the load/store
    // is rewritten in place to the register-offset form and the three
    // feeder instructions (AddReg, Mul, Movz) are removed.
    struct FusionPlan {
        ldst_idx: usize,
        add_idx: usize,
        mul_idx: usize,
        movz_idx: usize,
        new_opcode: ArmOpcode,
        // [dest, base, idx, Imm(shift)]
        dest_op: MachineOperand,
        dest_def: Option<VRegId>,
        base: VRegId,
        idx: VRegId,
        shift: i64,
    }
    let mut plans: Vec<FusionPlan> = Vec::new();

    for (i, inst) in block.insts.iter().enumerate() {
        // Only the four imm-offset load/store forms have a
        // register-offset counterpart to rewrite into.
        let new_opcode = match inst.opcode {
            ArmOpcode::LdrImm => ArmOpcode::LdrReg,
            ArmOpcode::LdrFpImm => ArmOpcode::LdrFpReg,
            ArmOpcode::StrImm => ArmOpcode::StrReg,
            ArmOpcode::StrFpImm => ArmOpcode::StrFpReg,
            _ => continue,
        };
        // Format: [dest_or_src, base, offset]. Offset must be Imm(0) —
        // a nonzero immediate cannot be folded into the reg-offset form.
        if inst.operands.len() < 3 {
            continue;
        }
        let offset_zero = matches!(&inst.operands[2], MachineOperand::Imm(0));
        if !offset_zero {
            continue;
        }
        let base_addr = match &inst.operands[1] {
            MachineOperand::VReg(v) => *v,
            _ => continue,
        };

        // The base address must come from a single-use AddReg.
        let (add_idx, add_op) = match defs.get(&base_addr) {
            Some(&(idx, op)) => (idx, op),
            None => continue,
        };
        if add_op != ArmOpcode::AddReg {
            continue;
        }
        if use_count.get(&base_addr).copied().unwrap_or(0) != 1 {
            continue;
        }

        let add_inst = &block.insts[add_idx];
        if add_inst.operands.len() < 3 {
            continue;
        }
        let add_lhs = match &add_inst.operands[1] {
            MachineOperand::VReg(v) => *v,
            _ => continue,
        };
        let add_rhs = match &add_inst.operands[2] {
            MachineOperand::VReg(v) => *v,
            _ => continue,
        };

        // The scaled-index operand: defined by a single-use Mul. Try
        // either commuted order — `add base, scaled` or `add scaled,
        // base` (isel emits the former, but be defensive).
        // On success returns (mul_idx, base, idx, lsl_shift, movz_idx).
        let try_pair = |scaled: VRegId, base: VRegId| -> Option<(usize, VRegId, VRegId, i64, usize)> {
            let &(mul_idx, mul_op) = defs.get(&scaled)?;
            if mul_op != ArmOpcode::Mul {
                return None;
            }
            if use_count.get(&scaled).copied().unwrap_or(0) != 1 {
                return None;
            }
            let mul_inst = &block.insts[mul_idx];
            if mul_inst.operands.len() < 3 {
                return None;
            }
            let m_lhs = match &mul_inst.operands[1] {
                MachineOperand::VReg(v) => *v,
                _ => return None,
            };
            let m_rhs = match &mul_inst.operands[2] {
                MachineOperand::VReg(v) => *v,
                _ => return None,
            };
            // The constant operand of the Mul — must be defined by a
            // single-use Movz with chunk ∈ {1,2,4,8} at shift 0.
            // Returns (index vreg, lsl shift amount, movz index).
            let try_const = |const_v: VRegId, idx_v: VRegId| -> Option<(VRegId, i64, usize)> {
                let &(movz_idx, movz_op) = defs.get(&const_v)?;
                if movz_op != ArmOpcode::Movz {
                    return None;
                }
                if use_count.get(&const_v).copied().unwrap_or(0) != 1 {
                    return None;
                }
                let movz_inst = &block.insts[movz_idx];
                // Movz operands: [dest, Imm(chunk), Shift(amount)]. We
                // require shift==0 and chunk ∈ {1,2,4,8}.
                if movz_inst.operands.len() < 3 {
                    return None;
                }
                let chunk = match &movz_inst.operands[1] {
                    MachineOperand::Imm(v) => *v,
                    _ => return None,
                };
                let shift_amount = match &movz_inst.operands[2] {
                    MachineOperand::Shift(s) => *s,
                    _ => return None,
                };
                if shift_amount != 0 {
                    return None;
                }
                // elem_size → LSL amount for the scaled addressing mode.
                let lsl_shift: i64 = match chunk {
                    1 => 0,
                    2 => 1,
                    4 => 2,
                    8 => 3,
                    _ => return None,
                };
                Some((idx_v, lsl_shift, movz_idx))
            };
            // The Mul itself is commutative: try the constant on
            // either side.
            try_const(m_rhs, m_lhs)
                .or_else(|| try_const(m_lhs, m_rhs))
                .map(|(idx, shift, movz_idx)| (mul_idx, base, idx, shift, movz_idx))
        };

        let resolved = try_pair(add_rhs, add_lhs).or_else(|| try_pair(add_lhs, add_rhs));
        let Some((mul_idx, base_v, idx_v, shift, movz_idx)) = resolved else {
            continue;
        };

        plans.push(FusionPlan {
            ldst_idx: i,
            add_idx,
            mul_idx,
            movz_idx,
            new_opcode,
            dest_op: inst.operands[0].clone(),
            dest_def: inst.def,
            base: base_v,
            idx: idx_v,
            shift,
        });
    }

    if plans.is_empty() {
        return;
    }

    // Apply: rewrite the load/store in place; mark the mul/add/movz
    // for removal. Iterate in reverse ldst_idx order so removals
    // before the rewrite don't shift indices.
    plans.sort_by_key(|plan| std::cmp::Reverse(plan.ldst_idx));
    let mut remove_idxs: std::collections::HashSet<usize> = plans
        .iter()
        .flat_map(|p| [p.add_idx, p.mul_idx, p.movz_idx])
        .collect();

    let block = &mut mf.blocks[mb_idx];
    for plan in &plans {
        // New shape: [dest, base, idx, Imm(lsl shift)].
        block.insts[plan.ldst_idx] = MachineInst {
            opcode: plan.new_opcode,
            operands: vec![
                plan.dest_op.clone(),
                MachineOperand::VReg(plan.base),
                MachineOperand::VReg(plan.idx),
                MachineOperand::Imm(plan.shift),
            ],
            def: plan.dest_def,
        };
    }
    // Single sweep: drop every marked index, keep everything else.
    let mut idx = 0usize;
    block.insts.retain(|_| {
        let keep = !remove_idxs.remove(&idx);
        idx += 1;
        keep
    });
}
765
766 /// LDP/STP pair fusion.
767 ///
768 /// Walks each basic block and fuses **immediately-adjacent** ldr/str
769 /// pairs that share a base register and have offsets differing by
770 /// exactly the access width. The two get rewritten as a single
771 /// `LdpOffset` / `StpOffset` whose imm targets the LOWER address
772 /// (Rt1 sits at imm, Rt2 at imm + width per ARM ARM C6.2.156).
773 ///
774 /// Restrictions kept narrow on this first cut:
775 /// * Only adjacent instructions in the block (no window scan; no
776 /// intervening writes to base or memory). Broadening to a small
777 /// window with proper aliasing analysis is a follow-up if the
778 /// adjacent-only form proves too restrictive.
779 /// * Same opcode and same width on both ops. We support 4-byte
780 /// (`LdrImm`/`StrImm` Gp32, `LdrFpImm`/`StrFpImm` Fp32) and 8-byte
781 /// (Gp64, Fp64) variants. The `LdrshImm`/`LdrsbImm`/`StrhImm`/
782 /// `StrbImm` half/byte forms have no LDP/STP analogue and are
783 /// skipped.
784 /// * Both offsets must be plain `Imm(_)` (not `FrameSlot` — the
785 /// prologue STP/LDP already covers callee-save spills, and frame
786 /// slots get materialized to immediates downstream anyway).
787 /// * For loads, the two destination registers must differ. ARM ARM
788 /// declares `LDP Rt, Rt, ...` UNPREDICTABLE.
789 /// * For loads, the first load's destination must not equal the base
790 /// register. The fused LDP would still read the unmodified base,
791 /// which silently breaks the original semantics where the second
792 /// ldr saw the post-write base.
793 /// * Combined offset must lie within signed 7-bit × width range.
794 fn ldp_stp_fusion(mf: &mut MachineFunction) {
795 for mb_idx in 0..mf.blocks.len() {
796 ldp_stp_fuse_block(mf, mb_idx);
797 }
798 }
799
800 fn vreg_class(mf: &MachineFunction, v: VRegId) -> Option<RegClass> {
801 mf.vregs.get(v.0 as usize).map(|r| r.class)
802 }
803
804 fn class_byte_width(c: RegClass) -> u32 {
805 match c {
806 RegClass::Gp32 | RegClass::Fp32 => 4,
807 RegClass::Gp64 | RegClass::Fp64 => 8,
808 RegClass::V128 => 16,
809 }
810 }
811
812 /// Width inferred from operand[0] of a load/store instruction. For
813 /// loads the operand is the dest vreg; for stores it is the source
814 /// vreg — both carry the access width via their RegClass.
815 fn ldst_access_width(mf: &MachineFunction, inst: &MachineInst) -> Option<u32> {
816 let v = match inst.operands.first()? {
817 MachineOperand::VReg(v) => *v,
818 _ => return None,
819 };
820 let class = vreg_class(mf, v)?;
821 Some(class_byte_width(class))
822 }
823
/// True when `off` is encodable as an LDP/STP scaled offset for the
/// given access width: a multiple of the width whose scaled value fits
/// the signed 7-bit immediate field, i.e. [-64, 63] width-units.
fn ldp_imm_in_range(off: i64, width: u32) -> bool {
    let stride = width as i64;
    match off % stride {
        0 => (-64..=63).contains(&(off / stride)),
        _ => false, // misaligned — not representable at all
    }
}
833
834 fn ldp_stp_fuse_block(mf: &mut MachineFunction, mb_idx: usize) {
835 let block_len = mf.blocks[mb_idx].insts.len();
836 if block_len < 2 {
837 return;
838 }
839
840 enum LdSt {
841 LdrInt,
842 LdrFp,
843 StrInt,
844 StrFp,
845 }
846 fn classify(op: ArmOpcode) -> Option<LdSt> {
847 Some(match op {
848 ArmOpcode::LdrImm => LdSt::LdrInt,
849 ArmOpcode::LdrFpImm => LdSt::LdrFp,
850 ArmOpcode::StrImm => LdSt::StrInt,
851 ArmOpcode::StrFpImm => LdSt::StrFp,
852 _ => return None,
853 })
854 }
855 fn paired_opcode(kind: &LdSt) -> ArmOpcode {
856 // STP and LDP are width-agnostic at the opcode level; the
857 // emit layer picks `ldp x/w/d/s` from operand[0]'s register
858 // class. (afs-as does the same.)
859 match kind {
860 LdSt::LdrInt | LdSt::LdrFp => ArmOpcode::LdpOffset,
861 LdSt::StrInt | LdSt::StrFp => ArmOpcode::StpOffset,
862 }
863 }
864
865 // Plan: collect (lower_idx, upper_idx, replacement) and, after the
866 // walk, rewrite lower_idx in place and mark upper_idx for removal.
867 struct PairPlan {
868 lower_idx: usize,
869 upper_idx: usize,
870 new_inst: MachineInst,
871 }
872 let mut plans: Vec<PairPlan> = Vec::new();
873 let mut consumed: std::collections::HashSet<usize> = std::collections::HashSet::new();
874
875 let mut i = 0;
876 while i + 1 < block_len {
877 if consumed.contains(&i) {
878 i += 1;
879 continue;
880 }
881 let a = &mf.blocks[mb_idx].insts[i];
882 let b = &mf.blocks[mb_idx].insts[i + 1];
883 let (Some(ka), Some(kb)) = (classify(a.opcode), classify(b.opcode)) else {
884 i += 1;
885 continue;
886 };
887 // Must be the same opcode (loads can fuse only with loads,
888 // stores with stores, integer with integer, float with float).
889 if a.opcode != b.opcode {
890 i += 1;
891 continue;
892 }
893
894 if a.operands.len() < 3 || b.operands.len() < 3 {
895 i += 1;
896 continue;
897 }
898 let (av, bv) = (&a.operands[0], &b.operands[0]);
899 let (a_base, b_base) = (&a.operands[1], &b.operands[1]);
900 let (a_off_op, b_off_op) = (&a.operands[2], &b.operands[2]);
901 let (Some(a_off), Some(b_off)) = (
902 match a_off_op {
903 MachineOperand::Imm(v) => Some(*v),
904 _ => None,
905 },
906 match b_off_op {
907 MachineOperand::Imm(v) => Some(*v),
908 _ => None,
909 },
910 ) else {
911 i += 1;
912 continue;
913 };
914 if a_base != b_base {
915 i += 1;
916 continue;
917 }
918
919 let Some(width) = ldst_access_width(mf, a) else {
920 i += 1;
921 continue;
922 };
923 // Both ops must access the same width — the operand[0] vregs
924 // share a register class for legal LDP/STP.
925 let b_width = ldst_access_width(mf, b);
926 if Some(width) != b_width {
927 i += 1;
928 continue;
929 }
930 // Adjacent offsets: the higher one is exactly width above the lower.
931 let (lower_off, lower_idx_in_pair, upper_op_value) = if b_off == a_off + width as i64 {
932 (a_off, 0usize, bv.clone())
933 } else if a_off == b_off + width as i64 {
934 (b_off, 1usize, av.clone())
935 } else {
936 i += 1;
937 continue;
938 };
939 let lower_op_value = if lower_idx_in_pair == 0 { av.clone() } else { bv.clone() };
940
941 if !ldp_imm_in_range(lower_off, width) {
942 i += 1;
943 continue;
944 }
945
946 // For loads: forbid same-dest UNPREDICTABLE form, and forbid
947 // dest1 == base which would silently change the second load's
948 // address.
949 let (is_load, base_v) = match (&ka, a_base) {
950 (LdSt::LdrInt | LdSt::LdrFp, MachineOperand::VReg(v)) => (true, Some(*v)),
951 _ => (false, None),
952 };
953 if is_load {
954 let (MachineOperand::VReg(va), MachineOperand::VReg(vb)) = (av, bv) else {
955 i += 1;
956 continue;
957 };
958 if va == vb {
959 i += 1;
960 continue;
961 }
962 if let Some(b_v) = base_v {
963 // The first load (program order) is at index i. Its
964 // dest must not be the base — otherwise the second
965 // load (i+1) would be reading a different base.
966 let first_dest = match &mf.blocks[mb_idx].insts[i].operands[0] {
967 MachineOperand::VReg(v) => *v,
968 _ => {
969 i += 1;
970 continue;
971 }
972 };
973 if first_dest == b_v {
974 i += 1;
975 continue;
976 }
977 }
978 }
979 let _ = kb; // suppress "unused" — symmetry only.
980
981 // Build the LdpOffset / StpOffset. Operands: [r1, r2, base, imm].
982 let new_opcode = paired_opcode(&ka);
983 let new_def = if is_load {
984 // LDP defines two regs but our MachineInst.def is single-
985 // value. Track the higher of the two so reg-allocator
986 // liveness sees at least one def; the actual two-write
987 // semantics live in the operand list. (Existing prologue
988 // LdpOffset uses this same shape.)
989 match &lower_op_value {
990 MachineOperand::VReg(_) => Some(match upper_op_value {
991 MachineOperand::VReg(v) => v,
992 _ => {
993 i += 1;
994 continue;
995 }
996 }),
997 _ => None,
998 }
999 } else {
1000 None
1001 };
1002 let new_inst = MachineInst {
1003 opcode: new_opcode,
1004 operands: vec![
1005 lower_op_value,
1006 upper_op_value,
1007 a_base.clone(),
1008 MachineOperand::Imm(lower_off),
1009 ],
1010 def: new_def,
1011 };
1012 plans.push(PairPlan {
1013 lower_idx: i,
1014 upper_idx: i + 1,
1015 new_inst,
1016 });
1017 consumed.insert(i);
1018 consumed.insert(i + 1);
1019 i += 2;
1020 }
1021
1022 if plans.is_empty() {
1023 return;
1024 }
1025
1026 // Apply rewrites. Order doesn't matter because plans don't
1027 // overlap (consumed set ensures each index is in at most one).
1028 let block = &mut mf.blocks[mb_idx];
1029 let mut remove_idxs: std::collections::HashSet<usize> = plans.iter().map(|p| p.upper_idx).collect();
1030 for plan in &plans {
1031 block.insts[plan.lower_idx] = plan.new_inst.clone();
1032 }
1033 let mut idx = 0usize;
1034 block.insts.retain(|_| {
1035 let keep = !remove_idxs.remove(&idx);
1036 idx += 1;
1037 keep
1038 });
1039 }
1040
1041 #[cfg(test)]
1042 mod tests {
1043 use super::*;
1044 use crate::codegen::mir::{
1045 ArmOpcode, MBlockId, MachineBlock, MachineFunction, MachineInst, MachineOperand, RegClass,
1046 };
1047
1048 fn mf_with_insts(insts: Vec<MachineInst>) -> MachineFunction {
1049 let mut mf = MachineFunction::new("test".into());
1050 // Allocate enough vregs.
1051 for _ in 0..10 {
1052 mf.new_vreg(RegClass::Fp32);
1053 }
1054 let bid = MBlockId(0);
1055 mf.blocks = vec![MachineBlock {
1056 id: bid,
1057 label: "entry".into(),
1058 insts,
1059 }];
1060 mf
1061 }
1062
1063 fn vreg(v: u32) -> MachineOperand {
1064 MachineOperand::VReg(VRegId(v))
1065 }
1066 fn vid(v: u32) -> VRegId {
1067 VRegId(v)
1068 }
1069
1070 /// fadd(fmul(v1, v2), v3) → fmadd(v1, v2, v3)
1071 #[test]
1072 fn fmadd_f32() {
1073 let fmul = MachineInst {
1074 opcode: ArmOpcode::FmulS,
1075 operands: vec![vreg(0), vreg(1), vreg(2)],
1076 def: Some(vid(0)),
1077 };
1078 let fadd = MachineInst {
1079 opcode: ArmOpcode::FaddS,
1080 operands: vec![vreg(4), vreg(0), vreg(3)], // src0 = fmul result
1081 def: Some(vid(4)),
1082 };
1083 let mut mf = mf_with_insts(vec![fmul, fadd]);
1084 fma_fusion(&mut mf);
1085 let block = &mf.blocks[0];
1086 assert_eq!(block.insts.len(), 1, "fmul should be removed");
1087 assert_eq!(block.insts[0].opcode, ArmOpcode::FmaddS);
1088 // operands: [dest, n, m, a] = [v4, v1, v2, v3]
1089 assert_eq!(block.insts[0].operands[1], vreg(1));
1090 assert_eq!(block.insts[0].operands[2], vreg(2));
1091 assert_eq!(block.insts[0].operands[3], vreg(3));
1092 }
1093
1094 /// fadd(v3, fmul(v1, v2)) → fmadd(v1, v2, v3) [commuted]
1095 #[test]
1096 fn fmadd_commuted() {
1097 let fmul = MachineInst {
1098 opcode: ArmOpcode::FmulD,
1099 operands: vec![vreg(0), vreg(1), vreg(2)],
1100 def: Some(vid(0)),
1101 };
1102 let fadd = MachineInst {
1103 opcode: ArmOpcode::FaddD,
1104 operands: vec![vreg(4), vreg(3), vreg(0)], // src1 = fmul result
1105 def: Some(vid(4)),
1106 };
1107 let mut mf = mf_with_insts(vec![fmul, fadd]);
1108 fma_fusion(&mut mf);
1109 let block = &mf.blocks[0];
1110 assert_eq!(block.insts.len(), 1);
1111 assert_eq!(block.insts[0].opcode, ArmOpcode::FmaddD);
1112 assert_eq!(block.insts[0].operands[3], vreg(3)); // accumulator
1113 }
1114
1115 /// fsub(c, fmul(a,b)) → fmsub(a, b, c) [result = c - a*b]
1116 #[test]
1117 fn fmsub_f32() {
1118 let fmul = MachineInst {
1119 opcode: ArmOpcode::FmulS,
1120 operands: vec![vreg(0), vreg(1), vreg(2)],
1121 def: Some(vid(0)),
1122 };
1123 let fsub = MachineInst {
1124 opcode: ArmOpcode::FsubS,
1125 operands: vec![vreg(4), vreg(3), vreg(0)], // src1 = fmul result
1126 def: Some(vid(4)),
1127 };
1128 let mut mf = mf_with_insts(vec![fmul, fsub]);
1129 fma_fusion(&mut mf);
1130 let block = &mf.blocks[0];
1131 assert_eq!(block.insts.len(), 1);
1132 assert_eq!(block.insts[0].opcode, ArmOpcode::FmsubS);
1133 assert_eq!(block.insts[0].operands[3], vreg(3)); // Sa = c
1134 }
1135
1136 /// fsub(fmul(a,b), c) → fnmsub(a, b, c) [result = a*b - c]
1137 #[test]
1138 fn fnmsub_f64() {
1139 let fmul = MachineInst {
1140 opcode: ArmOpcode::FmulD,
1141 operands: vec![vreg(0), vreg(1), vreg(2)],
1142 def: Some(vid(0)),
1143 };
1144 let fsub = MachineInst {
1145 opcode: ArmOpcode::FsubD,
1146 operands: vec![vreg(4), vreg(0), vreg(3)], // src0 = fmul result
1147 def: Some(vid(4)),
1148 };
1149 let mut mf = mf_with_insts(vec![fmul, fsub]);
1150 fma_fusion(&mut mf);
1151 let block = &mf.blocks[0];
1152 assert_eq!(block.insts.len(), 1);
1153 assert_eq!(block.insts[0].opcode, ArmOpcode::FnmsubD);
1154 }
1155
1156 fn vreg_gp(v: u32) -> MachineOperand {
1157 MachineOperand::VReg(VRegId(v))
1158 }
1159
1160 /// add(mul(v1, v2), v3) → madd v4, v1, v2, v3
1161 #[test]
1162 fn integer_madd_fusion() {
1163 let mul = MachineInst {
1164 opcode: ArmOpcode::Mul,
1165 operands: vec![vreg_gp(0), vreg_gp(1), vreg_gp(2)],
1166 def: Some(vid(0)),
1167 };
1168 let add = MachineInst {
1169 opcode: ArmOpcode::AddReg,
1170 operands: vec![vreg_gp(4), vreg_gp(0), vreg_gp(3)],
1171 def: Some(vid(4)),
1172 };
1173 let mut mf = mf_with_insts(vec![mul, add]);
1174 madd_fusion(&mut mf);
1175 let block = &mf.blocks[0];
1176 assert_eq!(block.insts.len(), 1, "mul should be removed");
1177 assert_eq!(block.insts[0].opcode, ArmOpcode::Madd);
1178 assert_eq!(block.insts[0].operands[1], vreg_gp(1));
1179 assert_eq!(block.insts[0].operands[2], vreg_gp(2));
1180 assert_eq!(block.insts[0].operands[3], vreg_gp(3));
1181 }
1182
1183 /// add(v3, mul(v1, v2)) → madd v4, v1, v2, v3 (commuted)
1184 #[test]
1185 fn integer_madd_fusion_commuted() {
1186 let mul = MachineInst {
1187 opcode: ArmOpcode::Mul,
1188 operands: vec![vreg_gp(0), vreg_gp(1), vreg_gp(2)],
1189 def: Some(vid(0)),
1190 };
1191 let add = MachineInst {
1192 opcode: ArmOpcode::AddReg,
1193 operands: vec![vreg_gp(4), vreg_gp(3), vreg_gp(0)],
1194 def: Some(vid(4)),
1195 };
1196 let mut mf = mf_with_insts(vec![mul, add]);
1197 madd_fusion(&mut mf);
1198 let block = &mf.blocks[0];
1199 assert_eq!(block.insts.len(), 1);
1200 assert_eq!(block.insts[0].opcode, ArmOpcode::Madd);
1201 assert_eq!(block.insts[0].operands[3], vreg_gp(3), "accumulator is c");
1202 }
1203
1204 /// sub(c, mul(a, b)) → msub a, b, c (result = c - a*b)
1205 #[test]
1206 fn integer_msub_fusion() {
1207 let mul = MachineInst {
1208 opcode: ArmOpcode::Mul,
1209 operands: vec![vreg_gp(0), vreg_gp(1), vreg_gp(2)],
1210 def: Some(vid(0)),
1211 };
1212 let sub = MachineInst {
1213 opcode: ArmOpcode::SubReg,
1214 operands: vec![vreg_gp(4), vreg_gp(3), vreg_gp(0)],
1215 def: Some(vid(4)),
1216 };
1217 let mut mf = mf_with_insts(vec![mul, sub]);
1218 madd_fusion(&mut mf);
1219 let block = &mf.blocks[0];
1220 assert_eq!(block.insts.len(), 1);
1221 assert_eq!(block.insts[0].opcode, ArmOpcode::Msub);
1222 assert_eq!(block.insts[0].operands[3], vreg_gp(3), "Xa = c");
1223 }
1224
1225 /// sub(mul(a, b), c) is `a*b - c` — no integer FNMSUB form.
1226 /// The pass should leave it as-is.
1227 #[test]
1228 fn integer_sub_mul_first_not_fused() {
1229 let mul = MachineInst {
1230 opcode: ArmOpcode::Mul,
1231 operands: vec![vreg_gp(0), vreg_gp(1), vreg_gp(2)],
1232 def: Some(vid(0)),
1233 };
1234 let sub = MachineInst {
1235 opcode: ArmOpcode::SubReg,
1236 operands: vec![vreg_gp(4), vreg_gp(0), vreg_gp(3)],
1237 def: Some(vid(4)),
1238 };
1239 let mut mf = mf_with_insts(vec![mul, sub]);
1240 madd_fusion(&mut mf);
1241 // Both instructions should remain — there's no msub form for a*b - c.
1242 assert_eq!(mf.blocks[0].insts.len(), 2);
1243 assert_eq!(mf.blocks[0].insts[0].opcode, ArmOpcode::Mul);
1244 assert_eq!(mf.blocks[0].insts[1].opcode, ArmOpcode::SubReg);
1245 }
1246
1247 /// mul result with multiple consumers must NOT fuse.
1248 #[test]
1249 fn integer_mul_used_twice_not_fused() {
1250 let mul = MachineInst {
1251 opcode: ArmOpcode::Mul,
1252 operands: vec![vreg_gp(0), vreg_gp(1), vreg_gp(2)],
1253 def: Some(vid(0)),
1254 };
1255 let add1 = MachineInst {
1256 opcode: ArmOpcode::AddReg,
1257 operands: vec![vreg_gp(4), vreg_gp(0), vreg_gp(3)],
1258 def: Some(vid(4)),
1259 };
1260 let add2 = MachineInst {
1261 opcode: ArmOpcode::AddReg,
1262 operands: vec![vreg_gp(5), vreg_gp(0), vreg_gp(3)],
1263 def: Some(vid(5)),
1264 };
1265 let mut mf = mf_with_insts(vec![mul, add1, add2]);
1266 madd_fusion(&mut mf);
1267 assert_eq!(mf.blocks[0].insts.len(), 3, "mul has 2 uses, can't fuse");
1268 assert_eq!(mf.blocks[0].insts[0].opcode, ArmOpcode::Mul);
1269 }
1270
1271 /// fmul result used twice: must NOT be fused.
1272 #[test]
1273 fn no_fusion_if_mul_used_twice() {
1274 let fmul = MachineInst {
1275 opcode: ArmOpcode::FmulS,
1276 operands: vec![vreg(0), vreg(1), vreg(2)],
1277 def: Some(vid(0)),
1278 };
1279 // Two consumers of vreg(0):
1280 let fadd1 = MachineInst {
1281 opcode: ArmOpcode::FaddS,
1282 operands: vec![vreg(4), vreg(0), vreg(3)],
1283 def: Some(vid(4)),
1284 };
1285 let fadd2 = MachineInst {
1286 opcode: ArmOpcode::FaddS,
1287 operands: vec![vreg(5), vreg(0), vreg(3)],
1288 def: Some(vid(5)),
1289 };
1290 let mut mf = mf_with_insts(vec![fmul, fadd1, fadd2]);
1291 fma_fusion(&mut mf);
1292 // No fusion — fmul has 2 uses.
1293 assert_eq!(mf.blocks[0].insts.len(), 3);
1294 assert_eq!(mf.blocks[0].insts[0].opcode, ArmOpcode::FmulS);
1295 }
1296
1297 /// LdrFpImm fed by AddReg(base, Mul(idx, Movz(8))) → LdrFpReg with lsl=3.
1298 #[test]
1299 fn scaled_addressing_real8_load() {
1300 let movz = MachineInst {
1301 opcode: ArmOpcode::Movz,
1302 operands: vec![vreg(0), MachineOperand::Imm(8), MachineOperand::Shift(0)],
1303 def: Some(vid(0)),
1304 };
1305 let mul = MachineInst {
1306 opcode: ArmOpcode::Mul,
1307 operands: vec![vreg(1), vreg(2), vreg(0)], // scaled = idx(2) * const(0)
1308 def: Some(vid(1)),
1309 };
1310 let add = MachineInst {
1311 opcode: ArmOpcode::AddReg,
1312 operands: vec![vreg(3), vreg(4), vreg(1)], // addr = base(4) + scaled(1)
1313 def: Some(vid(3)),
1314 };
1315 let ldr = MachineInst {
1316 opcode: ArmOpcode::LdrFpImm,
1317 operands: vec![vreg(5), vreg(3), MachineOperand::Imm(0)], // ldr d5, [addr, #0]
1318 def: Some(vid(5)),
1319 };
1320 let mut mf = mf_with_insts(vec![movz, mul, add, ldr]);
1321 scaled_addressing_fusion(&mut mf);
1322 let block = &mf.blocks[0];
1323 assert_eq!(block.insts.len(), 1, "movz/mul/add should be removed");
1324 assert_eq!(block.insts[0].opcode, ArmOpcode::LdrFpReg);
1325 assert_eq!(block.insts[0].operands[0], vreg(5)); // dest
1326 assert_eq!(block.insts[0].operands[1], vreg(4)); // base
1327 assert_eq!(block.insts[0].operands[2], vreg(2)); // idx
1328 assert_eq!(block.insts[0].operands[3], MachineOperand::Imm(3)); // log2(8) = 3
1329 }
1330
1331 /// Multi-use Mul result → no fold (the mul result is consumed twice).
1332 #[test]
1333 fn scaled_addressing_no_fold_on_multi_use_mul() {
1334 let movz = MachineInst {
1335 opcode: ArmOpcode::Movz,
1336 operands: vec![vreg(0), MachineOperand::Imm(8), MachineOperand::Shift(0)],
1337 def: Some(vid(0)),
1338 };
1339 let mul = MachineInst {
1340 opcode: ArmOpcode::Mul,
1341 operands: vec![vreg(1), vreg(2), vreg(0)],
1342 def: Some(vid(1)),
1343 };
1344 let add = MachineInst {
1345 opcode: ArmOpcode::AddReg,
1346 operands: vec![vreg(3), vreg(4), vreg(1)],
1347 def: Some(vid(3)),
1348 };
1349 let ldr = MachineInst {
1350 opcode: ArmOpcode::LdrFpImm,
1351 operands: vec![vreg(5), vreg(3), MachineOperand::Imm(0)],
1352 def: Some(vid(5)),
1353 };
1354 // Second consumer of the mul result — extra reference to vreg(1).
1355 let extra = MachineInst {
1356 opcode: ArmOpcode::AddReg,
1357 operands: vec![vreg(6), vreg(7), vreg(1)],
1358 def: Some(vid(6)),
1359 };
1360 let mut mf = mf_with_insts(vec![movz, mul, add, ldr, extra]);
1361 scaled_addressing_fusion(&mut mf);
1362 // Nothing fused.
1363 assert_eq!(mf.blocks[0].insts.len(), 5);
1364 assert_eq!(mf.blocks[0].insts[3].opcode, ArmOpcode::LdrFpImm);
1365 }
1366
1367 /// Non-power-of-two const (e.g. element size 12 from a 3-int derived
1368 /// type) must NOT fold — there's no `lsl` encoding for it.
1369 #[test]
1370 fn scaled_addressing_no_fold_on_non_power_of_two() {
1371 let movz = MachineInst {
1372 opcode: ArmOpcode::Movz,
1373 operands: vec![vreg(0), MachineOperand::Imm(12), MachineOperand::Shift(0)],
1374 def: Some(vid(0)),
1375 };
1376 let mul = MachineInst {
1377 opcode: ArmOpcode::Mul,
1378 operands: vec![vreg(1), vreg(2), vreg(0)],
1379 def: Some(vid(1)),
1380 };
1381 let add = MachineInst {
1382 opcode: ArmOpcode::AddReg,
1383 operands: vec![vreg(3), vreg(4), vreg(1)],
1384 def: Some(vid(3)),
1385 };
1386 let ldr = MachineInst {
1387 opcode: ArmOpcode::LdrImm,
1388 operands: vec![vreg(5), vreg(3), MachineOperand::Imm(0)],
1389 def: Some(vid(5)),
1390 };
1391 let mut mf = mf_with_insts(vec![movz, mul, add, ldr]);
1392 scaled_addressing_fusion(&mut mf);
1393 assert_eq!(mf.blocks[0].insts.len(), 4);
1394 assert_eq!(mf.blocks[0].insts[3].opcode, ArmOpcode::LdrImm);
1395 }
1396
1397 fn mf_with_classes(insts: Vec<MachineInst>, classes: &[RegClass]) -> MachineFunction {
1398 let mut mf = MachineFunction::new("test".into());
1399 for c in classes {
1400 mf.new_vreg(*c);
1401 }
1402 let bid = MBlockId(0);
1403 mf.blocks = vec![MachineBlock {
1404 id: bid,
1405 label: "entry".into(),
1406 insts,
1407 }];
1408 mf
1409 }
1410
1411 /// Two adjacent `ldr x_, [base, #imm]` with offsets 0 and 8 fuse to `ldp`.
1412 #[test]
1413 fn ldp_fusion_int64_pair() {
1414 // vregs: 0,1,2 = Gp64
1415 let classes = vec![RegClass::Gp64; 3];
1416 let ldr1 = MachineInst {
1417 opcode: ArmOpcode::LdrImm,
1418 operands: vec![vreg(0), vreg(2), MachineOperand::Imm(0)],
1419 def: Some(vid(0)),
1420 };
1421 let ldr2 = MachineInst {
1422 opcode: ArmOpcode::LdrImm,
1423 operands: vec![vreg(1), vreg(2), MachineOperand::Imm(8)],
1424 def: Some(vid(1)),
1425 };
1426 let mut mf = mf_with_classes(vec![ldr1, ldr2], &classes);
1427 ldp_stp_fusion(&mut mf);
1428 let block = &mf.blocks[0];
1429 assert_eq!(block.insts.len(), 1);
1430 assert_eq!(block.insts[0].opcode, ArmOpcode::LdpOffset);
1431 // Lower-offset (0) load goes into Rt1 = vreg(0); higher (8) into Rt2 = vreg(1).
1432 assert_eq!(block.insts[0].operands[0], vreg(0));
1433 assert_eq!(block.insts[0].operands[1], vreg(1));
1434 assert_eq!(block.insts[0].operands[2], vreg(2));
1435 assert_eq!(block.insts[0].operands[3], MachineOperand::Imm(0));
1436 }
1437
1438 /// Two adjacent `str d_, [base, #imm]` with descending offsets fuse to `stp d`.
1439 #[test]
1440 fn stp_fusion_f64_pair_descending() {
1441 let classes = vec![RegClass::Fp64, RegClass::Fp64, RegClass::Gp64];
1442 let str_high = MachineInst {
1443 opcode: ArmOpcode::StrFpImm,
1444 operands: vec![vreg(0), vreg(2), MachineOperand::Imm(-16)],
1445 def: None,
1446 };
1447 let str_low = MachineInst {
1448 opcode: ArmOpcode::StrFpImm,
1449 operands: vec![vreg(1), vreg(2), MachineOperand::Imm(-24)],
1450 def: None,
1451 };
1452 let mut mf = mf_with_classes(vec![str_high, str_low], &classes);
1453 ldp_stp_fusion(&mut mf);
1454 let block = &mf.blocks[0];
1455 assert_eq!(block.insts.len(), 1);
1456 assert_eq!(block.insts[0].opcode, ArmOpcode::StpOffset);
1457 // Lower-offset (-24) source goes into Rt1, so Rt1 = vreg(1).
1458 assert_eq!(block.insts[0].operands[0], vreg(1));
1459 assert_eq!(block.insts[0].operands[1], vreg(0));
1460 assert_eq!(block.insts[0].operands[3], MachineOperand::Imm(-24));
1461 }
1462
1463 /// Different bases — no fusion.
1464 #[test]
1465 fn ldp_no_fusion_different_base() {
1466 let classes = vec![RegClass::Gp64; 4];
1467 let ldr1 = MachineInst {
1468 opcode: ArmOpcode::LdrImm,
1469 operands: vec![vreg(0), vreg(2), MachineOperand::Imm(0)],
1470 def: Some(vid(0)),
1471 };
1472 let ldr2 = MachineInst {
1473 opcode: ArmOpcode::LdrImm,
1474 operands: vec![vreg(1), vreg(3), MachineOperand::Imm(8)],
1475 def: Some(vid(1)),
1476 };
1477 let mut mf = mf_with_classes(vec![ldr1, ldr2], &classes);
1478 ldp_stp_fusion(&mut mf);
1479 assert_eq!(mf.blocks[0].insts.len(), 2);
1480 assert_eq!(mf.blocks[0].insts[0].opcode, ArmOpcode::LdrImm);
1481 }
1482
1483 /// Non-adjacent offsets (gap of 16 instead of 8) — no fusion.
1484 #[test]
1485 fn ldp_no_fusion_non_adjacent() {
1486 let classes = vec![RegClass::Gp64; 3];
1487 let ldr1 = MachineInst {
1488 opcode: ArmOpcode::LdrImm,
1489 operands: vec![vreg(0), vreg(2), MachineOperand::Imm(0)],
1490 def: Some(vid(0)),
1491 };
1492 let ldr2 = MachineInst {
1493 opcode: ArmOpcode::LdrImm,
1494 operands: vec![vreg(1), vreg(2), MachineOperand::Imm(16)],
1495 def: Some(vid(1)),
1496 };
1497 let mut mf = mf_with_classes(vec![ldr1, ldr2], &classes);
1498 ldp_stp_fusion(&mut mf);
1499 assert_eq!(mf.blocks[0].insts.len(), 2);
1500 }
1501
1502 /// Mixed-width pair (i32 + i64) — no fusion since LDP needs matching widths.
1503 #[test]
1504 fn ldp_no_fusion_mixed_width() {
1505 let classes = vec![RegClass::Gp32, RegClass::Gp64, RegClass::Gp64];
1506 let ldr1 = MachineInst {
1507 opcode: ArmOpcode::LdrImm,
1508 operands: vec![vreg(0), vreg(2), MachineOperand::Imm(0)],
1509 def: Some(vid(0)),
1510 };
1511 let ldr2 = MachineInst {
1512 opcode: ArmOpcode::LdrImm,
1513 operands: vec![vreg(1), vreg(2), MachineOperand::Imm(4)],
1514 def: Some(vid(1)),
1515 };
1516 let mut mf = mf_with_classes(vec![ldr1, ldr2], &classes);
1517 ldp_stp_fusion(&mut mf);
1518 assert_eq!(mf.blocks[0].insts.len(), 2);
1519 }
1520
1521 /// LDR with dest == base — must not fuse (the second load would otherwise read
1522 /// the now-overwritten base).
1523 #[test]
1524 fn ldp_no_fusion_dest_equals_base() {
1525 let classes = vec![RegClass::Gp64; 2];
1526 // dest of first load is the base
1527 let ldr1 = MachineInst {
1528 opcode: ArmOpcode::LdrImm,
1529 operands: vec![vreg(1), vreg(1), MachineOperand::Imm(0)],
1530 def: Some(vid(1)),
1531 };
1532 let ldr2 = MachineInst {
1533 opcode: ArmOpcode::LdrImm,
1534 operands: vec![vreg(0), vreg(1), MachineOperand::Imm(8)],
1535 def: Some(vid(0)),
1536 };
1537 let mut mf = mf_with_classes(vec![ldr1, ldr2], &classes);
1538 ldp_stp_fusion(&mut mf);
1539 assert_eq!(mf.blocks[0].insts.len(), 2);
1540 }
1541
1542 /// StrImm gets folded too (mirror of the load path).
1543 #[test]
1544 fn scaled_addressing_int4_store() {
1545 let movz = MachineInst {
1546 opcode: ArmOpcode::Movz,
1547 operands: vec![vreg(0), MachineOperand::Imm(4), MachineOperand::Shift(0)],
1548 def: Some(vid(0)),
1549 };
1550 let mul = MachineInst {
1551 opcode: ArmOpcode::Mul,
1552 operands: vec![vreg(1), vreg(2), vreg(0)],
1553 def: Some(vid(1)),
1554 };
1555 let add = MachineInst {
1556 opcode: ArmOpcode::AddReg,
1557 operands: vec![vreg(3), vreg(4), vreg(1)],
1558 def: Some(vid(3)),
1559 };
1560 let str_inst = MachineInst {
1561 opcode: ArmOpcode::StrImm,
1562 operands: vec![vreg(5), vreg(3), MachineOperand::Imm(0)],
1563 def: None, // store has no def
1564 };
1565 let mut mf = mf_with_insts(vec![movz, mul, add, str_inst]);
1566 scaled_addressing_fusion(&mut mf);
1567 let block = &mf.blocks[0];
1568 assert_eq!(block.insts.len(), 1);
1569 assert_eq!(block.insts[0].opcode, ArmOpcode::StrReg);
1570 assert_eq!(block.insts[0].operands[3], MachineOperand::Imm(2)); // log2(4) = 2
1571 }
1572 }
1573