Rust · 56683 bytes Raw Blame History
1 //! Post-isel peephole optimizations.
2 //!
3 //! Runs on `MachineFunction` after instruction selection and before
4 //! register allocation. Virtual registers are still in use.
5 //!
6 //! ## FMADD / FMSUB fusion
7 //!
8 //! ARM64 has fused multiply-add instructions that are both faster and
9 //! more accurate (single rounding) than a separate fmul + fadd pair.
10 //!
11 //! Matched patterns and their replacement:
12 //!
13 //! | Pattern | Replacement | ARM64 opcode |
14 //! |---------------------------------------|-----------------------------|--------------|
15 //! | `fadd(fmul(a, b), c)` or commuted | `result = c + a*b` | FMADD |
16 //! | `fsub(c, fmul(a, b))` | `result = c - a*b` | FMSUB |
17 //! | `fsub(fmul(a, b), c)` | `result = a*b - c` | FNMSUB |
18 //!
19 //! ## MADD / MSUB fusion (integer)
20 //!
21 //! Same shape as the FP family but for integer ops. ARM64's `madd` /
22 //! `msub` are 3-source integer multiply-add/subtract that fuse `mul`
23 //! followed by `add`/`sub` in a single instruction. We never spelled
24 //! these from `isel.rs` directly because the IR exposes `IAdd(IMul, c)`
25 //! as two separate opcodes and isel emits them naively.
26 //!
27 //! Matched patterns:
28 //!
29 //! | Pattern | Replacement | ARM64 opcode |
30 //! |---------------------------------------|-----------------------------|--------------|
31 //! | `add(mul(a, b), c)` or commuted | `result = c + a*b` | MADD |
32 //! | `sub(c, mul(a, b))` | `result = c - a*b` | MSUB |
33 //!
34 //! Note: `sub(mul(a,b), c)` (i.e. `a*b - c`) has no single-instruction
35 //! ARM64 form for integer ops — `msub` computes `Xa - Xn*Xm`, not the
36 //! other order, and there is no integer analogue of `fnmsub`. We leave
37 //! that pattern as-is.
38 //!
39 //! ## Preconditions
40 //!
41 //! * The MUL/FMUL result is used exactly once (the subsequent ADD/SUB).
42 //! * Both instructions are in the same machine block.
43 //!
44 //! After fusion the multiply is removed and the add/sub is replaced
45 //! with the three-source instruction.
46
47 use super::mir::{ArmOpcode, MachineFunction, MachineInst, MachineOperand, RegClass, VRegId};
48 use std::collections::HashMap;
49
50 /// Run all peephole passes on a machine function. Iterates to fixpoint
51 /// so newly-exposed patterns (e.g., a strength-reduction that produces
52 /// an `add x, x, x` fed by a mul-add candidate) compose without
53 /// requiring callers to re-invoke us. Capped at a small number of
54 /// rounds — passes are monotone (only remove or simplify), so fixpoint
55 /// is bounded by instruction count and an 8-round ceiling is plenty in
56 /// practice while still surfacing pathological non-convergence as an
57 /// internal-error rather than a hang.
58 pub fn run_peephole(mf: &mut MachineFunction) {
59 const MAX_ROUNDS: u32 = 8;
60 for _ in 0..MAX_ROUNDS {
61 let before = total_inst_count(mf);
62 // Scaled-addressing fusion first so it can consume the
63 // Mul + AddReg pair before madd_fusion would collapse them
64 // into a Madd. A single load-with-shift beats a fused madd
65 // followed by a plain ldr.
66 scaled_addressing_fusion(mf);
67 fma_fusion(mf);
68 madd_fusion(mf);
69 ldp_stp_fusion(mf);
70 if total_inst_count(mf) == before {
71 return;
72 }
73 }
74 }
75
76 fn total_inst_count(mf: &MachineFunction) -> usize {
77 mf.blocks.iter().map(|b| b.insts.len()).sum()
78 }
79
80 /// FMADD/FMSUB/FNMSUB fusion.
81 fn fma_fusion(mf: &mut MachineFunction) {
82 for mb_idx in 0..mf.blocks.len() {
83 fma_fuse_block(mf, mb_idx);
84 }
85 }
86
87 fn fma_fuse_block(mf: &mut MachineFunction, mb_idx: usize) {
88 let block = &mf.blocks[mb_idx];
89
90 // Count uses of each defined VReg within this block.
91 // A fmul result is fusable only when its use count == 1 (the fadd/fsub).
92 let mut use_count: HashMap<VRegId, usize> = HashMap::new();
93 for inst in &block.insts {
94 for op in &inst.operands {
95 if let MachineOperand::VReg(v) = op {
96 *use_count.entry(*v).or_insert(0) += 1;
97 }
98 }
99 }
100 // Subtract the self-def (operands[0] is the dest, counted in use_count
101 // but it's a def not a use). For a 3-operand FmulS [dest, src0, src1]:
102 // dest appears once in operands but it's the definition, not a use.
103 // Re-compute: only operands[1..] are uses.
104 let mut use_count: HashMap<VRegId, usize> = HashMap::new();
105 for inst in &block.insts {
106 // Operands beyond index 0 are inputs (index 0 is the output dest).
107 for op in inst.operands.iter().skip(1) {
108 if let MachineOperand::VReg(v) = op {
109 *use_count.entry(*v).or_insert(0) += 1;
110 }
111 }
112 }
113
114 // Map: vreg defined by a fmul instruction → (block-instruction-index, precision)
115 #[derive(Clone, Copy)]
116 enum Prec {
117 S,
118 D,
119 }
120 let mut fmul_defs: HashMap<VRegId, (usize, Prec)> = HashMap::new();
121
122 for (i, inst) in block.insts.iter().enumerate() {
123 match inst.opcode {
124 ArmOpcode::FmulS => {
125 if let Some(d) = inst.def {
126 fmul_defs.insert(d, (i, Prec::S));
127 }
128 }
129 ArmOpcode::FmulD => {
130 if let Some(d) = inst.def {
131 fmul_defs.insert(d, (i, Prec::D));
132 }
133 }
134 _ => {}
135 }
136 }
137
138 // Collect fusions to apply (instruction index → replacement info).
139 // Each entry: (fadd/fsub_idx, fmul_idx, new_opcode, new_operands)
140 struct FusionPlan {
141 add_idx: usize,
142 mul_idx: usize,
143 new_opcode: ArmOpcode,
144 // operands for the fused inst: [dest, Fn, Fm, Fa]
145 n: VRegId, // multiply lhs
146 m: VRegId, // multiply rhs
147 a: VRegId, // addend / subtrahend (accumulate register)
148 }
149 let mut plans: Vec<FusionPlan> = Vec::new();
150
151 let block = &mf.blocks[mb_idx];
152 for (i, inst) in block.insts.iter().enumerate() {
153 let (is_add, is_sub, prec_s) = match inst.opcode {
154 ArmOpcode::FaddS => (true, false, true),
155 ArmOpcode::FaddD => (true, false, false),
156 ArmOpcode::FsubS => (false, true, true),
157 ArmOpcode::FsubD => (false, true, false),
158 _ => continue,
159 };
160 if inst.operands.len() < 3 {
161 continue;
162 }
163
164 // operands: [dest, src0, src1]
165 let src0 = match &inst.operands[1] {
166 MachineOperand::VReg(v) => *v,
167 _ => continue,
168 };
169 let src1 = match &inst.operands[2] {
170 MachineOperand::VReg(v) => *v,
171 _ => continue,
172 };
173
174 // Check if src0 is a single-use fmul result.
175 let try_fuse_src0 = fmul_defs
176 .get(&src0)
177 .filter(|_| use_count.get(&src0).copied().unwrap_or(0) == 1);
178 // Check if src1 is a single-use fmul result.
179 let try_fuse_src1 = if is_add {
180 fmul_defs
181 .get(&src1)
182 .filter(|_| use_count.get(&src1).copied().unwrap_or(0) == 1)
183 } else {
184 None // fsub(c, fmul(a,b)): src0=c, src1=fmul → handled separately
185 };
186
187 if is_add {
188 // fadd(fmul(a,b), c) → FMADD(a, b, c)
189 if let Some(&(mul_idx, _)) = try_fuse_src0 {
190 let mul_inst = &block.insts[mul_idx];
191 let n = match &mul_inst.operands[1] {
192 MachineOperand::VReg(v) => *v,
193 _ => continue,
194 };
195 let m = match &mul_inst.operands[2] {
196 MachineOperand::VReg(v) => *v,
197 _ => continue,
198 };
199 let opcode = if prec_s {
200 ArmOpcode::FmaddS
201 } else {
202 ArmOpcode::FmaddD
203 };
204 plans.push(FusionPlan {
205 add_idx: i,
206 mul_idx,
207 new_opcode: opcode,
208 n,
209 m,
210 a: src1,
211 });
212 } else if let Some(&(mul_idx, _)) = try_fuse_src1 {
213 // fadd(c, fmul(a,b)) → FMADD(a, b, c) [commuted]
214 let mul_inst = &block.insts[mul_idx];
215 let n = match &mul_inst.operands[1] {
216 MachineOperand::VReg(v) => *v,
217 _ => continue,
218 };
219 let m = match &mul_inst.operands[2] {
220 MachineOperand::VReg(v) => *v,
221 _ => continue,
222 };
223 let opcode = if prec_s {
224 ArmOpcode::FmaddS
225 } else {
226 ArmOpcode::FmaddD
227 };
228 plans.push(FusionPlan {
229 add_idx: i,
230 mul_idx,
231 new_opcode: opcode,
232 n,
233 m,
234 a: src0,
235 });
236 }
237 } else if is_sub {
238 // fsub(fmul(a,b), c) → FNMSUB(a, b, c) [result = a*b - c]
239 if let Some(&(mul_idx, _)) = try_fuse_src0 {
240 let mul_inst = &block.insts[mul_idx];
241 let n = match &mul_inst.operands[1] {
242 MachineOperand::VReg(v) => *v,
243 _ => continue,
244 };
245 let m = match &mul_inst.operands[2] {
246 MachineOperand::VReg(v) => *v,
247 _ => continue,
248 };
249 let opcode = if prec_s {
250 ArmOpcode::FnmsubS
251 } else {
252 ArmOpcode::FnmsubD
253 };
254 plans.push(FusionPlan {
255 add_idx: i,
256 mul_idx,
257 new_opcode: opcode,
258 n,
259 m,
260 a: src1,
261 });
262 }
263 // fsub(c, fmul(a,b)) → FMSUB(a, b, c) [result = c - a*b]
264 // src0=c, src1=fmul_result
265 let try_fuse_sub1 = fmul_defs
266 .get(&src1)
267 .filter(|_| use_count.get(&src1).copied().unwrap_or(0) == 1);
268 if let Some(&(mul_idx, _)) = try_fuse_sub1 {
269 let mul_inst = &block.insts[mul_idx];
270 let n = match &mul_inst.operands[1] {
271 MachineOperand::VReg(v) => *v,
272 _ => continue,
273 };
274 let m = match &mul_inst.operands[2] {
275 MachineOperand::VReg(v) => *v,
276 _ => continue,
277 };
278 let opcode = if prec_s {
279 ArmOpcode::FmsubS
280 } else {
281 ArmOpcode::FmsubD
282 };
283 plans.push(FusionPlan {
284 add_idx: i,
285 mul_idx,
286 new_opcode: opcode,
287 n,
288 m,
289 a: src0,
290 });
291 }
292 }
293 }
294
295 if plans.is_empty() {
296 return;
297 }
298
299 // Apply plans in reverse order of add_idx to keep indices stable.
300 plans.sort_by_key(|plan| std::cmp::Reverse(plan.add_idx));
301
302 // Collect mul_idxs to remove.
303 let mut remove_idxs: std::collections::HashSet<usize> =
304 plans.iter().map(|p| p.mul_idx).collect();
305
306 // Rewrite the block.
307 let block = &mut mf.blocks[mb_idx];
308 for plan in &plans {
309 let dest = block.insts[plan.add_idx].def;
310 let dest_op = block.insts[plan.add_idx].operands[0].clone();
311 block.insts[plan.add_idx] = MachineInst {
312 opcode: plan.new_opcode,
313 operands: vec![
314 dest_op,
315 MachineOperand::VReg(plan.n),
316 MachineOperand::VReg(plan.m),
317 MachineOperand::VReg(plan.a),
318 ],
319 def: dest,
320 };
321 }
322 let mut idx = 0usize;
323 block.insts.retain(|_| {
324 let keep = !remove_idxs.remove(&idx);
325 idx += 1;
326 keep
327 });
328 }
329
330 /// Integer MADD/MSUB fusion. Mirrors `fma_fusion` for `Mul + AddReg`
331 /// and `Mul + SubReg`. Matched patterns:
332 ///
333 /// * `AddReg(Mul(a, b), c)` → `Madd(a, b, c)`
334 /// * `AddReg(c, Mul(a, b))` → `Madd(a, b, c)` (commuted)
335 /// * `SubReg(c, Mul(a, b))` → `Msub(a, b, c)` (only this order — there
336 /// is no integer FNMSUB analogue)
337 ///
338 /// Preconditions match the FP version: same block, mul result has
339 /// exactly one use (the add/sub).
340 fn madd_fusion(mf: &mut MachineFunction) {
341 for mb_idx in 0..mf.blocks.len() {
342 madd_fuse_block(mf, mb_idx);
343 }
344 }
345
346 fn madd_fuse_block(mf: &mut MachineFunction, mb_idx: usize) {
347 let block = &mf.blocks[mb_idx];
348
349 // Per-block use counts of values, *as inputs only*. operands[0] is
350 // the def for AddReg/SubReg/Mul, so we skip index 0 to count true
351 // uses.
352 let mut use_count: HashMap<VRegId, usize> = HashMap::new();
353 for inst in &block.insts {
354 for op in inst.operands.iter().skip(1) {
355 if let MachineOperand::VReg(v) = op {
356 *use_count.entry(*v).or_insert(0) += 1;
357 }
358 }
359 }
360
361 // VReg defined by an integer Mul → block index of that mul.
362 let mut mul_defs: HashMap<VRegId, usize> = HashMap::new();
363 for (i, inst) in block.insts.iter().enumerate() {
364 if inst.opcode == ArmOpcode::Mul {
365 if let Some(d) = inst.def {
366 mul_defs.insert(d, i);
367 }
368 }
369 }
370
371 struct FusionPlan {
372 add_idx: usize,
373 mul_idx: usize,
374 new_opcode: ArmOpcode,
375 // [dest, n, m, a]
376 n: VRegId,
377 m: VRegId,
378 a: VRegId,
379 }
380 let mut plans: Vec<FusionPlan> = Vec::new();
381
382 for (i, inst) in block.insts.iter().enumerate() {
383 let (is_add, is_sub) = match inst.opcode {
384 ArmOpcode::AddReg => (true, false),
385 ArmOpcode::SubReg => (false, true),
386 _ => continue,
387 };
388 if inst.operands.len() < 3 {
389 continue;
390 }
391 let src0 = match &inst.operands[1] {
392 MachineOperand::VReg(v) => *v,
393 _ => continue,
394 };
395 let src1 = match &inst.operands[2] {
396 MachineOperand::VReg(v) => *v,
397 _ => continue,
398 };
399
400 let single_use = |v: VRegId| use_count.get(&v).copied().unwrap_or(0) == 1;
401
402 if is_add {
403 // AddReg(Mul(a, b), c): src0 is the mul result.
404 if let Some(&mul_idx) = mul_defs.get(&src0) {
405 if !single_use(src0) {
406 // mul result is read by something else too — can't fuse.
407 } else {
408 let mul_inst = &block.insts[mul_idx];
409 let n = match &mul_inst.operands[1] {
410 MachineOperand::VReg(v) => *v,
411 _ => continue,
412 };
413 let m = match &mul_inst.operands[2] {
414 MachineOperand::VReg(v) => *v,
415 _ => continue,
416 };
417 plans.push(FusionPlan {
418 add_idx: i,
419 mul_idx,
420 new_opcode: ArmOpcode::Madd,
421 n,
422 m,
423 a: src1,
424 });
425 continue;
426 }
427 }
428 // AddReg(c, Mul(a, b)) [commuted].
429 if let Some(&mul_idx) = mul_defs.get(&src1) {
430 if !single_use(src1) {
431 continue;
432 }
433 let mul_inst = &block.insts[mul_idx];
434 let n = match &mul_inst.operands[1] {
435 MachineOperand::VReg(v) => *v,
436 _ => continue,
437 };
438 let m = match &mul_inst.operands[2] {
439 MachineOperand::VReg(v) => *v,
440 _ => continue,
441 };
442 plans.push(FusionPlan {
443 add_idx: i,
444 mul_idx,
445 new_opcode: ArmOpcode::Madd,
446 n,
447 m,
448 a: src0,
449 });
450 }
451 } else if is_sub {
452 // SubReg(c, Mul(a, b)) → Msub. The other order
453 // (`a*b - c`) has no integer fused form on ARM64.
454 if let Some(&mul_idx) = mul_defs.get(&src1) {
455 if !single_use(src1) {
456 continue;
457 }
458 let mul_inst = &block.insts[mul_idx];
459 let n = match &mul_inst.operands[1] {
460 MachineOperand::VReg(v) => *v,
461 _ => continue,
462 };
463 let m = match &mul_inst.operands[2] {
464 MachineOperand::VReg(v) => *v,
465 _ => continue,
466 };
467 plans.push(FusionPlan {
468 add_idx: i,
469 mul_idx,
470 new_opcode: ArmOpcode::Msub,
471 n,
472 m,
473 a: src0,
474 });
475 }
476 }
477 }
478
479 if plans.is_empty() {
480 return;
481 }
482
483 // Apply plans in reverse add_idx order to keep indices stable.
484 plans.sort_by_key(|plan| std::cmp::Reverse(plan.add_idx));
485 let mut remove_idxs: std::collections::HashSet<usize> =
486 plans.iter().map(|p| p.mul_idx).collect();
487
488 let block = &mut mf.blocks[mb_idx];
489 for plan in &plans {
490 let dest = block.insts[plan.add_idx].def;
491 let dest_op = block.insts[plan.add_idx].operands[0].clone();
492 block.insts[plan.add_idx] = MachineInst {
493 opcode: plan.new_opcode,
494 operands: vec![
495 dest_op,
496 MachineOperand::VReg(plan.n),
497 MachineOperand::VReg(plan.m),
498 MachineOperand::VReg(plan.a),
499 ],
500 def: dest,
501 };
502 }
503 let mut idx = 0usize;
504 block.insts.retain(|_| {
505 let keep = !remove_idxs.remove(&idx);
506 idx += 1;
507 keep
508 });
509 }
510
511 /// ARM64 scaled-register addressing fusion.
512 ///
513 /// Rewrites the 4-instruction sequence emitted by GEP+Load/Store
514 /// lowering when `elem_size` is a power of two ≤ 8:
515 ///
516 /// ```text
517 /// movz tmp, #elem_size, 0 ; (or const-materializing equivalent)
518 /// mul scaled, idx, tmp
519 /// add addr, base, scaled
520 /// ldr dest, [addr] ; LdrImm/LdrFpImm/StrImm/StrFpImm, offset 0
521 /// ```
522 ///
523 /// into a single instruction:
524 ///
525 /// ```text
526 /// ldr dest, [base, idx, lsl #shift]
527 /// ```
528 ///
529 /// Preconditions checked per fusion site:
530 /// * `tmp` has exactly one use (the mul) and is defined by a single
531 /// `Movz` whose chunk value ∈ {1, 2, 4, 8} with shift 0.
532 /// * `scaled` (the mul result) has exactly one use (the add).
533 /// * `addr` (the add result) has exactly one use (the load/store).
534 /// * The load/store's offset operand is `Imm(0)`.
535 /// * All four instructions live in the same machine block.
536 ///
537 /// Multi-use intermediates (e.g. the same GEP feeds two loads) keep
538 /// the unfused form — duplicating the index calculation across each
539 /// consumer would increase code size, not decrease it.
540 fn scaled_addressing_fusion(mf: &mut MachineFunction) {
541 // Compute use counts function-wide. The fold removes 3
542 // instructions (Movz, Mul, AddReg) and rewrites a 4th. If any of
543 // those defs is consumed in *another* block (cross-block live
544 // value), we must not fold — the consumer would lose its
545 // operand. A per-block count understates true uses when the
546 // value is live-out, which is what the cross-block path needs to
547 // reject. Counting once per function is cheap and correct.
548 let use_count = compute_global_use_counts(mf);
549 for mb_idx in 0..mf.blocks.len() {
550 scaled_addressing_fuse_block(mf, mb_idx, &use_count);
551 }
552 }
553
554 fn compute_global_use_counts(mf: &MachineFunction) -> HashMap<VRegId, usize> {
555 let mut use_count: HashMap<VRegId, usize> = HashMap::new();
556 for block in &mf.blocks {
557 for inst in &block.insts {
558 // operand[0] is the def for instructions that have one.
559 let skip_first = inst.def.is_some();
560 for (i, op) in inst.operands.iter().enumerate() {
561 if skip_first && i == 0 {
562 continue;
563 }
564 if let MachineOperand::VReg(v) = op {
565 *use_count.entry(*v).or_insert(0) += 1;
566 }
567 }
568 }
569 }
570 use_count
571 }
572
fn scaled_addressing_fuse_block(
    mf: &mut MachineFunction,
    mb_idx: usize,
    use_count: &HashMap<VRegId, usize>,
) {
    let block = &mf.blocks[mb_idx];

    // Map vreg → (block-local index of defining instruction, opcode).
    // The pattern is matched by walking these def links backwards from
    // each load/store: base ← AddReg ← Mul ← Movz.
    let mut defs: HashMap<VRegId, (usize, ArmOpcode)> = HashMap::new();
    for (i, inst) in block.insts.iter().enumerate() {
        if let Some(d) = inst.def {
            defs.insert(d, (i, inst.opcode));
        }
    }

    // One planned rewrite per matched load/store site: the load/store
    // is rewritten in place to the register-offset form and the three
    // feeder instructions (AddReg, Mul, Movz) are removed.
    struct FusionPlan {
        ldst_idx: usize,
        add_idx: usize,
        mul_idx: usize,
        movz_idx: usize,
        new_opcode: ArmOpcode,
        // [dest, base, idx, Imm(shift)]
        dest_op: MachineOperand,
        dest_def: Option<VRegId>,
        base: VRegId,
        idx: VRegId,
        shift: i64,
    }
    let mut plans: Vec<FusionPlan> = Vec::new();

    for (i, inst) in block.insts.iter().enumerate() {
        // Only the four imm-offset load/store forms have a
        // register-offset counterpart to rewrite into.
        let new_opcode = match inst.opcode {
            ArmOpcode::LdrImm => ArmOpcode::LdrReg,
            ArmOpcode::LdrFpImm => ArmOpcode::LdrFpReg,
            ArmOpcode::StrImm => ArmOpcode::StrReg,
            ArmOpcode::StrFpImm => ArmOpcode::StrFpReg,
            _ => continue,
        };
        // Format: [dest_or_src, base, offset]. Offset must be Imm(0) —
        // a nonzero immediate cannot be folded into the reg-offset form.
        if inst.operands.len() < 3 {
            continue;
        }
        let offset_zero = matches!(&inst.operands[2], MachineOperand::Imm(0));
        if !offset_zero {
            continue;
        }
        let base_addr = match &inst.operands[1] {
            MachineOperand::VReg(v) => *v,
            _ => continue,
        };

        // The base address must come from a single-use AddReg.
        let (add_idx, add_op) = match defs.get(&base_addr) {
            Some(&(idx, op)) => (idx, op),
            None => continue,
        };
        if add_op != ArmOpcode::AddReg {
            continue;
        }
        if use_count.get(&base_addr).copied().unwrap_or(0) != 1 {
            continue;
        }

        let add_inst = &block.insts[add_idx];
        if add_inst.operands.len() < 3 {
            continue;
        }
        let add_lhs = match &add_inst.operands[1] {
            MachineOperand::VReg(v) => *v,
            _ => continue,
        };
        let add_rhs = match &add_inst.operands[2] {
            MachineOperand::VReg(v) => *v,
            _ => continue,
        };

        // The scaled-index operand: defined by a single-use Mul. Try
        // either commuted order — `add base, scaled` or `add scaled,
        // base` (isel emits the former, but be defensive).
        // On success returns (mul_idx, base, idx, lsl_shift, movz_idx).
        let try_pair = |scaled: VRegId, base: VRegId| -> Option<(usize, VRegId, VRegId, i64, usize)> {
            let &(mul_idx, mul_op) = defs.get(&scaled)?;
            if mul_op != ArmOpcode::Mul {
                return None;
            }
            if use_count.get(&scaled).copied().unwrap_or(0) != 1 {
                return None;
            }
            let mul_inst = &block.insts[mul_idx];
            if mul_inst.operands.len() < 3 {
                return None;
            }
            let m_lhs = match &mul_inst.operands[1] {
                MachineOperand::VReg(v) => *v,
                _ => return None,
            };
            let m_rhs = match &mul_inst.operands[2] {
                MachineOperand::VReg(v) => *v,
                _ => return None,
            };
            // The constant operand of the Mul — must be defined by a
            // single-use Movz with chunk ∈ {1,2,4,8} at shift 0.
            // Returns (index vreg, lsl shift amount, movz index).
            let try_const = |const_v: VRegId, idx_v: VRegId| -> Option<(VRegId, i64, usize)> {
                let &(movz_idx, movz_op) = defs.get(&const_v)?;
                if movz_op != ArmOpcode::Movz {
                    return None;
                }
                if use_count.get(&const_v).copied().unwrap_or(0) != 1 {
                    return None;
                }
                let movz_inst = &block.insts[movz_idx];
                // Movz operands: [dest, Imm(chunk), Shift(amount)]. We
                // require shift==0 and chunk ∈ {1,2,4,8}.
                if movz_inst.operands.len() < 3 {
                    return None;
                }
                let chunk = match &movz_inst.operands[1] {
                    MachineOperand::Imm(v) => *v,
                    _ => return None,
                };
                let shift_amount = match &movz_inst.operands[2] {
                    MachineOperand::Shift(s) => *s,
                    _ => return None,
                };
                if shift_amount != 0 {
                    return None;
                }
                // elem_size → LSL amount for the scaled addressing mode.
                let lsl_shift: i64 = match chunk {
                    1 => 0,
                    2 => 1,
                    4 => 2,
                    8 => 3,
                    _ => return None,
                };
                Some((idx_v, lsl_shift, movz_idx))
            };
            // The Mul itself is commutative: try the constant on
            // either side.
            try_const(m_rhs, m_lhs)
                .or_else(|| try_const(m_lhs, m_rhs))
                .map(|(idx, shift, movz_idx)| (mul_idx, base, idx, shift, movz_idx))
        };

        let resolved = try_pair(add_rhs, add_lhs).or_else(|| try_pair(add_lhs, add_rhs));
        let Some((mul_idx, base_v, idx_v, shift, movz_idx)) = resolved else {
            continue;
        };

        plans.push(FusionPlan {
            ldst_idx: i,
            add_idx,
            mul_idx,
            movz_idx,
            new_opcode,
            dest_op: inst.operands[0].clone(),
            dest_def: inst.def,
            base: base_v,
            idx: idx_v,
            shift,
        });
    }

    if plans.is_empty() {
        return;
    }

    // Apply: rewrite the load/store in place; mark the mul/add/movz
    // for removal. Iterate in reverse ldst_idx order so removals
    // before the rewrite don't shift indices.
    plans.sort_by_key(|plan| std::cmp::Reverse(plan.ldst_idx));
    let mut remove_idxs: std::collections::HashSet<usize> = plans
        .iter()
        .flat_map(|p| [p.add_idx, p.mul_idx, p.movz_idx])
        .collect();

    let block = &mut mf.blocks[mb_idx];
    for plan in &plans {
        // New shape: [dest, base, idx, Imm(lsl shift)].
        block.insts[plan.ldst_idx] = MachineInst {
            opcode: plan.new_opcode,
            operands: vec![
                plan.dest_op.clone(),
                MachineOperand::VReg(plan.base),
                MachineOperand::VReg(plan.idx),
                MachineOperand::Imm(plan.shift),
            ],
            def: plan.dest_def,
        };
    }
    // Single sweep: drop every marked index, keep everything else.
    let mut idx = 0usize;
    block.insts.retain(|_| {
        let keep = !remove_idxs.remove(&idx);
        idx += 1;
        keep
    });
}
765
766 /// LDP/STP pair fusion.
767 ///
768 /// Walks each basic block and fuses **immediately-adjacent** ldr/str
769 /// pairs that share a base register and have offsets differing by
770 /// exactly the access width. The two get rewritten as a single
771 /// `LdpOffset` / `StpOffset` whose imm targets the LOWER address
772 /// (Rt1 sits at imm, Rt2 at imm + width per ARM ARM C6.2.156).
773 ///
774 /// Restrictions kept narrow on this first cut:
775 /// * Only adjacent instructions in the block (no window scan; no
776 /// intervening writes to base or memory). Broadening to a small
777 /// window with proper aliasing analysis is a follow-up if the
778 /// adjacent-only form proves too restrictive.
779 /// * Same opcode and same width on both ops. We support 4-byte
780 /// (`LdrImm`/`StrImm` Gp32, `LdrFpImm`/`StrFpImm` Fp32) and 8-byte
781 /// (Gp64, Fp64) variants. The `LdrshImm`/`LdrsbImm`/`StrhImm`/
782 /// `StrbImm` half/byte forms have no LDP/STP analogue and are
783 /// skipped.
784 /// * Both offsets must be plain `Imm(_)` (not `FrameSlot` — the
785 /// prologue STP/LDP already covers callee-save spills, and frame
786 /// slots get materialized to immediates downstream anyway).
787 /// * For loads, the two destination registers must differ. ARM ARM
788 /// declares `LDP Rt, Rt, ...` UNPREDICTABLE.
789 /// * For loads, the first load's destination must not equal the base
790 /// register. The fused LDP would still read the unmodified base,
791 /// which silently breaks the original semantics where the second
792 /// ldr saw the post-write base.
793 /// * Combined offset must lie within signed 7-bit × width range.
794 fn ldp_stp_fusion(mf: &mut MachineFunction) {
795 for mb_idx in 0..mf.blocks.len() {
796 ldp_stp_fuse_block(mf, mb_idx);
797 }
798 }
799
800 fn vreg_class(mf: &MachineFunction, v: VRegId) -> Option<RegClass> {
801 mf.vregs.get(v.0 as usize).map(|r| r.class)
802 }
803
804 fn class_byte_width(c: RegClass) -> u32 {
805 match c {
806 RegClass::Gp32 | RegClass::Fp32 => 4,
807 RegClass::Gp64 | RegClass::Fp64 => 8,
808 RegClass::V128 => 16,
809 }
810 }
811
812 /// Width inferred from operand[0] of a load/store instruction. For
813 /// loads the operand is the dest vreg; for stores it is the source
814 /// vreg — both carry the access width via their RegClass.
815 fn ldst_access_width(mf: &MachineFunction, inst: &MachineInst) -> Option<u32> {
816 let v = match inst.operands.first()? {
817 MachineOperand::VReg(v) => *v,
818 _ => return None,
819 };
820 let class = vreg_class(mf, v)?;
821 Some(class_byte_width(class))
822 }
823
/// True when `off` is encodable as an LDP/STP scaled offset for the
/// given access width: a multiple of the width whose scaled value fits
/// the signed 7-bit immediate field, i.e. [-64, 63] width-units.
fn ldp_imm_in_range(off: i64, width: u32) -> bool {
    let stride = width as i64;
    match off % stride {
        0 => (-64..=63).contains(&(off / stride)),
        _ => false, // misaligned — not representable at all
    }
}
833
834 fn ldp_stp_fuse_block(mf: &mut MachineFunction, mb_idx: usize) {
835 let block_len = mf.blocks[mb_idx].insts.len();
836 if block_len < 2 {
837 return;
838 }
839
840 enum LdSt {
841 LdrInt,
842 LdrFp,
843 StrInt,
844 StrFp,
845 }
846 fn classify(op: ArmOpcode) -> Option<LdSt> {
847 Some(match op {
848 ArmOpcode::LdrImm => LdSt::LdrInt,
849 ArmOpcode::LdrFpImm => LdSt::LdrFp,
850 ArmOpcode::StrImm => LdSt::StrInt,
851 ArmOpcode::StrFpImm => LdSt::StrFp,
852 _ => return None,
853 })
854 }
855 fn paired_opcode(kind: &LdSt) -> ArmOpcode {
856 // STP and LDP are width-agnostic at the opcode level; the
857 // emit layer picks `ldp x/w/d/s` from operand[0]'s register
858 // class. (afs-as does the same.)
859 match kind {
860 LdSt::LdrInt | LdSt::LdrFp => ArmOpcode::LdpOffset,
861 LdSt::StrInt | LdSt::StrFp => ArmOpcode::StpOffset,
862 }
863 }
864
865 // Plan: collect (lower_idx, upper_idx, replacement) and, after the
866 // walk, rewrite lower_idx in place and mark upper_idx for removal.
867 struct PairPlan {
868 lower_idx: usize,
869 upper_idx: usize,
870 new_inst: MachineInst,
871 }
872 let mut plans: Vec<PairPlan> = Vec::new();
873 let mut consumed: std::collections::HashSet<usize> = std::collections::HashSet::new();
874
875 let mut i = 0;
876 while i + 1 < block_len {
877 if consumed.contains(&i) {
878 i += 1;
879 continue;
880 }
881 let a = &mf.blocks[mb_idx].insts[i];
882 let b = &mf.blocks[mb_idx].insts[i + 1];
883 let (Some(ka), Some(kb)) = (classify(a.opcode), classify(b.opcode)) else {
884 i += 1;
885 continue;
886 };
887 // Must be the same opcode (loads can fuse only with loads,
888 // stores with stores, integer with integer, float with float).
889 if a.opcode != b.opcode {
890 i += 1;
891 continue;
892 }
893
894 if a.operands.len() < 3 || b.operands.len() < 3 {
895 i += 1;
896 continue;
897 }
898 let (av, bv) = (&a.operands[0], &b.operands[0]);
899 let (a_base, b_base) = (&a.operands[1], &b.operands[1]);
900 let (a_off_op, b_off_op) = (&a.operands[2], &b.operands[2]);
901 let (Some(a_off), Some(b_off)) = (
902 match a_off_op {
903 MachineOperand::Imm(v) => Some(*v),
904 _ => None,
905 },
906 match b_off_op {
907 MachineOperand::Imm(v) => Some(*v),
908 _ => None,
909 },
910 ) else {
911 i += 1;
912 continue;
913 };
914 if a_base != b_base {
915 i += 1;
916 continue;
917 }
918
919 let Some(width) = ldst_access_width(mf, a) else {
920 i += 1;
921 continue;
922 };
923 // Both ops must access the same width — the operand[0] vregs
924 // share a register class for legal LDP/STP.
925 let b_width = ldst_access_width(mf, b);
926 if Some(width) != b_width {
927 i += 1;
928 continue;
929 }
930 // Adjacent offsets: the higher one is exactly width above the lower.
931 let (lower_off, lower_idx_in_pair, upper_op_value) = if b_off == a_off + width as i64 {
932 (a_off, 0usize, bv.clone())
933 } else if a_off == b_off + width as i64 {
934 (b_off, 1usize, av.clone())
935 } else {
936 i += 1;
937 continue;
938 };
939 let lower_op_value = if lower_idx_in_pair == 0 { av.clone() } else { bv.clone() };
940
941 if !ldp_imm_in_range(lower_off, width) {
942 i += 1;
943 continue;
944 }
945
946 // For loads: forbid same-dest UNPREDICTABLE form, and forbid
947 // dest1 == base which would silently change the second load's
948 // address.
949 let (is_load, base_v) = match (&ka, a_base) {
950 (LdSt::LdrInt | LdSt::LdrFp, MachineOperand::VReg(v)) => (true, Some(*v)),
951 _ => (false, None),
952 };
953 if is_load {
954 let (MachineOperand::VReg(va), MachineOperand::VReg(vb)) = (av, bv) else {
955 i += 1;
956 continue;
957 };
958 if va == vb {
959 i += 1;
960 continue;
961 }
962 if let Some(b_v) = base_v {
963 // The first load (program order) is at index i. Its
964 // dest must not be the base — otherwise the second
965 // load (i+1) would be reading a different base.
966 let first_dest = match &mf.blocks[mb_idx].insts[i].operands[0] {
967 MachineOperand::VReg(v) => *v,
968 _ => {
969 i += 1;
970 continue;
971 }
972 };
973 if first_dest == b_v {
974 i += 1;
975 continue;
976 }
977 }
978 }
979 let _ = kb; // suppress "unused" — symmetry only.
980
981 // Build the LdpOffset / StpOffset. Operands: [r1, r2, base, imm].
982 let new_opcode = paired_opcode(&ka);
983 let new_def = if is_load {
984 // LDP defines two regs but our MachineInst.def is single-
985 // value. Track the higher of the two so reg-allocator
986 // liveness sees at least one def; the actual two-write
987 // semantics live in the operand list. (Existing prologue
988 // LdpOffset uses this same shape.)
989 match &lower_op_value {
990 MachineOperand::VReg(_) => Some(match upper_op_value {
991 MachineOperand::VReg(v) => v,
992 _ => {
993 i += 1;
994 continue;
995 }
996 }),
997 _ => None,
998 }
999 } else {
1000 None
1001 };
1002 let new_inst = MachineInst {
1003 opcode: new_opcode,
1004 operands: vec![
1005 lower_op_value,
1006 upper_op_value,
1007 a_base.clone(),
1008 MachineOperand::Imm(lower_off),
1009 ],
1010 def: new_def,
1011 };
1012 plans.push(PairPlan {
1013 lower_idx: i,
1014 upper_idx: i + 1,
1015 new_inst,
1016 });
1017 consumed.insert(i);
1018 consumed.insert(i + 1);
1019 i += 2;
1020 }
1021
1022 if plans.is_empty() {
1023 return;
1024 }
1025
1026 // Apply rewrites. Order doesn't matter because plans don't
1027 // overlap (consumed set ensures each index is in at most one).
1028 let block = &mut mf.blocks[mb_idx];
1029 let mut remove_idxs: std::collections::HashSet<usize> = plans.iter().map(|p| p.upper_idx).collect();
1030 for plan in &plans {
1031 block.insts[plan.lower_idx] = plan.new_inst.clone();
1032 }
1033 let mut idx = 0usize;
1034 block.insts.retain(|_| {
1035 let keep = !remove_idxs.remove(&idx);
1036 idx += 1;
1037 keep
1038 });
1039 }
1040
1041 #[cfg(test)]
1042 mod tests {
1043 use super::*;
1044 use crate::codegen::mir::{
1045 ArmOpcode, MBlockId, MachineBlock, MachineFunction, MachineInst, MachineOperand, RegClass,
1046 };
1047
1048 fn mf_with_insts(insts: Vec<MachineInst>) -> MachineFunction {
1049 let mut mf = MachineFunction::new("test".into());
1050 // Allocate enough vregs.
1051 for _ in 0..10 {
1052 mf.new_vreg(RegClass::Fp32);
1053 }
1054 let bid = MBlockId(0);
1055 mf.blocks = vec![MachineBlock {
1056 id: bid,
1057 label: "entry".into(),
1058 insts,
1059 }];
1060 mf
1061 }
1062
1063 fn vreg(v: u32) -> MachineOperand {
1064 MachineOperand::VReg(VRegId(v))
1065 }
1066 fn vid(v: u32) -> VRegId {
1067 VRegId(v)
1068 }
1069
1070 /// fadd(fmul(v1, v2), v3) → fmadd(v1, v2, v3)
1071 #[test]
1072 fn fmadd_f32() {
1073 let fmul = MachineInst {
1074 opcode: ArmOpcode::FmulS,
1075 operands: vec![vreg(0), vreg(1), vreg(2)],
1076 def: Some(vid(0)),
1077 };
1078 let fadd = MachineInst {
1079 opcode: ArmOpcode::FaddS,
1080 operands: vec![vreg(4), vreg(0), vreg(3)], // src0 = fmul result
1081 def: Some(vid(4)),
1082 };
1083 let mut mf = mf_with_insts(vec![fmul, fadd]);
1084 fma_fusion(&mut mf);
1085 let block = &mf.blocks[0];
1086 assert_eq!(block.insts.len(), 1, "fmul should be removed");
1087 assert_eq!(block.insts[0].opcode, ArmOpcode::FmaddS);
1088 // operands: [dest, n, m, a] = [v4, v1, v2, v3]
1089 assert_eq!(block.insts[0].operands[1], vreg(1));
1090 assert_eq!(block.insts[0].operands[2], vreg(2));
1091 assert_eq!(block.insts[0].operands[3], vreg(3));
1092 }
1093
1094 /// fadd(v3, fmul(v1, v2)) → fmadd(v1, v2, v3) [commuted]
1095 #[test]
1096 fn fmadd_commuted() {
1097 let fmul = MachineInst {
1098 opcode: ArmOpcode::FmulD,
1099 operands: vec![vreg(0), vreg(1), vreg(2)],
1100 def: Some(vid(0)),
1101 };
1102 let fadd = MachineInst {
1103 opcode: ArmOpcode::FaddD,
1104 operands: vec![vreg(4), vreg(3), vreg(0)], // src1 = fmul result
1105 def: Some(vid(4)),
1106 };
1107 let mut mf = mf_with_insts(vec![fmul, fadd]);
1108 fma_fusion(&mut mf);
1109 let block = &mf.blocks[0];
1110 assert_eq!(block.insts.len(), 1);
1111 assert_eq!(block.insts[0].opcode, ArmOpcode::FmaddD);
1112 assert_eq!(block.insts[0].operands[3], vreg(3)); // accumulator
1113 }
1114
1115 /// fsub(c, fmul(a,b)) → fmsub(a, b, c) [result = c - a*b]
1116 #[test]
1117 fn fmsub_f32() {
1118 let fmul = MachineInst {
1119 opcode: ArmOpcode::FmulS,
1120 operands: vec![vreg(0), vreg(1), vreg(2)],
1121 def: Some(vid(0)),
1122 };
1123 let fsub = MachineInst {
1124 opcode: ArmOpcode::FsubS,
1125 operands: vec![vreg(4), vreg(3), vreg(0)], // src1 = fmul result
1126 def: Some(vid(4)),
1127 };
1128 let mut mf = mf_with_insts(vec![fmul, fsub]);
1129 fma_fusion(&mut mf);
1130 let block = &mf.blocks[0];
1131 assert_eq!(block.insts.len(), 1);
1132 assert_eq!(block.insts[0].opcode, ArmOpcode::FmsubS);
1133 assert_eq!(block.insts[0].operands[3], vreg(3)); // Sa = c
1134 }
1135
1136 /// fsub(fmul(a,b), c) → fnmsub(a, b, c) [result = a*b - c]
1137 #[test]
1138 fn fnmsub_f64() {
1139 let fmul = MachineInst {
1140 opcode: ArmOpcode::FmulD,
1141 operands: vec![vreg(0), vreg(1), vreg(2)],
1142 def: Some(vid(0)),
1143 };
1144 let fsub = MachineInst {
1145 opcode: ArmOpcode::FsubD,
1146 operands: vec![vreg(4), vreg(0), vreg(3)], // src0 = fmul result
1147 def: Some(vid(4)),
1148 };
1149 let mut mf = mf_with_insts(vec![fmul, fsub]);
1150 fma_fusion(&mut mf);
1151 let block = &mf.blocks[0];
1152 assert_eq!(block.insts.len(), 1);
1153 assert_eq!(block.insts[0].opcode, ArmOpcode::FnmsubD);
1154 }
1155
1156 fn vreg_gp(v: u32) -> MachineOperand {
1157 MachineOperand::VReg(VRegId(v))
1158 }
1159
1160 /// add(mul(v1, v2), v3) → madd v4, v1, v2, v3
1161 #[test]
1162 fn integer_madd_fusion() {
1163 let mul = MachineInst {
1164 opcode: ArmOpcode::Mul,
1165 operands: vec![vreg_gp(0), vreg_gp(1), vreg_gp(2)],
1166 def: Some(vid(0)),
1167 };
1168 let add = MachineInst {
1169 opcode: ArmOpcode::AddReg,
1170 operands: vec![vreg_gp(4), vreg_gp(0), vreg_gp(3)],
1171 def: Some(vid(4)),
1172 };
1173 let mut mf = mf_with_insts(vec![mul, add]);
1174 madd_fusion(&mut mf);
1175 let block = &mf.blocks[0];
1176 assert_eq!(block.insts.len(), 1, "mul should be removed");
1177 assert_eq!(block.insts[0].opcode, ArmOpcode::Madd);
1178 assert_eq!(block.insts[0].operands[1], vreg_gp(1));
1179 assert_eq!(block.insts[0].operands[2], vreg_gp(2));
1180 assert_eq!(block.insts[0].operands[3], vreg_gp(3));
1181 }
1182
1183 /// add(v3, mul(v1, v2)) → madd v4, v1, v2, v3 (commuted)
1184 #[test]
1185 fn integer_madd_fusion_commuted() {
1186 let mul = MachineInst {
1187 opcode: ArmOpcode::Mul,
1188 operands: vec![vreg_gp(0), vreg_gp(1), vreg_gp(2)],
1189 def: Some(vid(0)),
1190 };
1191 let add = MachineInst {
1192 opcode: ArmOpcode::AddReg,
1193 operands: vec![vreg_gp(4), vreg_gp(3), vreg_gp(0)],
1194 def: Some(vid(4)),
1195 };
1196 let mut mf = mf_with_insts(vec![mul, add]);
1197 madd_fusion(&mut mf);
1198 let block = &mf.blocks[0];
1199 assert_eq!(block.insts.len(), 1);
1200 assert_eq!(block.insts[0].opcode, ArmOpcode::Madd);
1201 assert_eq!(block.insts[0].operands[3], vreg_gp(3), "accumulator is c");
1202 }
1203
1204 /// sub(c, mul(a, b)) → msub a, b, c (result = c - a*b)
1205 #[test]
1206 fn integer_msub_fusion() {
1207 let mul = MachineInst {
1208 opcode: ArmOpcode::Mul,
1209 operands: vec![vreg_gp(0), vreg_gp(1), vreg_gp(2)],
1210 def: Some(vid(0)),
1211 };
1212 let sub = MachineInst {
1213 opcode: ArmOpcode::SubReg,
1214 operands: vec![vreg_gp(4), vreg_gp(3), vreg_gp(0)],
1215 def: Some(vid(4)),
1216 };
1217 let mut mf = mf_with_insts(vec![mul, sub]);
1218 madd_fusion(&mut mf);
1219 let block = &mf.blocks[0];
1220 assert_eq!(block.insts.len(), 1);
1221 assert_eq!(block.insts[0].opcode, ArmOpcode::Msub);
1222 assert_eq!(block.insts[0].operands[3], vreg_gp(3), "Xa = c");
1223 }
1224
1225 /// sub(mul(a, b), c) is `a*b - c` — no integer FNMSUB form.
1226 /// The pass should leave it as-is.
1227 #[test]
1228 fn integer_sub_mul_first_not_fused() {
1229 let mul = MachineInst {
1230 opcode: ArmOpcode::Mul,
1231 operands: vec![vreg_gp(0), vreg_gp(1), vreg_gp(2)],
1232 def: Some(vid(0)),
1233 };
1234 let sub = MachineInst {
1235 opcode: ArmOpcode::SubReg,
1236 operands: vec![vreg_gp(4), vreg_gp(0), vreg_gp(3)],
1237 def: Some(vid(4)),
1238 };
1239 let mut mf = mf_with_insts(vec![mul, sub]);
1240 madd_fusion(&mut mf);
1241 // Both instructions should remain — there's no msub form for a*b - c.
1242 assert_eq!(mf.blocks[0].insts.len(), 2);
1243 assert_eq!(mf.blocks[0].insts[0].opcode, ArmOpcode::Mul);
1244 assert_eq!(mf.blocks[0].insts[1].opcode, ArmOpcode::SubReg);
1245 }
1246
1247 /// mul result with multiple consumers must NOT fuse.
1248 #[test]
1249 fn integer_mul_used_twice_not_fused() {
1250 let mul = MachineInst {
1251 opcode: ArmOpcode::Mul,
1252 operands: vec![vreg_gp(0), vreg_gp(1), vreg_gp(2)],
1253 def: Some(vid(0)),
1254 };
1255 let add1 = MachineInst {
1256 opcode: ArmOpcode::AddReg,
1257 operands: vec![vreg_gp(4), vreg_gp(0), vreg_gp(3)],
1258 def: Some(vid(4)),
1259 };
1260 let add2 = MachineInst {
1261 opcode: ArmOpcode::AddReg,
1262 operands: vec![vreg_gp(5), vreg_gp(0), vreg_gp(3)],
1263 def: Some(vid(5)),
1264 };
1265 let mut mf = mf_with_insts(vec![mul, add1, add2]);
1266 madd_fusion(&mut mf);
1267 assert_eq!(mf.blocks[0].insts.len(), 3, "mul has 2 uses, can't fuse");
1268 assert_eq!(mf.blocks[0].insts[0].opcode, ArmOpcode::Mul);
1269 }
1270
1271 /// fmul result used twice: must NOT be fused.
1272 #[test]
1273 fn no_fusion_if_mul_used_twice() {
1274 let fmul = MachineInst {
1275 opcode: ArmOpcode::FmulS,
1276 operands: vec![vreg(0), vreg(1), vreg(2)],
1277 def: Some(vid(0)),
1278 };
1279 // Two consumers of vreg(0):
1280 let fadd1 = MachineInst {
1281 opcode: ArmOpcode::FaddS,
1282 operands: vec![vreg(4), vreg(0), vreg(3)],
1283 def: Some(vid(4)),
1284 };
1285 let fadd2 = MachineInst {
1286 opcode: ArmOpcode::FaddS,
1287 operands: vec![vreg(5), vreg(0), vreg(3)],
1288 def: Some(vid(5)),
1289 };
1290 let mut mf = mf_with_insts(vec![fmul, fadd1, fadd2]);
1291 fma_fusion(&mut mf);
1292 // No fusion — fmul has 2 uses.
1293 assert_eq!(mf.blocks[0].insts.len(), 3);
1294 assert_eq!(mf.blocks[0].insts[0].opcode, ArmOpcode::FmulS);
1295 }
1296
1297 /// LdrFpImm fed by AddReg(base, Mul(idx, Movz(8))) → LdrFpReg with lsl=3.
1298 #[test]
1299 fn scaled_addressing_real8_load() {
1300 let movz = MachineInst {
1301 opcode: ArmOpcode::Movz,
1302 operands: vec![vreg(0), MachineOperand::Imm(8), MachineOperand::Shift(0)],
1303 def: Some(vid(0)),
1304 };
1305 let mul = MachineInst {
1306 opcode: ArmOpcode::Mul,
1307 operands: vec![vreg(1), vreg(2), vreg(0)], // scaled = idx(2) * const(0)
1308 def: Some(vid(1)),
1309 };
1310 let add = MachineInst {
1311 opcode: ArmOpcode::AddReg,
1312 operands: vec![vreg(3), vreg(4), vreg(1)], // addr = base(4) + scaled(1)
1313 def: Some(vid(3)),
1314 };
1315 let ldr = MachineInst {
1316 opcode: ArmOpcode::LdrFpImm,
1317 operands: vec![vreg(5), vreg(3), MachineOperand::Imm(0)], // ldr d5, [addr, #0]
1318 def: Some(vid(5)),
1319 };
1320 let mut mf = mf_with_insts(vec![movz, mul, add, ldr]);
1321 scaled_addressing_fusion(&mut mf);
1322 let block = &mf.blocks[0];
1323 assert_eq!(block.insts.len(), 1, "movz/mul/add should be removed");
1324 assert_eq!(block.insts[0].opcode, ArmOpcode::LdrFpReg);
1325 assert_eq!(block.insts[0].operands[0], vreg(5)); // dest
1326 assert_eq!(block.insts[0].operands[1], vreg(4)); // base
1327 assert_eq!(block.insts[0].operands[2], vreg(2)); // idx
1328 assert_eq!(block.insts[0].operands[3], MachineOperand::Imm(3)); // log2(8) = 3
1329 }
1330
1331 /// Multi-use Mul result → no fold (the mul result is consumed twice).
1332 #[test]
1333 fn scaled_addressing_no_fold_on_multi_use_mul() {
1334 let movz = MachineInst {
1335 opcode: ArmOpcode::Movz,
1336 operands: vec![vreg(0), MachineOperand::Imm(8), MachineOperand::Shift(0)],
1337 def: Some(vid(0)),
1338 };
1339 let mul = MachineInst {
1340 opcode: ArmOpcode::Mul,
1341 operands: vec![vreg(1), vreg(2), vreg(0)],
1342 def: Some(vid(1)),
1343 };
1344 let add = MachineInst {
1345 opcode: ArmOpcode::AddReg,
1346 operands: vec![vreg(3), vreg(4), vreg(1)],
1347 def: Some(vid(3)),
1348 };
1349 let ldr = MachineInst {
1350 opcode: ArmOpcode::LdrFpImm,
1351 operands: vec![vreg(5), vreg(3), MachineOperand::Imm(0)],
1352 def: Some(vid(5)),
1353 };
1354 // Second consumer of the mul result — extra reference to vreg(1).
1355 let extra = MachineInst {
1356 opcode: ArmOpcode::AddReg,
1357 operands: vec![vreg(6), vreg(7), vreg(1)],
1358 def: Some(vid(6)),
1359 };
1360 let mut mf = mf_with_insts(vec![movz, mul, add, ldr, extra]);
1361 scaled_addressing_fusion(&mut mf);
1362 // Nothing fused.
1363 assert_eq!(mf.blocks[0].insts.len(), 5);
1364 assert_eq!(mf.blocks[0].insts[3].opcode, ArmOpcode::LdrFpImm);
1365 }
1366
1367 /// Non-power-of-two const (e.g. element size 12 from a 3-int derived
1368 /// type) must NOT fold — there's no `lsl` encoding for it.
1369 #[test]
1370 fn scaled_addressing_no_fold_on_non_power_of_two() {
1371 let movz = MachineInst {
1372 opcode: ArmOpcode::Movz,
1373 operands: vec![vreg(0), MachineOperand::Imm(12), MachineOperand::Shift(0)],
1374 def: Some(vid(0)),
1375 };
1376 let mul = MachineInst {
1377 opcode: ArmOpcode::Mul,
1378 operands: vec![vreg(1), vreg(2), vreg(0)],
1379 def: Some(vid(1)),
1380 };
1381 let add = MachineInst {
1382 opcode: ArmOpcode::AddReg,
1383 operands: vec![vreg(3), vreg(4), vreg(1)],
1384 def: Some(vid(3)),
1385 };
1386 let ldr = MachineInst {
1387 opcode: ArmOpcode::LdrImm,
1388 operands: vec![vreg(5), vreg(3), MachineOperand::Imm(0)],
1389 def: Some(vid(5)),
1390 };
1391 let mut mf = mf_with_insts(vec![movz, mul, add, ldr]);
1392 scaled_addressing_fusion(&mut mf);
1393 assert_eq!(mf.blocks[0].insts.len(), 4);
1394 assert_eq!(mf.blocks[0].insts[3].opcode, ArmOpcode::LdrImm);
1395 }
1396
1397 fn mf_with_classes(insts: Vec<MachineInst>, classes: &[RegClass]) -> MachineFunction {
1398 let mut mf = MachineFunction::new("test".into());
1399 for c in classes {
1400 mf.new_vreg(*c);
1401 }
1402 let bid = MBlockId(0);
1403 mf.blocks = vec![MachineBlock {
1404 id: bid,
1405 label: "entry".into(),
1406 insts,
1407 }];
1408 mf
1409 }
1410
1411 /// Two adjacent `ldr x_, [base, #imm]` with offsets 0 and 8 fuse to `ldp`.
1412 #[test]
1413 fn ldp_fusion_int64_pair() {
1414 // vregs: 0,1,2 = Gp64
1415 let classes = vec![RegClass::Gp64; 3];
1416 let ldr1 = MachineInst {
1417 opcode: ArmOpcode::LdrImm,
1418 operands: vec![vreg(0), vreg(2), MachineOperand::Imm(0)],
1419 def: Some(vid(0)),
1420 };
1421 let ldr2 = MachineInst {
1422 opcode: ArmOpcode::LdrImm,
1423 operands: vec![vreg(1), vreg(2), MachineOperand::Imm(8)],
1424 def: Some(vid(1)),
1425 };
1426 let mut mf = mf_with_classes(vec![ldr1, ldr2], &classes);
1427 ldp_stp_fusion(&mut mf);
1428 let block = &mf.blocks[0];
1429 assert_eq!(block.insts.len(), 1);
1430 assert_eq!(block.insts[0].opcode, ArmOpcode::LdpOffset);
1431 // Lower-offset (0) load goes into Rt1 = vreg(0); higher (8) into Rt2 = vreg(1).
1432 assert_eq!(block.insts[0].operands[0], vreg(0));
1433 assert_eq!(block.insts[0].operands[1], vreg(1));
1434 assert_eq!(block.insts[0].operands[2], vreg(2));
1435 assert_eq!(block.insts[0].operands[3], MachineOperand::Imm(0));
1436 }
1437
1438 /// Two adjacent `str d_, [base, #imm]` with descending offsets fuse to `stp d`.
1439 #[test]
1440 fn stp_fusion_f64_pair_descending() {
1441 let classes = vec![RegClass::Fp64, RegClass::Fp64, RegClass::Gp64];
1442 let str_high = MachineInst {
1443 opcode: ArmOpcode::StrFpImm,
1444 operands: vec![vreg(0), vreg(2), MachineOperand::Imm(-16)],
1445 def: None,
1446 };
1447 let str_low = MachineInst {
1448 opcode: ArmOpcode::StrFpImm,
1449 operands: vec![vreg(1), vreg(2), MachineOperand::Imm(-24)],
1450 def: None,
1451 };
1452 let mut mf = mf_with_classes(vec![str_high, str_low], &classes);
1453 ldp_stp_fusion(&mut mf);
1454 let block = &mf.blocks[0];
1455 assert_eq!(block.insts.len(), 1);
1456 assert_eq!(block.insts[0].opcode, ArmOpcode::StpOffset);
1457 // Lower-offset (-24) source goes into Rt1, so Rt1 = vreg(1).
1458 assert_eq!(block.insts[0].operands[0], vreg(1));
1459 assert_eq!(block.insts[0].operands[1], vreg(0));
1460 assert_eq!(block.insts[0].operands[3], MachineOperand::Imm(-24));
1461 }
1462
1463 /// Different bases — no fusion.
1464 #[test]
1465 fn ldp_no_fusion_different_base() {
1466 let classes = vec![RegClass::Gp64; 4];
1467 let ldr1 = MachineInst {
1468 opcode: ArmOpcode::LdrImm,
1469 operands: vec![vreg(0), vreg(2), MachineOperand::Imm(0)],
1470 def: Some(vid(0)),
1471 };
1472 let ldr2 = MachineInst {
1473 opcode: ArmOpcode::LdrImm,
1474 operands: vec![vreg(1), vreg(3), MachineOperand::Imm(8)],
1475 def: Some(vid(1)),
1476 };
1477 let mut mf = mf_with_classes(vec![ldr1, ldr2], &classes);
1478 ldp_stp_fusion(&mut mf);
1479 assert_eq!(mf.blocks[0].insts.len(), 2);
1480 assert_eq!(mf.blocks[0].insts[0].opcode, ArmOpcode::LdrImm);
1481 }
1482
1483 /// Non-adjacent offsets (gap of 16 instead of 8) — no fusion.
1484 #[test]
1485 fn ldp_no_fusion_non_adjacent() {
1486 let classes = vec![RegClass::Gp64; 3];
1487 let ldr1 = MachineInst {
1488 opcode: ArmOpcode::LdrImm,
1489 operands: vec![vreg(0), vreg(2), MachineOperand::Imm(0)],
1490 def: Some(vid(0)),
1491 };
1492 let ldr2 = MachineInst {
1493 opcode: ArmOpcode::LdrImm,
1494 operands: vec![vreg(1), vreg(2), MachineOperand::Imm(16)],
1495 def: Some(vid(1)),
1496 };
1497 let mut mf = mf_with_classes(vec![ldr1, ldr2], &classes);
1498 ldp_stp_fusion(&mut mf);
1499 assert_eq!(mf.blocks[0].insts.len(), 2);
1500 }
1501
1502 /// Mixed-width pair (i32 + i64) — no fusion since LDP needs matching widths.
1503 #[test]
1504 fn ldp_no_fusion_mixed_width() {
1505 let classes = vec![RegClass::Gp32, RegClass::Gp64, RegClass::Gp64];
1506 let ldr1 = MachineInst {
1507 opcode: ArmOpcode::LdrImm,
1508 operands: vec![vreg(0), vreg(2), MachineOperand::Imm(0)],
1509 def: Some(vid(0)),
1510 };
1511 let ldr2 = MachineInst {
1512 opcode: ArmOpcode::LdrImm,
1513 operands: vec![vreg(1), vreg(2), MachineOperand::Imm(4)],
1514 def: Some(vid(1)),
1515 };
1516 let mut mf = mf_with_classes(vec![ldr1, ldr2], &classes);
1517 ldp_stp_fusion(&mut mf);
1518 assert_eq!(mf.blocks[0].insts.len(), 2);
1519 }
1520
1521 /// LDR with dest == base — must not fuse (the second load would otherwise read
1522 /// the now-overwritten base).
1523 #[test]
1524 fn ldp_no_fusion_dest_equals_base() {
1525 let classes = vec![RegClass::Gp64; 2];
1526 // dest of first load is the base
1527 let ldr1 = MachineInst {
1528 opcode: ArmOpcode::LdrImm,
1529 operands: vec![vreg(1), vreg(1), MachineOperand::Imm(0)],
1530 def: Some(vid(1)),
1531 };
1532 let ldr2 = MachineInst {
1533 opcode: ArmOpcode::LdrImm,
1534 operands: vec![vreg(0), vreg(1), MachineOperand::Imm(8)],
1535 def: Some(vid(0)),
1536 };
1537 let mut mf = mf_with_classes(vec![ldr1, ldr2], &classes);
1538 ldp_stp_fusion(&mut mf);
1539 assert_eq!(mf.blocks[0].insts.len(), 2);
1540 }
1541
1542 /// StrImm gets folded too (mirror of the load path).
1543 #[test]
1544 fn scaled_addressing_int4_store() {
1545 let movz = MachineInst {
1546 opcode: ArmOpcode::Movz,
1547 operands: vec![vreg(0), MachineOperand::Imm(4), MachineOperand::Shift(0)],
1548 def: Some(vid(0)),
1549 };
1550 let mul = MachineInst {
1551 opcode: ArmOpcode::Mul,
1552 operands: vec![vreg(1), vreg(2), vreg(0)],
1553 def: Some(vid(1)),
1554 };
1555 let add = MachineInst {
1556 opcode: ArmOpcode::AddReg,
1557 operands: vec![vreg(3), vreg(4), vreg(1)],
1558 def: Some(vid(3)),
1559 };
1560 let str_inst = MachineInst {
1561 opcode: ArmOpcode::StrImm,
1562 operands: vec![vreg(5), vreg(3), MachineOperand::Imm(0)],
1563 def: None, // store has no def
1564 };
1565 let mut mf = mf_with_insts(vec![movz, mul, add, str_inst]);
1566 scaled_addressing_fusion(&mut mf);
1567 let block = &mf.blocks[0];
1568 assert_eq!(block.insts.len(), 1);
1569 assert_eq!(block.insts[0].opcode, ArmOpcode::StrReg);
1570 assert_eq!(block.insts[0].operands[3], MachineOperand::Imm(2)); // log2(4) = 2
1571 }
1572 }
1573