//! Instruction selection — translate SSA IR to Machine IR.
//!
//! Maps each IR instruction to one or more ARM64 machine instructions.
//! Uses virtual registers throughout; physical register assignment
//! happens in the register allocator (Sprint 21).
//!
//! Strategy: naive spill-everything. Every vreg lives on the stack.
//! Load before use, store after def. Correct but slow — optimized later.
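//!
//! A sketch of what that means (vreg and slot numbers invented for
//! illustration, not taken from real output): an IR `%2 = iadd %0, %1`
//! selects to a single `add v2, v0, v1` on vregs here; the allocator
//! later wraps it in frame traffic along the lines of
//! `ldr x8, [fp, #-8]; ldr x9, [fp, #-16]; add x8, x8, x9;
//! str x8, [fp, #-24]`.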

use super::mir::*;
use crate::ir::inst::*;
use crate::ir::types::*;
use std::collections::{HashMap, HashSet};

/// Select machine instructions for an entire IR module.
pub fn select_module(module: &Module) -> Vec<MachineFunction> {
    // Build function name table for resolving Internal call refs.
    let func_names: Vec<String> = module.functions.iter().map(|f| f.name.clone()).collect();
    module
        .functions
        .iter()
        .map(|f| select_function_with_names(f, &func_names))
        .collect()
}

fn select_function_with_names(func: &Function, func_names: &[String]) -> MachineFunction {
    let mut mf = select_function(func);
    // Resolve any Internal call references to actual function names.
    for block in &mut mf.blocks {
        for inst in &mut block.insts {
            if let super::mir::ArmOpcode::Bl = inst.opcode {
                if let Some(super::mir::MachineOperand::Extern(ref mut name)) =
                    inst.operands.first_mut()
                {
                    // Check if this is a placeholder "_func_N" name from isel.
                    if name.starts_with("_func_") {
                        if let Ok(idx) = name[6..].parse::<usize>() {
                            if idx < func_names.len() {
                                *name = func_names[idx].clone();
                            }
                        }
                    }
                }
            }
        }
    }
    mf
}

use super::abi::{classify_abi_arg, AbiArgLoc, AbiArgState};

/// Select machine instructions for one IR function.
pub fn select_function(func: &Function) -> MachineFunction {
    let mut mf = MachineFunction::new(func.name.clone());
    mf.internal_only = func.internal_only;
    let mut ctx = ISelCtx::new();

    // Phase 1: allocate stack slots for all IR alloca instructions.
    for block in &func.blocks {
        for inst in &block.insts {
            if let InstKind::Alloca(ty) = &inst.kind {
                let size = alloca_size(ty);
                let offset = mf.alloc_local(size);
                ctx.alloca_offsets.insert(inst.id, offset);
            }
        }
    }

    // Phase 2: create machine blocks corresponding to IR blocks.
    // Entry block already exists as MBlockId(0).
    //
    // Block labels are prefixed with the function name so two
    // functions in the same .s file don't collide on common names
    // like `do_check_1`. The `L` prefix turns them into local
    // symbols on Apple's assembler.
    ctx.block_map.insert(func.entry, MBlockId(0));
    for block in &func.blocks {
        if block.id != func.entry {
            let label = format!("L{}_{}", mf.name, block.name);
            let mb_id = mf.new_block(&label);
            ctx.block_map.insert(block.id, mb_id);
        }
    }

    enum IncomingParam {
        Narrow(VRegId, RegClass, AbiArgLoc, IrType),
        Wide(i32, AbiArgLoc),
    }

    // Phase 2.5: handle incoming parameters.
    // Create a vreg or a wide stack slot for each param.
    // The physical register save happens after the prologue.
    let mut param_info: Vec<IncomingParam> = Vec::new();
    let mut abi_state = AbiArgState::default();
    for param in &func.params {
        let loc = classify_abi_arg(&param.ty, &mut abi_state);
        if matches!(param.ty, IrType::Int(IntWidth::I128)) {
            let offset = mf.alloc_local(16);
            ctx.wide_value_slots.insert(param.id, offset);
            param_info.push(IncomingParam::Wide(offset, loc));
            continue;
        }
        let class = type_to_reg_class(&param.ty);
        let vreg = mf.new_vreg(class);
        ctx.value_map.insert(param.id, vreg);
        param_info.push(IncomingParam::Narrow(vreg, class, loc, param.ty.clone()));
    }

    // Phase 3: emit prologue in entry block.
    emit_prologue(&mut mf, MBlockId(0));

    // Phase 3.5: move incoming argument registers into param vregs.
    // Dispatch by register class: GP args from x0-x7, FP args from d0-d7.
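    // The wide (i128) moves below stage through x16/x17: AAPCS64
    // designates them as the intra-procedure-call scratch registers
    // (IP0/IP1), so clobbering them between instructions is safe,
    // assuming the Sprint 21 allocator also treats them as reserved.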
    for info in &param_info {
        match info {
            IncomingParam::Wide(offset, AbiArgLoc::GpPair(reg)) => {
                emit_store_phys_i128_pair(
                    &mut mf,
                    MBlockId(0),
                    MachineOperand::PhysReg(PhysReg::FP),
                    *offset as i64,
                    PhysReg::Gp(*reg),
                    PhysReg::Gp(*reg + 1),
                );
            }
            IncomingParam::Wide(offset, AbiArgLoc::Stack(stack_offset)) => {
                emit_load_phys_i128_pair(
                    &mut mf,
                    MBlockId(0),
                    MachineOperand::PhysReg(PhysReg::FP),
                    16 + *stack_offset,
                    PhysReg::Gp(16),
                    PhysReg::Gp(17),
                );
                emit_store_phys_i128_pair(
                    &mut mf,
                    MBlockId(0),
                    MachineOperand::PhysReg(PhysReg::FP),
                    *offset as i64,
                    PhysReg::Gp(16),
                    PhysReg::Gp(17),
                );
            }
            IncomingParam::Narrow(vreg, RegClass::Fp64, AbiArgLoc::Fp(reg), _) => {
                mf.block_mut(MBlockId(0)).insts.push(MachineInst {
                    opcode: ArmOpcode::FmovReg,
                    operands: vec![
                        MachineOperand::VReg(*vreg),
                        MachineOperand::PhysReg(PhysReg::Fp(*reg)),
                    ],
                    def: Some(*vreg),
                });
            }
            IncomingParam::Narrow(vreg, RegClass::Fp32, AbiArgLoc::Fp32(reg), _) => {
                mf.block_mut(MBlockId(0)).insts.push(MachineInst {
                    opcode: ArmOpcode::FmovReg,
                    operands: vec![
                        MachineOperand::VReg(*vreg),
                        MachineOperand::PhysReg(PhysReg::Fp32(*reg)),
                    ],
                    def: Some(*vreg),
                });
            }
            IncomingParam::Narrow(vreg, RegClass::Gp32, AbiArgLoc::Gp32(reg), _) => {
                mf.block_mut(MBlockId(0)).insts.push(MachineInst {
                    opcode: ArmOpcode::MovReg,
                    operands: vec![
                        MachineOperand::VReg(*vreg),
                        MachineOperand::PhysReg(PhysReg::Gp32(*reg)),
                    ],
                    def: Some(*vreg),
                });
            }
            IncomingParam::Narrow(vreg, _, AbiArgLoc::Gp(reg), _) => {
                mf.block_mut(MBlockId(0)).insts.push(MachineInst {
                    opcode: ArmOpcode::MovReg,
                    operands: vec![
                        MachineOperand::VReg(*vreg),
                        MachineOperand::PhysReg(PhysReg::Gp(*reg)),
                    ],
                    def: Some(*vreg),
                });
            }
            IncomingParam::Narrow(vreg, class, AbiArgLoc::Stack(stack_offset), ty) => {
                emit_load_stack_arg_into_vreg(
                    &mut mf,
                    MBlockId(0),
                    *vreg,
                    *class,
                    ty,
                    16 + *stack_offset,
                );
            }
            IncomingParam::Wide(_, other) => {
                panic!(
                    "isel: unexpected ABI loc {:?} for incoming i128 param",
                    other
                );
            }
            IncomingParam::Narrow(_, class, other, _) => {
                panic!(
                    "isel: unexpected ABI loc {:?} for incoming {:?} param",
                    other, class
                );
            }
        }
    }

    // Phase 4a: allocate vregs for EVERY block parameter AND every
    // instruction result *before* walking any instructions. We need
    // this upfront because:
    //
    // - A branch terminator needs to know the target block's
    //   param vregs to emit "move branch arg → target param"
    //   copies, and the target block may not have been walked yet.
    //
    // - An instruction in block A may reference an SSA value
    //   defined in block B that appears later in `func.blocks`
    //   vec order (perfectly legal under SSA dominance — block B
    //   can dominate block A even if it comes later in the vec).
    //   Without upfront allocation, the lookup fails.
    //
    // Allocation here doesn't emit machine instructions; it just
    // reserves vreg IDs for every IR ValueId so Phase 4b can use
    // `lookup_vreg` without ordering concerns.
    for block in &func.blocks {
        for bp in &block.params {
            if matches!(bp.ty, IrType::Int(IntWidth::I128)) {
                let offset = mf.alloc_local(16);
                ctx.wide_value_slots.insert(bp.id, offset);
                continue;
            }
            let class = type_to_reg_class(&bp.ty);
            let vreg = mf.new_vreg(class);
            ctx.value_map.insert(bp.id, vreg);
        }
        for inst in &block.insts {
            // Allocas already have their backing stack slots from
            // Phase 1, but the SSA value they produce is still a real
            // pointer that later blocks may pass to calls or branch
            // params before the defining block is selected.
            //
            // Reserve the vreg here so forward-dominating alloca uses
            // are safe even when block vec order puts the use before
            // the definition.
            // Void-typed insts (Store, RuntimeCall returning void,
            // etc.) don't produce a usable value.
            if matches!(inst.ty, IrType::Void) {
                continue;
            }
            if matches!(inst.ty, IrType::Int(IntWidth::I128)) {
                let offset = mf.alloc_local(16);
                ctx.wide_value_slots.insert(inst.id, offset);
                continue;
            }
            let class = type_to_reg_class(&inst.ty);
            let vreg = mf.new_vreg(class);
            ctx.value_map.insert(inst.id, vreg);
        }
    }

    // Snapshot just each IR block's params into ctx so
    // `select_terminator` can look them up while we hold a separate
    // &mut MachineFunction borrow. We don't need a full BasicBlock
    // clone — only the param list — so this avoids cloning every
    // instruction in the function for each terminator we visit.
    for block in &func.blocks {
        ctx.block_params.insert(block.id, block.params.clone());
    }

    // Phase 4a.5: identify ICmp/FCmp → Select fusion candidates.
    //
    // An ICmp whose boolean result is used only by a single Select in
    // the same block (with no intervening flag-clobbering instruction)
    // can be fused: we suppress the CSET and pass the CMP flags
    // directly into the CSEL. This turns 4 instructions into 2:
    //
    //   CMP a, b; CSET cond, LE; CMP cond, #0; CSEL dest, tv, fv, NE
    //   → CMP a, b; CSEL dest, tv, fv, LE
    compute_csel_fusion(func, &mut ctx);

    // Phase 4b: select instructions and terminators for each block.
    for block in &func.blocks {
        let mb_id = ctx.block_map[&block.id];

        for inst in &block.insts {
            select_inst(&mut mf, &mut ctx, mb_id, inst, func);
        }

        if let Some(term) = &block.terminator {
            select_terminator(&mut mf, &mut ctx, mb_id, term, block, func);
        }
    }

    mf
}

fn select_call_inst(
    mf: &mut MachineFunction,
    ctx: &mut ISelCtx,
    mb: MBlockId,
    inst: &Inst,
    func: &Function,
) {
    let (label, args, runtime_func, indirect_target) = match &inst.kind {
        InstKind::Call(FuncRef::External(name), args) => {
            (name.clone(), args.as_slice(), None, None)
        }
        InstKind::Call(FuncRef::Internal(idx), args) => {
            (format!("_func_{}", idx), args.as_slice(), None, None)
        }
        InstKind::Call(FuncRef::Indirect(target), args) => {
            (String::new(), args.as_slice(), None, Some(*target))
        }
        InstKind::RuntimeCall(rf, args) => (String::new(), args.as_slice(), Some(rf), None),
        _ => unreachable!(),
    };

    let mut abi_state = AbiArgState::default();
    let mut arg_locs = Vec::with_capacity(args.len());
    for &arg_val in args {
        let arg_ty = func
            .value_type(arg_val)
            .unwrap_or_else(|| panic!("isel: missing type for call arg %{}", arg_val.0));
        arg_locs.push((arg_val, classify_abi_arg(&arg_ty, &mut abi_state), arg_ty));
    }
    let label = runtime_func
        .map(|rf| runtime_func_symbol(rf, &arg_locs))
        .unwrap_or(label);
    if abi_state.stack_offset > 0 {
        mf.reserve_outgoing_args(abi_state.stack_offset as u32);
    }

    let mut pending_reg_arg_moves: Vec<(ArmOpcode, PhysReg, VRegId)> = Vec::new();
    for (arg_val, loc, arg_ty) in arg_locs {
        if matches!(arg_ty, IrType::Int(IntWidth::I128)) {
            let arg_slot = ctx.lookup_wide_slot(arg_val);
            match loc {
                AbiArgLoc::GpPair(reg) => {
                    emit_load_phys_i128_pair(
                        mf,
                        mb,
                        MachineOperand::PhysReg(PhysReg::FP),
                        arg_slot as i64,
                        PhysReg::Gp(reg),
                        PhysReg::Gp(reg + 1),
                    );
                }
                AbiArgLoc::Stack(stack_offset) => {
                    emit_load_phys_i128_pair(
                        mf,
                        mb,
                        MachineOperand::PhysReg(PhysReg::FP),
                        arg_slot as i64,
                        PhysReg::Gp(16),
                        PhysReg::Gp(17),
                    );
                    emit_store_phys_i128_pair(
                        mf,
                        mb,
                        MachineOperand::PhysReg(PhysReg::Sp),
                        stack_offset,
                        PhysReg::Gp(16),
                        PhysReg::Gp(17),
                    );
                }
                other => {
                    panic!("isel: unexpected ABI loc {:?} for outgoing i128 arg", other);
                }
            }
            continue;
        }

        let arg_vreg = ctx.lookup_vreg(arg_val);
        let arg_class = mf.vregs.iter().find(|v| v.id == arg_vreg).map(|v| v.class);
        match (arg_class, loc) {
            (Some(RegClass::Fp64), AbiArgLoc::Fp(reg)) => {
                pending_reg_arg_moves.push((ArmOpcode::FmovReg, PhysReg::Fp(reg), arg_vreg));
            }
            (Some(RegClass::Fp32), AbiArgLoc::Fp32(reg)) => {
                pending_reg_arg_moves.push((ArmOpcode::FmovReg, PhysReg::Fp32(reg), arg_vreg));
            }
            (Some(RegClass::Gp32), AbiArgLoc::Gp32(reg)) => {
                pending_reg_arg_moves.push((ArmOpcode::MovReg, PhysReg::Gp32(reg), arg_vreg));
            }
            (Some(RegClass::Gp64), AbiArgLoc::Gp(reg)) => {
                pending_reg_arg_moves.push((ArmOpcode::MovReg, PhysReg::Gp(reg), arg_vreg));
            }
            (Some(class), AbiArgLoc::Stack(stack_offset)) => {
                emit_store_stack_arg_from_vreg(mf, mb, arg_vreg, class, &arg_ty, stack_offset);
            }
            (Some(class), other) => {
                panic!(
                    "isel: unexpected ABI loc {:?} for outgoing {:?} arg",
                    other, class
                );
            }
            (None, _) => {
                panic!("isel: call arg vreg class missing for %{}", arg_val.0);
            }
        }
    }

    for (opcode, dst, src) in pending_reg_arg_moves {
        mf.block_mut(mb).insts.push(MachineInst {
            opcode,
            operands: vec![MachineOperand::PhysReg(dst), MachineOperand::VReg(src)],
            def: None,
        });
    }

    if let Some(target) = indirect_target {
        mf.block_mut(mb).insts.push(MachineInst {
            opcode: ArmOpcode::Blr,
            operands: vec![MachineOperand::VReg(ctx.lookup_vreg(target))],
            def: None,
        });
    } else {
        mf.block_mut(mb).insts.push(MachineInst {
            opcode: ArmOpcode::Bl,
            operands: vec![MachineOperand::Extern(label)],
            def: None,
        });
    }

    if matches!(inst.ty, IrType::Int(IntWidth::I128)) {
        let dest_slot = ctx.lookup_wide_slot(inst.id);
        emit_store_phys_i128_pair(
            mf,
            mb,
            MachineOperand::PhysReg(PhysReg::FP),
            dest_slot as i64,
            PhysReg::Gp(0),
            PhysReg::Gp(1),
        );
    } else if inst.ty != IrType::Void {
        let class = type_to_reg_class(&inst.ty);
        let dest = ctx.get_vreg(mf, inst.id, class);
        let (src_reg, opcode) = match class {
            RegClass::Fp64 => (PhysReg::Fp(0), ArmOpcode::FmovReg),
            RegClass::Fp32 => (PhysReg::Fp32(0), ArmOpcode::FmovReg),
            RegClass::V128 => (PhysReg::Fp(0), ArmOpcode::FmovReg),
            RegClass::Gp32 => (PhysReg::Gp32(0), ArmOpcode::MovReg),
            RegClass::Gp64 => (PhysReg::Gp(0), ArmOpcode::MovReg),
        };
        mf.block_mut(mb).insts.push(MachineInst {
            opcode,
            operands: vec![MachineOperand::VReg(dest), MachineOperand::PhysReg(src_reg)],
            def: Some(dest),
        });
    } else {
        ctx.get_vreg(mf, inst.id, RegClass::Gp64);
    }
}

/// Instruction selection context.
struct ISelCtx {
    /// IR ValueId → MIR VRegId.
    value_map: HashMap<ValueId, VRegId>,
    /// IR wide scalar ValueId → stack slot offset used as its backing store.
    wide_value_slots: HashMap<ValueId, i32>,
    /// IR BlockId → MIR MBlockId.
    block_map: HashMap<BlockId, MBlockId>,
    /// IR alloca ValueId → stack frame offset.
    alloca_offsets: HashMap<ValueId, i32>,
    /// IR BlockId → its block params. Snapshotted before phase 4b
    /// so terminator selection can read each target's params
    /// without re-borrowing the function while &mut MachineFunction
    /// is held. Cloning just the param vec is dramatically cheaper
    /// than cloning the whole BasicBlock — instructions can be in
    /// the thousands, params are typically 0-3.
    block_params: HashMap<BlockId, Vec<BlockParam>>,
    /// ICmp/FCmp ValueIds that are exclusively consumed by a Select in
    /// the same block with no intervening flag-clobbering instruction.
    /// For these, we suppress CSET during ICmp lowering and use the
    /// flags directly from the CMP in the CSEL.
    select_fused: HashSet<ValueId>,
    /// For each fused ICmp/FCmp, the ARM condition code to use in the
    /// CSEL (determined at the time we suppress the CSET).
    fused_arm_cond: HashMap<ValueId, ArmCond>,
}

impl ISelCtx {
    fn new() -> Self {
        Self {
            value_map: HashMap::new(),
            wide_value_slots: HashMap::new(),
            block_map: HashMap::new(),
            alloca_offsets: HashMap::new(),
            block_params: HashMap::new(),
            select_fused: HashSet::new(),
            fused_arm_cond: HashMap::new(),
        }
    }

    /// Get the vreg for an IR value, or create one if needed.
    /// In debug builds, asserts that an existing mapping has the
    /// same register class as requested — a class mismatch means
    /// Phase 4a (vreg pre-allocation) and Phase 4b (instruction
    /// selection) disagree about a value's type, which would
    /// silently corrupt code.
    fn get_vreg(&mut self, mf: &mut MachineFunction, val: ValueId, class: RegClass) -> VRegId {
        if let Some(&vreg) = self.value_map.get(&val) {
            debug_assert!(
                mf.vregs.iter().find(|v| v.id == vreg).map(|v| v.class) == Some(class),
                "isel: vreg class mismatch for IR value %{} (existing class \
                 differs from requested {:?}) — phase 4a/4b disagreement",
                val.0,
                class,
            );
            return vreg;
        }
        let vreg = mf.new_vreg(class);
        self.value_map.insert(val, vreg);
        vreg
    }

    /// Get the vreg for an IR value, assuming it was already mapped.
    fn lookup_vreg(&self, val: ValueId) -> VRegId {
        *self.value_map.get(&val).unwrap_or_else(|| {
            panic!(
                "isel: unmapped IR value %{} — phase 4a should have allocated \
                 a vreg for every IR value before phase 4b runs. {} values are \
                 currently mapped. This usually means a forward reference, \
                 a missing block param, or a value defined in an unreachable \
                 block.",
                val.0,
                self.value_map.len(),
            )
        })
    }

    /// Get the machine block for an IR block. Unmapped blocks fall
    /// back to the entry block (MBlockId(0)) rather than panicking.
    fn lookup_block(&self, block: BlockId) -> MBlockId {
        *self.block_map.get(&block).unwrap_or(&MBlockId(0))
    }

    fn lookup_wide_slot(&self, val: ValueId) -> i32 {
        *self.wide_value_slots.get(&val).unwrap_or_else(|| {
            panic!(
                "isel: unmapped wide i128 value %{} — phase 4a should have allocated \
                 a backing slot for every supported i128 SSA value before phase 4b runs",
                val.0
            )
        })
    }
}

/// Select machine instructions for a single IR instruction.
fn select_inst(
    mf: &mut MachineFunction,
    ctx: &mut ISelCtx,
    mb: MBlockId,
    inst: &Inst,
    func: &Function,
) {
    if matches!(inst.ty, IrType::Int(IntWidth::I128)) {
        match &inst.kind {
            InstKind::ConstInt(val, IntWidth::I128) => {
                let dest_slot = ctx.lookup_wide_slot(inst.id);
                emit_const_i128_to_phys_pair(mf, mb, *val, PhysReg::Gp(16), PhysReg::Gp(17));
                emit_store_phys_i128_pair(
                    mf,
                    mb,
                    MachineOperand::PhysReg(PhysReg::FP),
                    dest_slot as i64,
                    PhysReg::Gp(16),
                    PhysReg::Gp(17),
                );
                return;
            }
            InstKind::Undef(_) => {
                let dest_slot = ctx.lookup_wide_slot(inst.id);
                emit_const_i128_to_phys_pair(mf, mb, 0, PhysReg::Gp(16), PhysReg::Gp(17));
                emit_store_phys_i128_pair(
                    mf,
                    mb,
                    MachineOperand::PhysReg(PhysReg::FP),
                    dest_slot as i64,
                    PhysReg::Gp(16),
                    PhysReg::Gp(17),
                );
                return;
            }
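            // Add/sub round-trip both operands through their
            // FP-relative backing slots. The per-half sequence lives
            // in emit_i128_binop_via_slots; the usual ARM64 pattern
            // is a carry chain (ADDS/ADC for add, SUBS/SBC for sub),
            // but see that helper for the authoritative lowering.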
            InstKind::IAdd(a, b) => {
                emit_i128_binop_via_slots(mf, ctx, mb, I128BinOp::Add, inst.id, *a, *b);
                return;
            }
            InstKind::ISub(a, b) => {
                emit_i128_binop_via_slots(mf, ctx, mb, I128BinOp::Sub, inst.id, *a, *b);
                return;
            }
            InstKind::INeg(a) => {
                let dest_slot = ctx.lookup_wide_slot(inst.id);
                let src_slot = ctx.lookup_wide_slot(*a);
                emit_load_phys_i128_pair(
                    mf,
                    mb,
                    MachineOperand::PhysReg(PhysReg::FP),
                    src_slot as i64,
                    PhysReg::Gp(16),
                    PhysReg::Gp(17),
                );
                emit_i128_neg(mf, mb, PhysReg::Gp(16), PhysReg::Gp(17));
                emit_store_phys_i128_pair(
                    mf,
                    mb,
                    MachineOperand::PhysReg(PhysReg::FP),
                    dest_slot as i64,
                    PhysReg::Gp(16),
                    PhysReg::Gp(17),
                );
                return;
            }
            InstKind::Load(addr) => {
                let dest_slot = ctx.lookup_wide_slot(inst.id);
                if let Some(&offset) = ctx.alloca_offsets.get(addr) {
                    emit_load_phys_i128_pair(
                        mf,
                        mb,
                        MachineOperand::PhysReg(PhysReg::FP),
                        offset as i64,
                        PhysReg::Gp(16),
                        PhysReg::Gp(17),
                    );
                } else {
                    let base = ctx.lookup_vreg(*addr);
                    emit_load_phys_i128_pair(
                        mf,
                        mb,
                        MachineOperand::VReg(base),
                        0,
                        PhysReg::Gp(16),
                        PhysReg::Gp(17),
                    );
                }
                emit_store_phys_i128_pair(
                    mf,
                    mb,
                    MachineOperand::PhysReg(PhysReg::FP),
                    dest_slot as i64,
                    PhysReg::Gp(16),
                    PhysReg::Gp(17),
                );
                return;
            }
            InstKind::Select(cond, tv, fv) => {
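                // Load both arms unconditionally (x16:x17 = true
                // value, x8:x9 = false value) and pick each half
                // with a CSEL; slot loads have no side effects, so
                // evaluating both arms is safe.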
                let arm_cond = if let Some(&fused_cond) = ctx.fused_arm_cond.get(cond) {
                    fused_cond
                } else {
                    let cond_reg = ctx.lookup_vreg(*cond);
                    mf.block_mut(mb).insts.push(MachineInst {
                        opcode: ArmOpcode::CmpImm,
                        operands: vec![MachineOperand::VReg(cond_reg), MachineOperand::Imm(0)],
                        def: None,
                    });
                    ArmCond::Ne
                };
                let dest_slot = ctx.lookup_wide_slot(inst.id);
                let true_slot = ctx.lookup_wide_slot(*tv);
                let false_slot = ctx.lookup_wide_slot(*fv);
                emit_load_phys_i128_pair(
                    mf,
                    mb,
                    MachineOperand::PhysReg(PhysReg::FP),
                    true_slot as i64,
                    PhysReg::Gp(16),
                    PhysReg::Gp(17),
                );
                emit_load_phys_i128_pair(
                    mf,
                    mb,
                    MachineOperand::PhysReg(PhysReg::FP),
                    false_slot as i64,
                    PhysReg::Gp(8),
                    PhysReg::Gp(9),
                );
                mf.block_mut(mb).insts.push(MachineInst {
                    opcode: ArmOpcode::CselReg,
                    operands: vec![
                        MachineOperand::PhysReg(PhysReg::Gp(16)),
                        MachineOperand::PhysReg(PhysReg::Gp(16)),
                        MachineOperand::PhysReg(PhysReg::Gp(8)),
                        MachineOperand::Cond(arm_cond),
                    ],
                    def: None,
                });
                mf.block_mut(mb).insts.push(MachineInst {
                    opcode: ArmOpcode::CselReg,
                    operands: vec![
                        MachineOperand::PhysReg(PhysReg::Gp(17)),
                        MachineOperand::PhysReg(PhysReg::Gp(17)),
                        MachineOperand::PhysReg(PhysReg::Gp(9)),
                        MachineOperand::Cond(arm_cond),
                    ],
                    def: None,
                });
                emit_store_phys_i128_pair(
                    mf,
                    mb,
                    MachineOperand::PhysReg(PhysReg::FP),
                    dest_slot as i64,
                    PhysReg::Gp(16),
                    PhysReg::Gp(17),
                );
                return;
            }
            InstKind::Call(..) => {
                select_call_inst(mf, ctx, mb, inst, func);
                return;
            }
            _ => {
                panic!(
                    "isel: unsupported i128 instruction reached backend despite gating: {:?}",
                    inst.kind
                );
            }
        }
    }

    match &inst.kind {
        // ---- Constants ----
        InstKind::ConstInt(val, width) => {
            let class = int_width_class(width);
            let dest = ctx.get_vreg(mf, inst.id, class);
            emit_const_int(mf, mb, dest, *val, *width);
        }

        InstKind::ConstFloat(val, width) => {
            let class = float_width_class(width);
            let dest = ctx.get_vreg(mf, inst.id, class);
            let cp_idx = match width {
                FloatWidth::F32 => mf.add_const(ConstPoolEntry::F32(*val as f32)),
                FloatWidth::F64 => mf.add_const(ConstPoolEntry::F64(*val)),
            };
            // ADRP + LDR from constant pool.
            mf.block_mut(mb).insts.push(MachineInst {
                opcode: ArmOpcode::AdrpLdr,
                operands: vec![
                    MachineOperand::VReg(dest),
                    MachineOperand::ConstPool(cp_idx),
                ],
                def: Some(dest),
            });
        }

        InstKind::ConstBool(val) => {
            let dest = ctx.get_vreg(mf, inst.id, RegClass::Gp32);
            emit_const_int(mf, mb, dest, if *val { 1 } else { 0 }, IntWidth::I32);
        }

        InstKind::ConstString(bytes) => {
            let dest = ctx.get_vreg(mf, inst.id, RegClass::Gp64);
            let cp_idx = mf.add_const(ConstPoolEntry::Bytes(bytes.clone()));
            // Use ADRP+ADD to compute the address (not ADRP+LDR which loads the value).
            mf.block_mut(mb).insts.push(MachineInst {
                opcode: ArmOpcode::AdrpAdd,
                operands: vec![
                    MachineOperand::VReg(dest),
                    MachineOperand::ConstPool(cp_idx),
                ],
                def: Some(dest),
            });
        }

        InstKind::Undef(_) => {
            // Emit a deterministic zero instead of leaving the vreg
            // undefined. A truly undefined vreg lets the register
            // allocator hand us whatever physical register is free,
            // and that register's stale contents leak into reads —
            // which makes optimization-level diffs nondeterministic
            // and turns "undef ⇒ anything" into "undef ⇒ whatever
            // happened to be in x14 at this point in the program."
            //
            // mem2reg synthesizes Undef as the initial value of a
            // promoted slot before any store. The Fortran semantics
            // for reading uninitialized storage are undefined, but
            // a hard zero is at least reproducible across opt
            // levels and friendly to debuggers.
            let class = type_to_reg_class(&inst.ty);
            let dest = ctx.get_vreg(mf, inst.id, class);
            match class {
                RegClass::Gp32 => {
                    mf.block_mut(mb).insts.push(MachineInst {
                        opcode: ArmOpcode::MovReg,
                        operands: vec![
                            MachineOperand::VReg(dest),
                            MachineOperand::PhysReg(PhysReg::Wzr),
                        ],
                        def: Some(dest),
                    });
                }
                RegClass::Gp64 => {
                    mf.block_mut(mb).insts.push(MachineInst {
                        opcode: ArmOpcode::MovReg,
                        operands: vec![
                            MachineOperand::VReg(dest),
                            MachineOperand::PhysReg(PhysReg::Xzr),
                        ],
                        def: Some(dest),
                    });
                }
                RegClass::Fp32 => {
                    let cp_idx = mf.add_const(ConstPoolEntry::F32(0.0));
                    mf.block_mut(mb).insts.push(MachineInst {
                        opcode: ArmOpcode::AdrpLdr,
                        operands: vec![
                            MachineOperand::VReg(dest),
                            MachineOperand::ConstPool(cp_idx),
                        ],
                        def: Some(dest),
                    });
                }
                RegClass::Fp64 => {
                    let cp_idx = mf.add_const(ConstPoolEntry::F64(0.0));
                    mf.block_mut(mb).insts.push(MachineInst {
                        opcode: ArmOpcode::AdrpLdr,
                        operands: vec![
                            MachineOperand::VReg(dest),
                            MachineOperand::ConstPool(cp_idx),
                        ],
                        def: Some(dest),
                    });
                }
                RegClass::V128 => {
                    // Sprint 12 Stage 1 reserves the type/instr; no
                    // path produces a V128 Undef yet. Bail rather
                    // than emit a half-baked NEON zero — when the
                    // vectorizer arrives it will have its own
                    // VBroadcast(const 0) lowering.
                    unreachable!("V128 Undef emission not implemented (Sprint 12 Stage 4 work)");
                }
            }
        }

        // ---- Integer arithmetic ----
        InstKind::IAdd(a, b) => emit_binop(mf, ctx, mb, inst, ArmOpcode::AddReg, *a, *b),
        InstKind::ISub(a, b) => emit_binop(mf, ctx, mb, inst, ArmOpcode::SubReg, *a, *b),
        InstKind::IMul(a, b) => emit_binop(mf, ctx, mb, inst, ArmOpcode::Mul, *a, *b),
        InstKind::IDiv(a, b) => emit_binop(mf, ctx, mb, inst, ArmOpcode::Sdiv, *a, *b),
        InstKind::IMod(a, b) => {
            // imod = a - (a / b) * b → SDIV + MSUB
            let class = type_to_reg_class(&inst.ty);
            let dest = ctx.get_vreg(mf, inst.id, class);
            let va = ctx.lookup_vreg(*a);
            let vb = ctx.lookup_vreg(*b);
            let tmp = mf.new_vreg(class);
            // tmp = sdiv a, b
            mf.block_mut(mb).insts.push(MachineInst {
                opcode: ArmOpcode::Sdiv,
                operands: vec![
                    MachineOperand::VReg(tmp),
                    MachineOperand::VReg(va),
                    MachineOperand::VReg(vb),
                ],
                def: Some(tmp),
            });
            // dest = msub tmp, vb, va → va - tmp * vb = a - (a/b)*b
            mf.block_mut(mb).insts.push(MachineInst {
                opcode: ArmOpcode::Msub,
                operands: vec![
                    MachineOperand::VReg(dest),
                    MachineOperand::VReg(tmp),
                    MachineOperand::VReg(vb),
                    MachineOperand::VReg(va),
                ],
                def: Some(dest),
            });
        }
        InstKind::INeg(a) => {
            let class = type_to_reg_class(&inst.ty);
            let dest = ctx.get_vreg(mf, inst.id, class);
            let va = ctx.lookup_vreg(*a);
            mf.block_mut(mb).insts.push(MachineInst {
                opcode: ArmOpcode::Neg,
                operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(va)],
                def: Some(dest),
            });
        }

        // ---- Float arithmetic ----
        InstKind::FAdd(a, b) => emit_float_binop(
            mf,
            ctx,
            mb,
            inst,
            &inst.ty,
            *a,
            *b,
            ArmOpcode::FaddS,
            ArmOpcode::FaddD,
        ),
        InstKind::FSub(a, b) => emit_float_binop(
            mf,
            ctx,
            mb,
            inst,
            &inst.ty,
            *a,
            *b,
            ArmOpcode::FsubS,
            ArmOpcode::FsubD,
        ),
        InstKind::FMul(a, b) => emit_float_binop(
            mf,
            ctx,
            mb,
            inst,
            &inst.ty,
            *a,
            *b,
            ArmOpcode::FmulS,
            ArmOpcode::FmulD,
        ),
        InstKind::FDiv(a, b) => emit_float_binop(
            mf,
            ctx,
            mb,
            inst,
            &inst.ty,
            *a,
            *b,
            ArmOpcode::FdivS,
            ArmOpcode::FdivD,
        ),
        InstKind::FNeg(a) => {
            let (class, opcode) = match &inst.ty {
                IrType::Float(FloatWidth::F32) => (RegClass::Fp32, ArmOpcode::FnegS),
                _ => (RegClass::Fp64, ArmOpcode::FnegD),
            };
            let dest = ctx.get_vreg(mf, inst.id, class);
            let va = ctx.lookup_vreg(*a);
            mf.block_mut(mb).insts.push(MachineInst {
                opcode,
                operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(va)],
                def: Some(dest),
            });
        }
        InstKind::FPow(a, b) => {
            let class = type_to_reg_class(&inst.ty);
            let dest = ctx.get_vreg(mf, inst.id, class);
            let va = ctx.lookup_vreg(*a);
            let vb = ctx.lookup_vreg(*b);
            let (func_name, arg0, arg1, ret) = match &inst.ty {
                IrType::Float(FloatWidth::F32) => {
                    ("powf", PhysReg::Fp32(0), PhysReg::Fp32(1), PhysReg::Fp32(0))
                }
                _ => ("pow", PhysReg::Fp(0), PhysReg::Fp(1), PhysReg::Fp(0)),
            };
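            // AAPCS64 passes the float args in v0/v1 and returns the
            // result in v0, so the libm call needs only these
            // register moves around the BL.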
            mf.block_mut(mb).insts.push(MachineInst {
                opcode: ArmOpcode::FmovReg,
                operands: vec![MachineOperand::PhysReg(arg0), MachineOperand::VReg(va)],
                def: None,
            });
            mf.block_mut(mb).insts.push(MachineInst {
                opcode: ArmOpcode::FmovReg,
                operands: vec![MachineOperand::PhysReg(arg1), MachineOperand::VReg(vb)],
                def: None,
            });
            mf.block_mut(mb).insts.push(MachineInst {
                opcode: ArmOpcode::Bl,
                operands: vec![MachineOperand::Extern(func_name.into())],
                def: None,
            });
            mf.block_mut(mb).insts.push(MachineInst {
                opcode: ArmOpcode::FmovReg,
                operands: vec![MachineOperand::VReg(dest), MachineOperand::PhysReg(ret)],
                def: Some(dest),
            });
        }

        // ---- Comparisons ----
        InstKind::ICmp(op, a, b) => {
            if matches!(func.value_type(*a), Some(IrType::Int(IntWidth::I128)))
                || matches!(func.value_type(*b), Some(IrType::Int(IntWidth::I128)))
            {
                let dest = ctx.get_vreg(mf, inst.id, RegClass::Gp32);
                let lhs_slot = ctx.lookup_wide_slot(*a);
                let rhs_slot = ctx.lookup_wide_slot(*b);
                emit_load_phys_i128_pair(
                    mf,
                    mb,
                    MachineOperand::PhysReg(PhysReg::FP),
                    lhs_slot as i64,
                    PhysReg::Gp(16),
                    PhysReg::Gp(17),
                );
                emit_load_phys_i128_pair(
                    mf,
                    mb,
                    MachineOperand::PhysReg(PhysReg::FP),
                    rhs_slot as i64,
                    PhysReg::Gp(8),
                    PhysReg::Gp(9),
                );
                match op {
                    CmpOp::Eq | CmpOp::Ne => {
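                        // Halves compose as:
                        //   a == b ⇔ lo_a == lo_b && hi_a == hi_b (AND the CSETs)
                        //   a != b ⇔ lo_a != lo_b || hi_a != hi_b (ORR the CSETs)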
                        mf.block_mut(mb).insts.push(MachineInst {
                            opcode: ArmOpcode::CmpReg,
                            operands: vec![
                                MachineOperand::PhysReg(PhysReg::Gp(16)),
                                MachineOperand::PhysReg(PhysReg::Gp(8)),
                            ],
                            def: None,
                        });
                        mf.block_mut(mb).insts.push(MachineInst {
                            opcode: ArmOpcode::Cset,
                            operands: vec![
                                MachineOperand::PhysReg(PhysReg::Gp32(10)),
                                MachineOperand::Cond(cmp_to_arm_cond(*op)),
                            ],
                            def: None,
                        });
                        mf.block_mut(mb).insts.push(MachineInst {
                            opcode: ArmOpcode::CmpReg,
                            operands: vec![
                                MachineOperand::PhysReg(PhysReg::Gp(17)),
                                MachineOperand::PhysReg(PhysReg::Gp(9)),
                            ],
                            def: None,
                        });
                        mf.block_mut(mb).insts.push(MachineInst {
                            opcode: ArmOpcode::Cset,
                            operands: vec![
                                MachineOperand::PhysReg(PhysReg::Gp32(11)),
                                MachineOperand::Cond(cmp_to_arm_cond(*op)),
                            ],
                            def: None,
                        });
                        let combine = match op {
                            CmpOp::Eq => ArmOpcode::AndReg,
                            CmpOp::Ne => ArmOpcode::OrrReg,
                            _ => unreachable!(),
                        };
                        mf.block_mut(mb).insts.push(MachineInst {
                            opcode: combine,
                            operands: vec![
                                MachineOperand::PhysReg(PhysReg::Gp32(10)),
                                MachineOperand::PhysReg(PhysReg::Gp32(10)),
                                MachineOperand::PhysReg(PhysReg::Gp32(11)),
                            ],
                            def: None,
                        });
                    }
                    CmpOp::Lt | CmpOp::Le | CmpOp::Gt | CmpOp::Ge => {
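                        // Signed ordering composes over halves:
                        //   a < b ⇔ hi_a < hi_b
                        //        || (hi_a == hi_b && lo_a < lo_b unsigned)
                        // i128_ordered_conds supplies hi_cond for the
                        // signed hi compare and lo_cond for the lo
                        // compare; the lo half must be compared
                        // unsigned for the formula to hold.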
                        let (hi_cond, lo_cond) = i128_ordered_conds(*op);
                        mf.block_mut(mb).insts.push(MachineInst {
                            opcode: ArmOpcode::CmpReg,
                            operands: vec![
                                MachineOperand::PhysReg(PhysReg::Gp(17)),
                                MachineOperand::PhysReg(PhysReg::Gp(9)),
                            ],
                            def: None,
                        });
                        mf.block_mut(mb).insts.push(MachineInst {
                            opcode: ArmOpcode::Cset,
                            operands: vec![
                                MachineOperand::PhysReg(PhysReg::Gp32(10)),
                                MachineOperand::Cond(hi_cond),
                            ],
                            def: None,
                        });
                        mf.block_mut(mb).insts.push(MachineInst {
                            opcode: ArmOpcode::Cset,
                            operands: vec![
                                MachineOperand::PhysReg(PhysReg::Gp32(11)),
                                MachineOperand::Cond(ArmCond::Eq),
                            ],
                            def: None,
                        });
                        mf.block_mut(mb).insts.push(MachineInst {
                            opcode: ArmOpcode::CmpReg,
                            operands: vec![
                                MachineOperand::PhysReg(PhysReg::Gp(16)),
                                MachineOperand::PhysReg(PhysReg::Gp(8)),
                            ],
                            def: None,
                        });
                        mf.block_mut(mb).insts.push(MachineInst {
                            opcode: ArmOpcode::Cset,
                            operands: vec![
                                MachineOperand::PhysReg(PhysReg::Gp32(8)),
                                MachineOperand::Cond(lo_cond),
                            ],
                            def: None,
                        });
                        mf.block_mut(mb).insts.push(MachineInst {
                            opcode: ArmOpcode::AndReg,
                            operands: vec![
                                MachineOperand::PhysReg(PhysReg::Gp32(11)),
                                MachineOperand::PhysReg(PhysReg::Gp32(11)),
                                MachineOperand::PhysReg(PhysReg::Gp32(8)),
                            ],
                            def: None,
                        });
                        mf.block_mut(mb).insts.push(MachineInst {
                            opcode: ArmOpcode::OrrReg,
                            operands: vec![
                                MachineOperand::PhysReg(PhysReg::Gp32(10)),
                                MachineOperand::PhysReg(PhysReg::Gp32(10)),
                                MachineOperand::PhysReg(PhysReg::Gp32(11)),
                            ],
                            def: None,
                        });
                    }
                }
                mf.block_mut(mb).insts.push(MachineInst {
                    opcode: ArmOpcode::MovReg,
                    operands: vec![
                        MachineOperand::VReg(dest),
                        MachineOperand::PhysReg(PhysReg::Gp32(10)),
                    ],
                    def: Some(dest),
                });
                return;
            }

            let dest = ctx.get_vreg(mf, inst.id, RegClass::Gp32);
            let va = icmp_operand_vreg(mf, ctx, mb, func, *a, *b);
            let vb = icmp_operand_vreg(mf, ctx, mb, func, *b, *a);
            mf.block_mut(mb).insts.push(MachineInst {
                opcode: ArmOpcode::CmpReg,
                operands: vec![MachineOperand::VReg(va), MachineOperand::VReg(vb)],
                def: None,
            });
            // If this ICmp feeds exclusively into a Select (detected in the
            // pre-pass), suppress CSET. The Select will use the flags directly.
            if !ctx.select_fused.contains(&inst.id) {
                mf.block_mut(mb).insts.push(MachineInst {
                    opcode: ArmOpcode::Cset,
                    operands: vec![
                        MachineOperand::VReg(dest),
                        MachineOperand::Cond(cmp_to_arm_cond(*op)),
                    ],
                    def: Some(dest),
                });
            }
        }
        InstKind::FCmp(op, a, b) => {
            let dest = ctx.get_vreg(mf, inst.id, RegClass::Gp32);
            let va = ctx.lookup_vreg(*a);
            let vb = ctx.lookup_vreg(*b);
            mf.block_mut(mb).insts.push(MachineInst {
                opcode: ArmOpcode::FCmpReg,
                operands: vec![MachineOperand::VReg(va), MachineOperand::VReg(vb)],
                def: None,
            });
            if !ctx.select_fused.contains(&inst.id) {
                mf.block_mut(mb).insts.push(MachineInst {
                    opcode: ArmOpcode::FCset,
                    operands: vec![
                        MachineOperand::VReg(dest),
                        MachineOperand::Cond(fcmp_to_arm_cond(*op)),
                    ],
                    def: Some(dest),
                });
            }
        }

        // ---- Logic ----
        InstKind::And(a, b) => emit_binop(mf, ctx, mb, inst, ArmOpcode::AndReg, *a, *b),
        InstKind::Or(a, b) => emit_binop(mf, ctx, mb, inst, ArmOpcode::OrrReg, *a, *b),
        InstKind::Not(a) => {
            // Logical NOT: CMP src, #0; CSET dest, EQ
            // If src == 0 (false), EQ is true → dest = 1 (true).
            // If src != 0 (true), EQ is false → dest = 0 (false).
            // This correctly handles any truthy value, not just 0/1.
            let dest = ctx.get_vreg(mf, inst.id, RegClass::Gp32);
            let va = ctx.lookup_vreg(*a);
            mf.block_mut(mb).insts.push(MachineInst {
                opcode: ArmOpcode::CmpImm,
                operands: vec![MachineOperand::VReg(va), MachineOperand::Imm(0)],
                def: None,
            });
            mf.block_mut(mb).insts.push(MachineInst {
                opcode: ArmOpcode::Cset,
                operands: vec![
                    MachineOperand::VReg(dest),
                    MachineOperand::Cond(ArmCond::Eq),
                ],
                def: Some(dest),
            });
        }

        // ---- Select (CSEL) ----
        //
        // Fast path: if the condition was produced by an ICmp/FCmp in the
        // same block with no other users, the pre-pass marked it as fused.
        // We already emitted `CMP a, b` (no CSET), so the flags are live.
        // Use them directly: `CSEL dest, tv, fv, <arm_cond>`.
        //
        // Slow path (unfused): the condition is an arbitrary boolean in a
        // register. Materialize with `CMP cond, #0; CSEL dest, tv, fv, NE`.
        InstKind::Select(cond, tv, fv) => {
            let class = type_to_reg_class(&inst.ty);
            let dest = ctx.get_vreg(mf, inst.id, class);
            let true_reg = coerce_select_operand_vreg(mf, ctx, mb, func, *tv, &inst.ty);
            let false_reg = coerce_select_operand_vreg(mf, ctx, mb, func, *fv, &inst.ty);

            let arm_cond = if let Some(&fused_cond) = ctx.fused_arm_cond.get(cond) {
                // Flags already set by the fused CMP — no extra compare needed.
                fused_cond
            } else {
                // Unfused: compare the boolean register against 0.
                let cond_reg = ctx.lookup_vreg(*cond);
                mf.block_mut(mb).insts.push(MachineInst {
                    opcode: ArmOpcode::CmpImm,
                    operands: vec![MachineOperand::VReg(cond_reg), MachineOperand::Imm(0)],
                    def: None,
                });
                ArmCond::Ne
            };

            let opcode = if class == RegClass::Fp32 || class == RegClass::Fp64 {
                ArmOpcode::FcselReg
            } else {
                ArmOpcode::CselReg
            };
            mf.block_mut(mb).insts.push(MachineInst {
                opcode,
                operands: vec![
                    MachineOperand::VReg(dest),
                    MachineOperand::VReg(true_reg),
                    MachineOperand::VReg(false_reg),
                    MachineOperand::Cond(arm_cond),
                ],
                def: Some(dest),
            });
        }

        // ---- Float: fabs, fsqrt ----
        InstKind::FAbs(a) => {
            let src = ctx.lookup_vreg(*a);
            let class = type_to_reg_class(&inst.ty);
            let dest = ctx.get_vreg(mf, inst.id, class);
            let opcode = if class == RegClass::Fp64 {
                ArmOpcode::FabsD
            } else {
                ArmOpcode::FabsS
            };
            mf.block_mut(mb).insts.push(MachineInst {
                opcode,
                operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)],
                def: Some(dest),
            });
        }
        InstKind::FSqrt(a) => {
            let src = ctx.lookup_vreg(*a);
            let class = type_to_reg_class(&inst.ty);
            let dest = ctx.get_vreg(mf, inst.id, class);
            let opcode = if class == RegClass::Fp64 {
                ArmOpcode::FsqrtD
            } else {
                ArmOpcode::FsqrtS
            };
            mf.block_mut(mb).insts.push(MachineInst {
                opcode,
                operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)],
                def: Some(dest),
            });
        }

        // ---- Bitwise ----
        InstKind::BitAnd(a, b) => emit_binop(mf, ctx, mb, inst, ArmOpcode::AndReg, *a, *b),
        InstKind::BitOr(a, b) => emit_binop(mf, ctx, mb, inst, ArmOpcode::OrrReg, *a, *b),
        InstKind::BitXor(a, b) => emit_binop(mf, ctx, mb, inst, ArmOpcode::EorReg, *a, *b),
        InstKind::BitNot(a) => {
            let src = ctx.lookup_vreg(*a);
            let class = type_to_reg_class(&inst.ty);
            let dest = ctx.get_vreg(mf, inst.id, class);
            mf.block_mut(mb).insts.push(MachineInst {
                opcode: ArmOpcode::Mvn,
                operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)],
                def: Some(dest),
            });
        }
        InstKind::Shl(a, b) => emit_binop(mf, ctx, mb, inst, ArmOpcode::LslReg, *a, *b),
        InstKind::LShr(a, b) => emit_binop(mf, ctx, mb, inst, ArmOpcode::LsrReg, *a, *b),
        InstKind::AShr(a, b) => emit_binop(mf, ctx, mb, inst, ArmOpcode::AsrReg, *a, *b),
        InstKind::CountLeadingZeros(a) => {
            let src = ctx.lookup_vreg(*a);
            let class = type_to_reg_class(&inst.ty);
            let dest = ctx.get_vreg(mf, inst.id, class);
            mf.block_mut(mb).insts.push(MachineInst {
                opcode: ArmOpcode::Clz,
                operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)],
                def: Some(dest),
            });
        }
        InstKind::CountTrailingZeros(a) => {
            // CTZ = CLZ(RBIT(x))
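            // RBIT reverses the bit order, so the trailing zeros of
            // x become the leading zeros of RBIT(x).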
            let src = ctx.lookup_vreg(*a);
            let class = type_to_reg_class(&inst.ty);
            let tmp = mf.new_vreg(class);
            let dest = ctx.get_vreg(mf, inst.id, class);
            mf.block_mut(mb).insts.push(MachineInst {
                opcode: ArmOpcode::Rbit,
                operands: vec![MachineOperand::VReg(tmp), MachineOperand::VReg(src)],
                def: Some(tmp),
            });
            mf.block_mut(mb).insts.push(MachineInst {
                opcode: ArmOpcode::Clz,
                operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(tmp)],
                def: Some(dest),
            });
        }
        InstKind::PopCount(a) => {
            // Proper ARM64 popcount is a NEON sequence:
            //   FMOV Vd.8B, Xn; CNT Vd.8B, Vd.8B; ADDV Bd, Vd.8B; FMOV Wd, Sd
            let src = ctx.lookup_vreg(*a);
            let class = type_to_reg_class(&inst.ty);
            let dest = ctx.get_vreg(mf, inst.id, class);
            // Placeholder until that sequence (or a runtime call) is
            // wired up: copy src to dest. This is NOT a correct
            // popcount; consumers see the input value unchanged.
            mf.block_mut(mb).insts.push(MachineInst {
                opcode: ArmOpcode::MovReg,
                operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)],
                def: Some(dest),
            });
        }

        // ---- Conversions ----
        InstKind::IntToFloat(a, fw) => {
            let src = ctx.lookup_vreg(*a);
            let src_class = mf.vregs.iter().find(|v| v.id == src).map(|v| v.class);
            let is_64bit_src = matches!(src_class, Some(RegClass::Gp64));
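            // SCVTF opcode suffixes here: first letter is the float
            // dest width (S = f32, D = f64), second the integer
            // source width (W = 32-bit, X = 64-bit).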
            let (class, opcode) = match (fw, is_64bit_src) {
                (FloatWidth::F32, false) => (RegClass::Fp32, ArmOpcode::ScvtfSW),
                (FloatWidth::F32, true) => (RegClass::Fp32, ArmOpcode::ScvtfSX),
                (FloatWidth::F64, false) => (RegClass::Fp64, ArmOpcode::ScvtfDW),
                (FloatWidth::F64, true) => (RegClass::Fp64, ArmOpcode::ScvtfDX),
            };
            let dest = ctx.get_vreg(mf, inst.id, class);
            mf.block_mut(mb).insts.push(MachineInst {
                opcode,
                operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)],
                def: Some(dest),
            });
        }
        InstKind::FloatToInt(a, iw) => {
            let src = ctx.lookup_vreg(*a);
            let src_class = mf.vregs.iter().find(|v| v.id == src).map(|v| v.class);
            let is_f64_src = matches!(src_class, Some(RegClass::Fp64));
            let is_64bit_dest = matches!(iw, IntWidth::I64);
            let class = int_width_class(iw);
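            // FCVTZS naming mirrors SCVTF: first suffix letter is
            // the integer dest width (W/X), second the float source
            // width (S/D).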
            let opcode = match (is_64bit_dest, is_f64_src) {
                (false, false) => ArmOpcode::FcvtzsWS,
                (false, true) => ArmOpcode::FcvtzsWD,
                (true, false) => ArmOpcode::FcvtzsXS,
                (true, true) => ArmOpcode::FcvtzsXD,
            };
            let dest = ctx.get_vreg(mf, inst.id, class);
            mf.block_mut(mb).insts.push(MachineInst {
                opcode,
                operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)],
                def: Some(dest),
            });
        }
        InstKind::FloatExtend(a, _) => {
            let src = ctx.lookup_vreg(*a);
            let dest = ctx.get_vreg(mf, inst.id, RegClass::Fp64);
            mf.block_mut(mb).insts.push(MachineInst {
                opcode: ArmOpcode::FcvtDS,
                operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)],
                def: Some(dest),
            });
        }
        InstKind::FloatTrunc(a, _) => {
            let src = ctx.lookup_vreg(*a);
            let dest = ctx.get_vreg(mf, inst.id, RegClass::Fp32);
            mf.block_mut(mb).insts.push(MachineInst {
                opcode: ArmOpcode::FcvtSD,
                operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)],
                def: Some(dest),
            });
        }

        // ---- Memory ----
        InstKind::GlobalAddr(name) => {
            // Materialize the address of a module-level global into
            // a Gp64 vreg via ADRP+ADD against `_globalname`. Loads
            // and stores then operate on this pointer the same way
            // they operate on an alloca address.
            let dest = ctx.get_vreg(mf, inst.id, RegClass::Gp64);
            mf.block_mut(mb).insts.push(MachineInst {
                opcode: ArmOpcode::AdrpAdd,
                operands: vec![
                    MachineOperand::VReg(dest),
                    MachineOperand::GlobalLabel(name.clone()),
                ],
                def: Some(dest),
            });
        }

        InstKind::Alloca(_) => {
            // Alloca is handled in Phase 1 (stack slot allocation).
            // The "address" is a frame slot offset. Map the ValueId to the offset.
            if let Some(&offset) = ctx.alloca_offsets.get(&inst.id) {
                let dest = ctx.get_vreg(mf, inst.id, RegClass::Gp64);
                // Materialize address: SUB dest, FP, #abs(offset)
                // Offsets are negative from FP, so we subtract the absolute value.
                let abs_offset = (-offset) as i64;
                mf.block_mut(mb).insts.push(MachineInst {
                    opcode: ArmOpcode::SubImm,
                    operands: vec![
                        MachineOperand::VReg(dest),
                        MachineOperand::PhysReg(PhysReg::FP),
                        MachineOperand::Imm(abs_offset),
                    ],
                    def: Some(dest),
                });
            }
        }

        InstKind::Load(addr) => {
            // Audit CRITICAL-2: dispatch on the IR result type so the
            // load opcode width matches the value, not the pointer.
            // Previously every integer load used `ldr w_, [_]` regardless
            // of width, silently reading 4 bytes for an i8 load.
            let class = type_to_reg_class(&inst.ty);
            let dest = ctx.get_vreg(mf, inst.id, class);
            let opcode = load_opcode_for(&inst.ty, class);
            let (base_op, offset_op) = narrow_load_store_addr(ctx, *addr);
            mf.block_mut(mb).insts.push(MachineInst {
                opcode,
                operands: vec![MachineOperand::VReg(dest), base_op, offset_op],
                def: Some(dest),
            });
        }

        InstKind::Store(val, addr) => {
            if matches!(func.value_type(*val), Some(IrType::Int(IntWidth::I128))) {
                let src_slot = ctx.lookup_wide_slot(*val);
                emit_load_phys_i128_pair(
                    mf,
                    mb,
                    MachineOperand::PhysReg(PhysReg::FP),
                    src_slot as i64,
                    PhysReg::Gp(16),
                    PhysReg::Gp(17),
                );
                if let Some(&offset) = ctx.alloca_offsets.get(addr) {
                    emit_store_phys_i128_pair(
                        mf,
                        mb,
                        MachineOperand::PhysReg(PhysReg::FP),
                        offset as i64,
                        PhysReg::Gp(16),
                        PhysReg::Gp(17),
                    );
                } else {
                    let base = ctx.lookup_vreg(*addr);
                    emit_store_phys_i128_pair(
                        mf,
                        mb,
                        MachineOperand::VReg(base),
                        0,
                        PhysReg::Gp(16),
                        PhysReg::Gp(17),
                    );
                }
                return;
            }

            let val_vreg = ctx.lookup_vreg(*val);
            // Audit CRITICAL-2: dispatch on the *value*'s declared IR
            // type, not the pointer's pointee — byte-level GEPs into
            // derived types and array constructors reuse `Ptr<i8>` as a
            // generic offset cursor, so dispatching by the pointee
            // would silently truncate non-byte stores.
            let val_ty = func.value_type(*val);
            let val_class = mf
                .vregs
                .iter()
                .find(|v| v.id == val_vreg)
                .map(|v| v.class)
                .unwrap_or(RegClass::Gp64);
            let opcode = store_opcode_for(val_ty.as_ref(), val_class);
            let (base_op, offset_op) = narrow_load_store_addr(ctx, *addr);
            mf.block_mut(mb).insts.push(MachineInst {
                opcode,
                operands: vec![MachineOperand::VReg(val_vreg), base_op, offset_op],
                def: None,
            });
        }

        InstKind::GetElementPtr(base, indices) => {
            // GEP: base + index * elem_size
            let dest = ctx.get_vreg(mf, inst.id, RegClass::Gp64);
            let base_src = ctx.lookup_vreg(*base);
            let base_vreg = if mf.vregs.iter().find(|v| v.id == base_src).map(|v| v.class)
                != Some(RegClass::Gp64)
            {
                let widened = mf.new_vreg(RegClass::Gp64);
                mf.block_mut(mb).insts.push(MachineInst {
                    opcode: ArmOpcode::MovReg,
                    operands: vec![
                        MachineOperand::VReg(widened),
                        MachineOperand::VReg(base_src),
                    ],
                    def: Some(widened),
                });
                widened
            } else {
                base_src
            };

            // Determine element size from the GEP result type (Ptr<elem_ty>).
            // Bool occupies 1 byte both in SSA and in `alloca [Bool x N]`
            // storage; the prior 4-byte override here desynced GEP byte
            // strides from `alloca` byte strides, so `arr(i) = .true.` for
            // a stack `logical :: arr(N)` wrote 3 bytes past the slot.
            let elem_size = match &inst.ty {
                IrType::Ptr(inner) => match inner.as_ref() {
                    IrType::Struct(_) => alloca_size(inner) as i64,
                    _ => inner.size_bytes() as i64,
                },
                _ => 4, // fallback
            };

            if let Some(idx) = indices.first() {
                let idx_src = ctx.lookup_vreg(*idx);
                let idx_vreg = if mf.vregs.iter().find(|v| v.id == idx_src).map(|v| v.class)
                    == Some(RegClass::Gp64)
                {
                    idx_src
                } else {
                    let widened = mf.new_vreg(RegClass::Gp64);
                    let opcode = if matches!(func.value_type(*idx), Some(IrType::Bool)) {
                        ArmOpcode::MovReg
                    } else {
                        ArmOpcode::Sxtw
                    };
                    mf.block_mut(mb).insts.push(MachineInst {
                        opcode,
                        operands: vec![
                            MachineOperand::VReg(widened),
                            MachineOperand::VReg(idx_src),
                        ],
                        def: Some(widened),
                    });
                    widened
                };
                let tmp = mf.new_vreg(RegClass::Gp64);
                emit_const_int(mf, mb, tmp, elem_size as i128, IntWidth::I64);
                let scaled = mf.new_vreg(RegClass::Gp64);
                mf.block_mut(mb).insts.push(MachineInst {
                    opcode: ArmOpcode::Mul,
                    operands: vec![
                        MachineOperand::VReg(scaled),
                        MachineOperand::VReg(idx_vreg),
                        MachineOperand::VReg(tmp),
                    ],
                    def: Some(scaled),
                });
                mf.block_mut(mb).insts.push(MachineInst {
                    opcode: ArmOpcode::AddReg,
                    operands: vec![
                        MachineOperand::VReg(dest),
                        MachineOperand::VReg(base_vreg),
                        MachineOperand::VReg(scaled),
                    ],
                    def: Some(dest),
                });
            } else {
                // No indices — just copy the base.
                mf.block_mut(mb).insts.push(MachineInst {
                    opcode: ArmOpcode::MovReg,
                    operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(base_vreg)],
                    def: Some(dest),
                });
            }
        }

        // ---- Calls ----
        InstKind::Call(..) | InstKind::RuntimeCall(..) => {
            select_call_inst(mf, ctx, mb, inst, func);
        }

        // ---- Integer extend/truncate ----
        InstKind::IntExtend(a, _target_width, signed) => {
            let src = ctx.lookup_vreg(*a);
            // Pick the opcode based on the SOURCE width, not the
            // target. ARM64 has distinct SXTB/SXTH/SXTW instructions
            // for 8/16/32-bit sources; using SXTW on anything other
            // than a 32-bit source (or with a non-X dest) yields
            // "invalid operand for instruction" at the assembler.
            let src_ty = func.value_type(*a);
            let src_width = match src_ty.as_ref() {
                Some(IrType::Int(IntWidth::I8)) => 8,
                Some(IrType::Int(IntWidth::I16)) => 16,
                Some(IrType::Int(IntWidth::I32)) | Some(IrType::Bool) => 32,
                Some(IrType::Int(IntWidth::I64)) => 64,
                _ => 32, // conservative default
            };
            let dest_width = match &inst.ty {
                IrType::Int(IntWidth::I8)
                | IrType::Int(IntWidth::I16)
                | IrType::Int(IntWidth::I32)
                | IrType::Bool => 32,
                IrType::Int(IntWidth::I64) => 64,
                _ => 32,
            };
            // Dest register class follows the declared target
            // bit-width, with one exception: SXTW requires an
            // X-register destination, so promote to Gp64 when
            // source is 32 AND target is 64.
            let dest_class = if dest_width == 64 {
                RegClass::Gp64
            } else {
                RegClass::Gp32
            };
            let dest = ctx.get_vreg(mf, inst.id, dest_class);

            if !*signed {
                // Zero-extend: MOV (ARM64 implicitly zero-extends W→X).
                mf.block_mut(mb).insts.push(MachineInst {
                    opcode: ArmOpcode::MovReg,
                    operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)],
                    def: Some(dest),
                });
            } else if src_width >= dest_width {
                // Same-width or wider source (bogus from lowering's
                // perspective but observed in practice when a
                // function-result intrinsic mis-resolves). Emit MOV
                // rather than an illegal SXTW Wd, Wn.
                mf.block_mut(mb).insts.push(MachineInst {
                    opcode: ArmOpcode::MovReg,
                    operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)],
                    def: Some(dest),
                });
            } else {
                let opcode = match src_width {
                    8 => ArmOpcode::Sxtb,
                    16 => ArmOpcode::Sxth,
                    32 => ArmOpcode::Sxtw,
                    // Unreachable: the src_width >= dest_width branch
                    // above already caught 64-bit sources.
                    _ => ArmOpcode::MovReg,
1637 };
1638 mf.block_mut(mb).insts.push(MachineInst {
1639 opcode,
1640 operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)],
1641 def: Some(dest),
1642 });
1643 }
1644 }
1645
1646 InstKind::IntTrunc(a, _) => {
1647 let src = ctx.lookup_vreg(*a);
1648 let class = type_to_reg_class(&inst.ty);
1649 let dest = ctx.get_vreg(mf, inst.id, class);
1650 // Truncate: just MOV — the 32-bit register naturally truncates.
1651 mf.block_mut(mb).insts.push(MachineInst {
1652 opcode: ArmOpcode::MovReg,
1653 operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)],
1654 def: Some(dest),
1655 });
1656 }
1657
1658 InstKind::PtrToInt(a) => {
1659 // Pointer is already an i64 in a GP register — just mov.
1660 let src = ctx.lookup_vreg(*a);
1661 let class = type_to_reg_class(&inst.ty);
1662 let dest = ctx.get_vreg(mf, inst.id, class);
1663 mf.block_mut(mb).insts.push(MachineInst {
1664 opcode: ArmOpcode::MovReg,
1665 operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)],
1666 def: Some(dest),
1667 });
1668 }
1669
1670 InstKind::IntToPtr(a, _) => {
1671 // Integer already in a GP register — treat as pointer via mov.
1672 let src = ctx.lookup_vreg(*a);
1673 let dest = ctx.get_vreg(mf, inst.id, RegClass::Gp64);
1674 mf.block_mut(mb).insts.push(MachineInst {
1675 opcode: ArmOpcode::MovReg,
1676 operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)],
1677 def: Some(dest),
1678 });
1679 }
1680
1681 // ---- SIMD vector ops (Sprint 12 Stage 2 isel hookup) ----
1682 //
1683 // The vectorizer (Stage 4) is what will start producing
1684 // these. Each arm picks a NEON ArmOpcode based on the result
1685 // vector's lane shape. Mixed-shape ops (e.g. integer 8×i16
1686 // narrow lanes) aren't selected here — Stage 4 will only
1687 // emit the four shapes covered by `VShape`.
1688 InstKind::VAdd(a, b) => emit_vbinop(mf, ctx, mb, inst, *a, *b, |s| match s {
1689 VShape::V4S => ArmOpcode::AddV4S,
1690 VShape::V2D => ArmOpcode::AddV2D,
1691 VShape::F4S => ArmOpcode::FaddV4S,
1692 VShape::F2D => ArmOpcode::FaddV2D,
1693 }),
1694 InstKind::VSub(a, b) => emit_vbinop(mf, ctx, mb, inst, *a, *b, |s| match s {
1695 VShape::V4S => ArmOpcode::SubV4S,
1696 VShape::V2D => ArmOpcode::SubV2D,
1697 VShape::F4S => ArmOpcode::FsubV4S,
1698 VShape::F2D => ArmOpcode::FsubV2D,
1699 }),
1700 InstKind::VMul(a, b) => emit_vbinop(mf, ctx, mb, inst, *a, *b, |s| match s {
1701 VShape::V4S => ArmOpcode::MulV4S,
1702 // NEON has no integer 2D mul — Stage 4 should not request
1703 // it; if it does we fall through to a placeholder.
1704 VShape::V2D => ArmOpcode::Nop,
1705 VShape::F4S => ArmOpcode::FmulV4S,
1706 VShape::F2D => ArmOpcode::FmulV2D,
1707 }),
1708 InstKind::VDiv(a, b) => emit_vbinop(mf, ctx, mb, inst, *a, *b, |s| match s {
1709 // No integer NEON divide — emit a placeholder; the
1710 // vectorizer should refuse to pick V128 lanes for VDiv
1711 // on integer types. Float forms exist.
1712 VShape::V4S | VShape::V2D => ArmOpcode::Nop,
1713 VShape::F4S => ArmOpcode::FdivV4S,
1714 VShape::F2D => ArmOpcode::FdivV2D,
1715 }),
1716 InstKind::VNeg(a) => emit_vunop(mf, ctx, mb, inst, *a, |s| match s {
1717 VShape::V4S => ArmOpcode::NegV4S,
1718 VShape::V2D => ArmOpcode::NegV2D,
1719 VShape::F4S => ArmOpcode::FnegV4S,
1720 VShape::F2D => ArmOpcode::FnegV2D,
1721 }),
1722 InstKind::VAbs(a) => emit_vunop(mf, ctx, mb, inst, *a, |s| match s {
1723 VShape::F4S => ArmOpcode::FabsV4S,
1724 VShape::F2D => ArmOpcode::FabsV2D,
1725 // NEON `abs` exists for integer too but the four-shape
1726 // alias isn't generated yet; placeholder.
1727 VShape::V4S | VShape::V2D => ArmOpcode::Nop,
1728 }),
1729 InstKind::VSqrt(a) => emit_vunop(mf, ctx, mb, inst, *a, |s| match s {
1730 VShape::F4S => ArmOpcode::FsqrtV4S,
1731 VShape::F2D => ArmOpcode::FsqrtV2D,
1732 // sqrt is float-only.
1733 VShape::V4S | VShape::V2D => ArmOpcode::Nop,
1734 }),
1735 InstKind::VFma(a, b, c) => {
1736 // FMLA is dest += a*b. Conventional 3-operand call
1737 // assumes dest is a fresh vreg — emit a copy-from-c
1738 // followed by FMLA. Stage 4 should fold the copy when it
1739 // tracks SSA destinations more carefully.
1740 let shape = match VShape::from_ir(&inst.ty) {
1741 Some(s) if s.is_float() => s,
1742 _ => {
1743 // unsupported shape — placeholder
1744 let dest = ctx.get_vreg(mf, inst.id, RegClass::V128);
1745 mf.block_mut(mb).insts.push(MachineInst {
1746 opcode: ArmOpcode::Nop,
1747 operands: vec![],
1748 def: Some(dest),
1749 });
1750 return;
1751 }
1752 };
1753 let opcode = match shape {
1754 VShape::F4S => ArmOpcode::FmlaV4S,
1755 VShape::F2D => ArmOpcode::FmlaV2D,
1756 _ => unreachable!(),
1757 };
1758 let va = ctx.lookup_vreg(*a);
1759 let vb = ctx.lookup_vreg(*b);
1760 let vc = ctx.lookup_vreg(*c);
1761 let dest = ctx.get_vreg(mf, inst.id, RegClass::V128);
1762 // dest = c (init accumulator). Must use Mov16B (mov.16b)
1763 // for V128 — fmov d, d truncates to 64 bits and silently
1764 // drops the upper lanes.
1765 mf.block_mut(mb).insts.push(MachineInst {
1766 opcode: ArmOpcode::Mov16B,
1767 operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(vc)],
1768 def: Some(dest),
1769 });
1770 // dest += a * b
1771 mf.block_mut(mb).insts.push(MachineInst {
1772 opcode,
1773 operands: vec![
1774 MachineOperand::VReg(dest),
1775 MachineOperand::VReg(va),
1776 MachineOperand::VReg(vb),
1777 ],
1778 def: Some(dest),
1779 });
1780 }
1781 InstKind::VSelect(mask, t, f) => {
1782 // BSL is destructive: bsl Vd.16b, Vn.16b, Vm.16b → for
1783 // each bit, if Vd then Vn else Vm. So we copy the mask
1784 // into the dest first (mov.16b), then bsl with t/f.
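// Net effect per bit: dest = (mask & t) | (!mask & f).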
1785 let vmask = ctx.lookup_vreg(*mask);
1786 let vt = ctx.lookup_vreg(*t);
1787 let vf = ctx.lookup_vreg(*f);
1788 let dest = ctx.get_vreg(mf, inst.id, RegClass::V128);
1789 mf.block_mut(mb).insts.push(MachineInst {
1790 opcode: ArmOpcode::Mov16B,
1791 operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(vmask)],
1792 def: Some(dest),
1793 });
1794 mf.block_mut(mb).insts.push(MachineInst {
1795 opcode: ArmOpcode::BslV16B,
1796 operands: vec![
1797 MachineOperand::VReg(dest),
1798 MachineOperand::VReg(vt),
1799 MachineOperand::VReg(vf),
1800 ],
1801 def: Some(dest),
1802 });
1803 }
1804 InstKind::VLoad(addr) => {
1805 let dest = ctx.get_vreg(mf, inst.id, RegClass::V128);
1806 let base = ctx.lookup_vreg(*addr);
1807 mf.block_mut(mb).insts.push(MachineInst {
1808 opcode: ArmOpcode::LdrQ,
1809 operands: vec![
1810 MachineOperand::VReg(dest),
1811 MachineOperand::VReg(base),
1812 MachineOperand::Imm(0),
1813 ],
1814 def: Some(dest),
1815 });
1816 }
1817 InstKind::VStore(val, addr) => {
1818 let v = ctx.lookup_vreg(*val);
1819 let base = ctx.lookup_vreg(*addr);
1820 mf.block_mut(mb).insts.push(MachineInst {
1821 opcode: ArmOpcode::StrQ,
1822 operands: vec![
1823 MachineOperand::VReg(v),
1824 MachineOperand::VReg(base),
1825 MachineOperand::Imm(0),
1826 ],
1827 def: None,
1828 });
1829 }
1830 InstKind::VFCmp(op, a, b) => {
// NEON float compares produce an all-ones / all-zeros mask per
// lane. Eq/Ge/Gt map directly to fcmeq/fcmge/fcmgt; Lt and Le
// reuse fcmgt/fcmge with the operands swapped. Ne has no
// single-instruction NEON form (it would need fcmeq plus an
// invert); the vectorizer doesn't emit Ne, so we don't handle
// it yet.
1836 let dest = ctx.get_vreg(mf, inst.id, RegClass::V128);
1837 let va = ctx.lookup_vreg(*a);
1838 let vb = ctx.lookup_vreg(*b);
1839 let shape = VShape::from_ir(&inst.ty);
1840 let (opcode, swap) = match (shape, op) {
1841 (Some(VShape::F4S), CmpOp::Gt) => (ArmOpcode::FcmgtV4S, false),
1842 (Some(VShape::F2D), CmpOp::Gt) => (ArmOpcode::FcmgtV2D, false),
1843 (Some(VShape::F4S), CmpOp::Ge) => (ArmOpcode::FcmgeV4S, false),
1844 (Some(VShape::F2D), CmpOp::Ge) => (ArmOpcode::FcmgeV2D, false),
1845 (Some(VShape::F4S), CmpOp::Eq) => (ArmOpcode::FcmeqV4S, false),
1846 (Some(VShape::F2D), CmpOp::Eq) => (ArmOpcode::FcmeqV2D, false),
1847 (Some(VShape::F4S), CmpOp::Lt) => (ArmOpcode::FcmgtV4S, true),
1848 (Some(VShape::F2D), CmpOp::Lt) => (ArmOpcode::FcmgtV2D, true),
1849 (Some(VShape::F4S), CmpOp::Le) => (ArmOpcode::FcmgeV4S, true),
1850 (Some(VShape::F2D), CmpOp::Le) => (ArmOpcode::FcmgeV2D, true),
1851 _ => (ArmOpcode::Nop, false),
1852 };
1853 let (lhs, rhs) = if swap { (vb, va) } else { (va, vb) };
1854 mf.block_mut(mb).insts.push(MachineInst {
1855 opcode,
1856 operands: vec![
1857 MachineOperand::VReg(dest),
1858 MachineOperand::VReg(lhs),
1859 MachineOperand::VReg(rhs),
1860 ],
1861 def: Some(dest),
1862 });
1863 }
1864 InstKind::VICmp(op, a, b) => {
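// Integer lane compares produce the same all-ones / all-zeros
// mask. Only the 4×i32 shape is selected so far; 2×i64
// (cmgt.2d and friends) falls through to the Nop placeholder.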
1865 let dest = ctx.get_vreg(mf, inst.id, RegClass::V128);
1866 let va = ctx.lookup_vreg(*a);
1867 let vb = ctx.lookup_vreg(*b);
1868 let shape = VShape::from_ir(&inst.ty);
1869 let (opcode, swap) = match (shape, op) {
1870 (Some(VShape::V4S), CmpOp::Gt) => (ArmOpcode::CmgtV4S, false),
1871 (Some(VShape::V4S), CmpOp::Ge) => (ArmOpcode::CmgeV4S, false),
1872 (Some(VShape::V4S), CmpOp::Eq) => (ArmOpcode::CmeqV4S, false),
1873 (Some(VShape::V4S), CmpOp::Lt) => (ArmOpcode::CmgtV4S, true),
1874 (Some(VShape::V4S), CmpOp::Le) => (ArmOpcode::CmgeV4S, true),
1875 _ => (ArmOpcode::Nop, false),
1876 };
1877 let (lhs, rhs) = if swap { (vb, va) } else { (va, vb) };
1878 mf.block_mut(mb).insts.push(MachineInst {
1879 opcode,
1880 operands: vec![
1881 MachineOperand::VReg(dest),
1882 MachineOperand::VReg(lhs),
1883 MachineOperand::VReg(rhs),
1884 ],
1885 def: Some(dest),
1886 });
1887 }
1888 InstKind::VBroadcast(scalar) => {
1889 let s = ctx.lookup_vreg(*scalar);
1890 let dest = ctx.get_vreg(mf, inst.id, RegClass::V128);
1891 // Float scalars live in S/D registers — splatting from
1892 // those uses the lane-dup form (`dup.4s vN, vM.s[0]`).
1893 // Integer scalars live in W/X registers — splatting from
1894 // those uses the gp-dup form (`dup.4s vN, wM`).
1895 let opcode = match VShape::from_ir(&inst.ty) {
1896 Some(VShape::V4S) => ArmOpcode::DupGen4S,
1897 Some(VShape::V2D) => ArmOpcode::DupGen2D,
1898 Some(VShape::F4S) => ArmOpcode::DupEl4S,
1899 Some(VShape::F2D) => ArmOpcode::DupEl2D,
1900 None => ArmOpcode::Nop,
1901 };
1902 mf.block_mut(mb).insts.push(MachineInst {
1903 opcode,
1904 operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(s)],
1905 def: Some(dest),
1906 });
1907 }
1908 InstKind::VReduceSum(v) => {
1909 // Cross-lane sum. The reduction instruction writes its
1910 // 32/64-bit result into the FP register file (sN/dN view
1911 // of vN). For float results that's already what we want;
1912 // for int results we follow up with a `umov.s/.d` move
1913 // from the FP lane back into a GP register.
1914 //
// F4S → faddp.4s + faddp.2s (NEON has no faddv.4s)
// F2D → faddp d_dest, v_src.2d
// int(I32) → addv.4s s_tmp, v_src; umov.s w_dest, v_tmp[0]
// int(I64) → addp.2d v_tmp, v_src, v_src; umov.d x_dest, v_tmp[0]
// (the 4-lane i32 sum wraps at i32 width; the
// caller is expected to sign-extend if it
// wanted i64 semantics — matches scalar IAdd)
1922 let src = ctx.lookup_vreg(*v);
1923 match &inst.ty {
1924 IrType::Float(FloatWidth::F32) => {
1925 // NEON has no `faddv.4s`. Reduce 4 f32 lanes
1926 // with two pairwise adds:
1927 // 1) `faddp.4s v_tmp, v_src, v_src`
1928 // → [a+b, c+d, a+b, c+d]
1929 // 2) `faddp.2s s_dest, v_tmp`
1930 // → (a+b)+(c+d) — the full sum
1931 let tmp = mf.new_vreg(RegClass::V128);
1932 mf.block_mut(mb).insts.push(MachineInst {
1933 opcode: ArmOpcode::FaddpV4S,
1934 operands: vec![
1935 MachineOperand::VReg(tmp),
1936 MachineOperand::VReg(src),
1937 MachineOperand::VReg(src),
1938 ],
1939 def: Some(tmp),
1940 });
1941 let class = type_to_reg_class(&inst.ty);
1942 let dest = ctx.get_vreg(mf, inst.id, class);
1943 mf.block_mut(mb).insts.push(MachineInst {
1944 opcode: ArmOpcode::FaddpV2S,
1945 operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(tmp)],
1946 def: Some(dest),
1947 });
1948 }
1949 IrType::Float(FloatWidth::F64) => {
1950 let class = type_to_reg_class(&inst.ty);
1951 let dest = ctx.get_vreg(mf, inst.id, class);
1952 mf.block_mut(mb).insts.push(MachineInst {
1953 opcode: ArmOpcode::FaddpV2D,
1954 operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)],
1955 def: Some(dest),
1956 });
1957 }
1958 IrType::Int(IntWidth::I32) => {
1959 // 4×i32 → scalar via `addv.4s s_tmp, v_src` then
1960 // `umov.s w_dest, v_tmp[0]`.
1961 let tmp = mf.new_vreg(RegClass::V128);
1962 mf.block_mut(mb).insts.push(MachineInst {
1963 opcode: ArmOpcode::Addv4S,
1964 operands: vec![MachineOperand::VReg(tmp), MachineOperand::VReg(src)],
1965 def: Some(tmp),
1966 });
1967 let class = type_to_reg_class(&inst.ty);
1968 let dest = ctx.get_vreg(mf, inst.id, class);
1969 mf.block_mut(mb).insts.push(MachineInst {
1970 opcode: ArmOpcode::Umov4S,
1971 operands: vec![
1972 MachineOperand::VReg(dest),
1973 MachineOperand::VReg(tmp),
1974 MachineOperand::Imm(0),
1975 ],
1976 def: Some(dest),
1977 });
1978 }
1979 IrType::Int(IntWidth::I64) => {
1980 // 2×i64 → scalar via pairwise add (`addp.2d
1981 // v_tmp, v_src, v_src`) then `umov.d x_dest,
1982 // v_tmp[0]`. NEON has no `addv.2d`, so the
1983 // pairwise form is the canonical i64 reduce.
1984 let tmp = mf.new_vreg(RegClass::V128);
1985 mf.block_mut(mb).insts.push(MachineInst {
1986 opcode: ArmOpcode::AddpV2D,
1987 operands: vec![
1988 MachineOperand::VReg(tmp),
1989 MachineOperand::VReg(src),
1990 MachineOperand::VReg(src),
1991 ],
1992 def: Some(tmp),
1993 });
1994 let class = type_to_reg_class(&inst.ty);
1995 let dest = ctx.get_vreg(mf, inst.id, class);
1996 mf.block_mut(mb).insts.push(MachineInst {
1997 opcode: ArmOpcode::Umov2D,
1998 operands: vec![
1999 MachineOperand::VReg(dest),
2000 MachineOperand::VReg(tmp),
2001 MachineOperand::Imm(0),
2002 ],
2003 def: Some(dest),
2004 });
2005 }
2006 IrType::Int(_) => {
2007 let class = type_to_reg_class(&inst.ty);
2008 let dest = ctx.get_vreg(mf, inst.id, class);
2009 mf.block_mut(mb).insts.push(MachineInst {
2010 opcode: ArmOpcode::Nop,
2011 operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)],
2012 def: Some(dest),
2013 });
2014 }
2015 _ => {
2016 let class = type_to_reg_class(&inst.ty);
2017 let dest = ctx.get_vreg(mf, inst.id, class);
2018 mf.block_mut(mb).insts.push(MachineInst {
2019 opcode: ArmOpcode::Nop,
2020 operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)],
2021 def: Some(dest),
2022 });
2023 }
2024 }
2025 }
2026 InstKind::VExtract(v, lane) => {
2027 let src = ctx.lookup_vreg(*v);
2028 let class = type_to_reg_class(&inst.ty);
2029 let dest = ctx.get_vreg(mf, inst.id, class);
2030 let opcode = match &inst.ty {
2031 IrType::Int(IntWidth::I32) => ArmOpcode::Umov4S,
2032 IrType::Int(IntWidth::I64) => ArmOpcode::Umov2D,
2033 IrType::Float(FloatWidth::F32) => ArmOpcode::FmovEl4S,
2034 IrType::Float(FloatWidth::F64) => ArmOpcode::FmovEl2D,
2035 _ => ArmOpcode::Nop,
2036 };
2037 mf.block_mut(mb).insts.push(MachineInst {
2038 opcode,
2039 operands: vec![
2040 MachineOperand::VReg(dest),
2041 MachineOperand::VReg(src),
2042 MachineOperand::Imm(*lane as i64),
2043 ],
2044 def: Some(dest),
2045 });
2046 }
2047
2048 InstKind::VMin(a, b) | InstKind::VMax(a, b) => {
2049 let va = ctx.lookup_vreg(*a);
2050 let vb = ctx.lookup_vreg(*b);
2051 let dest = ctx.get_vreg(mf, inst.id, RegClass::V128);
2052 let is_max = matches!(inst.kind, InstKind::VMax(..));
2053 let opcode = match (VShape::from_ir(&inst.ty), is_max) {
2054 (Some(VShape::V4S), true) => ArmOpcode::SmaxV4S,
2055 (Some(VShape::V4S), false) => ArmOpcode::SminV4S,
2056 (Some(VShape::F4S), true) => ArmOpcode::FmaxV4S,
2057 (Some(VShape::F4S), false) => ArmOpcode::FminV4S,
2058 (Some(VShape::F2D), true) => ArmOpcode::FmaxV2D,
2059 (Some(VShape::F2D), false) => ArmOpcode::FminV2D,
2060 _ => ArmOpcode::Nop,
2061 };
2062 mf.block_mut(mb).insts.push(MachineInst {
2063 opcode,
2064 operands: vec![
2065 MachineOperand::VReg(dest),
2066 MachineOperand::VReg(va),
2067 MachineOperand::VReg(vb),
2068 ],
2069 def: Some(dest),
2070 });
2071 }
2072 InstKind::VReduceMin(v) | InstKind::VReduceMax(v) => {
2073 let src = ctx.lookup_vreg(*v);
2074 let is_max = matches!(inst.kind, InstKind::VReduceMax(..));
2075 match &inst.ty {
2076 IrType::Int(IntWidth::I32) => {
2077 let tmp = mf.new_vreg(RegClass::V128);
2078 let opcode = if is_max {
2079 ArmOpcode::Smaxv4S
2080 } else {
2081 ArmOpcode::Sminv4S
2082 };
2083 mf.block_mut(mb).insts.push(MachineInst {
2084 opcode,
2085 operands: vec![MachineOperand::VReg(tmp), MachineOperand::VReg(src)],
2086 def: Some(tmp),
2087 });
2088 let class = type_to_reg_class(&inst.ty);
2089 let dest = ctx.get_vreg(mf, inst.id, class);
2090 mf.block_mut(mb).insts.push(MachineInst {
2091 opcode: ArmOpcode::Umov4S,
2092 operands: vec![
2093 MachineOperand::VReg(dest),
2094 MachineOperand::VReg(tmp),
2095 MachineOperand::Imm(0),
2096 ],
2097 def: Some(dest),
2098 });
2099 }
2100 IrType::Float(FloatWidth::F32) => {
2101 // fmaxv.4s / fminv.4s s_dest, v_src
2102 let class = type_to_reg_class(&inst.ty);
2103 let dest = ctx.get_vreg(mf, inst.id, class);
2104 let opcode = if is_max {
2105 ArmOpcode::FmaxvV4S
2106 } else {
2107 ArmOpcode::FminvV4S
2108 };
2109 mf.block_mut(mb).insts.push(MachineInst {
2110 opcode,
2111 operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)],
2112 def: Some(dest),
2113 });
2114 }
2115 IrType::Float(FloatWidth::F64) => {
2116 // NEON has no fmaxv.2d; the pairwise scalar form
2117 // (fmaxp.2d d_dest, v_src) is the across-lane
2118 // reduction for two f64 lanes.
2119 let class = type_to_reg_class(&inst.ty);
2120 let dest = ctx.get_vreg(mf, inst.id, class);
2121 let opcode = if is_max {
2122 ArmOpcode::FmaxpV2DScalar
2123 } else {
2124 ArmOpcode::FminpV2DScalar
2125 };
2126 mf.block_mut(mb).insts.push(MachineInst {
2127 opcode,
2128 operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)],
2129 def: Some(dest),
2130 });
2131 }
2132 _ => {
2133 let class = type_to_reg_class(&inst.ty);
2134 let dest = ctx.get_vreg(mf, inst.id, class);
2135 mf.block_mut(mb).insts.push(MachineInst {
2136 opcode: ArmOpcode::Nop,
2137 operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)],
2138 def: Some(dest),
2139 });
2140 }
2141 }
2142 }
2143
// Remaining: ExtractField, InsertField, and the vector ops
// not selected above (VInsert, VBitcast) — placeholder. Land
// per-op as the vectorizer grows in Stage 4.
2147 _ => {
2148 let class = type_to_reg_class(&inst.ty);
2149 let _dest = ctx.get_vreg(mf, inst.id, class);
2150 mf.block_mut(mb).insts.push(MachineInst {
2151 opcode: ArmOpcode::Nop,
2152 operands: vec![],
2153 def: None,
2154 });
2155 }
2156 }
2157 }
2158
2159 /// Select machine instructions for a terminator.
2160 fn select_terminator(
2161 mf: &mut MachineFunction,
2162 ctx: &mut ISelCtx,
2163 mb: MBlockId,
2164 term: &Terminator,
2165 src_block: &BasicBlock,
2166 func: &Function,
2167 ) {
let _ = src_block; // not needed yet — `term` carries its own args; kept for signature symmetry
2169 match term {
2170 Terminator::Return(None) => {
2171 emit_epilogue(mf, mb);
2172 }
2173 Terminator::Return(Some(val)) => {
2174 if matches!(func.value_type(*val), Some(IrType::Int(IntWidth::I128))) {
2175 let src_slot = ctx.lookup_wide_slot(*val);
2176 emit_load_phys_i128_pair(
2177 mf,
2178 mb,
2179 MachineOperand::PhysReg(PhysReg::FP),
2180 src_slot as i64,
2181 PhysReg::Gp(0),
2182 PhysReg::Gp(1),
2183 );
2184 emit_epilogue(mf, mb);
2185 return;
2186 }
2187 // Move result to X0 (integer) or D0 (float).
2188 let src = ctx.lookup_vreg(*val);
2189 let class = mf.vregs.iter().find(|v| v.id == src).map(|v| v.class);
2190 let (reg, opcode) = match class {
2191 Some(RegClass::Fp64) => (PhysReg::Fp(0), ArmOpcode::FmovReg),
2192 Some(RegClass::Fp32) => (PhysReg::Fp32(0), ArmOpcode::FmovReg),
2193 Some(RegClass::Gp32) => (PhysReg::Gp32(0), ArmOpcode::MovReg),
2194 _ => (PhysReg::Gp(0), ArmOpcode::MovReg),
2195 };
2196 mf.block_mut(mb).insts.push(MachineInst {
2197 opcode,
2198 operands: vec![MachineOperand::PhysReg(reg), MachineOperand::VReg(src)],
2199 def: None,
2200 });
2201 emit_epilogue(mf, mb);
2202 }
2203 Terminator::Branch(dest, args) => {
2204 // Emit parallel copy from each branch arg into the
2205 // target block's corresponding param vreg BEFORE the
2206 // actual branch instruction. Without this, block
2207 // parameters introduced by mem2reg or the lowerer
2208 // would never receive their incoming values at edge
2209 // points, producing infinite loops or stale data.
2210 emit_branch_arg_copies(mf, ctx, mb, *dest, args);
2211 let target = ctx.lookup_block(*dest);
2212 mf.block_mut(mb).insts.push(MachineInst {
2213 opcode: ArmOpcode::B,
2214 operands: vec![MachineOperand::BlockRef(target)],
2215 def: None,
2216 });
2217 }
2218 Terminator::CondBranch {
2219 cond,
2220 true_dest,
2221 true_args,
2222 false_dest,
2223 false_args,
2224 } => {
2225 let cond_vreg = ctx.lookup_vreg(*cond);
2226 let true_mb = ctx.lookup_block(*true_dest);
2227 let false_mb = ctx.lookup_block(*false_dest);
2228
// For a conditional branch, each arm's parallel copies must
// run only on that arm's edge; copies emitted inline before
// the branch would also execute on the other path. Target
// shape:
//
//   CMP  cond, #0
//   B.NE true_target   (true_target = true_dest, or a shim)
//   B    false_target  (likewise)
//
// When an arm carries branch args, we materialize a shim
// machine block that performs that arm's copies and then
// jumps to the real destination, and the branch retargets
// to the shim. Arms with no copies branch directly, so the
// common case stays at two instructions.
2248 mf.block_mut(mb).insts.push(MachineInst {
2249 opcode: ArmOpcode::CmpImm,
2250 operands: vec![MachineOperand::VReg(cond_vreg), MachineOperand::Imm(0)],
2251 def: None,
2252 });
2253
2254 // True arm: if there are branch args to copy, create
2255 // a shim block that does the copies then jumps to the
2256 // true destination. Otherwise, branch directly.
2257 let true_target = if true_args.is_empty() {
2258 true_mb
2259 } else {
2260 // Prefix with the function name so labels stay
2261 // unique across functions in the same .s file. Two
2262 // functions could otherwise both emit `L3_true_shim`.
2263 let label = format!("L{}_{}_true_shim", mf.name, mb.0);
2264 let shim = mf.new_block(&label);
2265 emit_branch_arg_copies(mf, ctx, shim, *true_dest, true_args);
2266 mf.block_mut(shim).insts.push(MachineInst {
2267 opcode: ArmOpcode::B,
2268 operands: vec![MachineOperand::BlockRef(true_mb)],
2269 def: None,
2270 });
2271 shim
2272 };
2273
2274 mf.block_mut(mb).insts.push(MachineInst {
2275 opcode: ArmOpcode::BCond,
2276 operands: vec![
2277 MachineOperand::Cond(ArmCond::Ne),
2278 MachineOperand::BlockRef(true_target),
2279 ],
2280 def: None,
2281 });
2282
2283 // False arm: same treatment.
2284 let false_target = if false_args.is_empty() {
2285 false_mb
2286 } else {
2287 let label = format!("L{}_{}_false_shim", mf.name, mb.0);
2288 let shim = mf.new_block(&label);
2289 emit_branch_arg_copies(mf, ctx, shim, *false_dest, false_args);
2290 mf.block_mut(shim).insts.push(MachineInst {
2291 opcode: ArmOpcode::B,
2292 operands: vec![MachineOperand::BlockRef(false_mb)],
2293 def: None,
2294 });
2295 shim
2296 };
2297 mf.block_mut(mb).insts.push(MachineInst {
2298 opcode: ArmOpcode::B,
2299 operands: vec![MachineOperand::BlockRef(false_target)],
2300 def: None,
2301 });
2302 }
2303 Terminator::Switch {
2304 selector,
2305 cases,
2306 default,
2307 } => {
2308 let sel_vreg = ctx.lookup_vreg(*selector);
2309 let default_mb = ctx.lookup_block(*default);
2310
2311 for (val, dest) in cases {
2312 let dest_mb = ctx.lookup_block(*dest);
2313 // CMP selector, #val; B.EQ case_block
2314 mf.block_mut(mb).insts.push(MachineInst {
2315 opcode: ArmOpcode::CmpImm,
2316 operands: vec![MachineOperand::VReg(sel_vreg), MachineOperand::Imm(*val)],
2317 def: None,
2318 });
2319 mf.block_mut(mb).insts.push(MachineInst {
2320 opcode: ArmOpcode::BCond,
2321 operands: vec![
2322 MachineOperand::Cond(ArmCond::Eq),
2323 MachineOperand::BlockRef(dest_mb),
2324 ],
2325 def: None,
2326 });
2327 }
2328 // Default: unconditional branch.
2329 mf.block_mut(mb).insts.push(MachineInst {
2330 opcode: ArmOpcode::B,
2331 operands: vec![MachineOperand::BlockRef(default_mb)],
2332 def: None,
2333 });
2334 }
2335 Terminator::Unreachable => {
2336 // Debug trap — should never execute. brk #1 triggers SIGTRAP.
2337 mf.block_mut(mb).insts.push(MachineInst {
2338 opcode: ArmOpcode::Brk,
2339 operands: vec![MachineOperand::Imm(1)],
2340 def: None,
2341 });
2342 }
2343 }
2344 }
2345
2346 /// Emit the parallel-copy that materializes branch arguments into
2347 /// the target block's parameter vregs.
2348 ///
2349 /// At an SSA block boundary the IR semantics say "all the new values
2350 /// arrive in the target's params simultaneously." On a register
2351 /// machine that means we have to perform multiple `mov` operations
2352 /// such that none of them clobbers a value still needed by another
2353 /// pending move. The classical solution:
2354 ///
2355 /// 1. Skip identity copies (`dst == src`).
2356 /// 2. Repeatedly find a pending copy whose `dst` is **not** also
2357 /// the `src` of some other pending copy. Such a copy is "safe"
2358 /// — emitting it can't trample anything still needed.
2359 /// 3. If every remaining copy is part of a cycle (no safe copy
2360 /// exists), break the cycle by moving the head of any pending
2361 /// copy through a freshly-allocated scratch vreg, then continue.
2362 ///
/// Cycles arise when block params swap with each other across an
/// edge. The lowerer doesn't currently produce that shape, but
/// mem2reg may once we have more sophisticated reaching-definition
/// flow, so handling it now heads off a future codegen bug.
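///
/// A minimal worked example (hypothetical vregs): an edge that
/// swaps two params gives pending = [(p1 ← p2), (p2 ← p1)].
/// Neither copy is safe, so step 3 routes one source through a
/// scratch vreg:
///
/// ```text
/// mov vT, p2   // break the cycle: stash pending[0]'s source
/// mov p2, p1   // now safe: p2 is no longer a pending source
/// mov p1, vT
/// ```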
2367 fn emit_branch_arg_copies(
2368 mf: &mut MachineFunction,
2369 ctx: &ISelCtx,
2370 mb: MBlockId,
2371 target_block: BlockId,
2372 args: &[ValueId],
2373 ) {
2374 if args.is_empty() {
2375 return;
2376 }
2377
2378 // Look up the target block's param vregs in the same order
2379 // they appear in the IR (which is also the order they were
2380 // allocated in Phase 4a, so the i-th arg corresponds to the
2381 // i-th param).
2382 let target_params = ctx
2383 .block_params
2384 .get(&target_block)
2385 .expect("isel: branch target not in block_params snapshot");
2386 if target_params.len() != args.len() {
2387 // Verifier should reject this — but if it leaks through
2388 // we want a clear panic, not silent corruption.
2389 panic!(
2390 "isel: branch arg count {} ≠ target block param count {}",
2391 args.len(),
2392 target_params.len()
2393 );
2394 }
2395
2396 // Build the pending copy lists. Narrow SSA values move through
2397 // vregs; wide i128 values stay stack-backed and must copy slot to
2398 // slot through a temporary register pair.
2399 let mut pending_narrow: Vec<(VRegId, VRegId)> = Vec::with_capacity(args.len());
2400 let mut pending_wide: Vec<(i32, i32)> = Vec::new();
2401 for (arg, bp) in args.iter().zip(target_params.iter()) {
2402 if matches!(bp.ty, IrType::Int(IntWidth::I128)) {
2403 let dst = ctx.lookup_wide_slot(bp.id);
2404 let src = ctx.lookup_wide_slot(*arg);
2405 if dst != src {
2406 pending_wide.push((dst, src));
2407 }
2408 continue;
2409 }
2410 let dst = ctx.lookup_vreg(bp.id);
2411 let src = ctx.lookup_vreg(*arg);
2412 if dst != src {
2413 pending_narrow.push((dst, src));
2414 }
2415 }
2416
2426 // Helper to choose the right move opcode for a vreg's class.
2427 fn move_opcode_for(class: RegClass) -> ArmOpcode {
2428 match class {
2429 // V128 needs `mov.16b` to copy all 128 bits — `fmov d, d`
2430 // would corrupt the upper lanes. Fp64/Fp32 still use
2431 // `fmov` which is the canonical narrow form.
2432 RegClass::V128 => ArmOpcode::Mov16B,
2433 RegClass::Fp64 | RegClass::Fp32 => ArmOpcode::FmovReg,
2434 RegClass::Gp64 | RegClass::Gp32 => ArmOpcode::MovReg,
2435 }
2436 }
2437
2438 let emit_move = |mf: &mut MachineFunction, mb: MBlockId, dst: VRegId, src: VRegId| {
let class = machine_vreg_class(mf, dst);
2440 let opcode = move_opcode_for(class);
2441 mf.block_mut(mb).insts.push(MachineInst {
2442 opcode,
2443 operands: vec![MachineOperand::VReg(dst), MachineOperand::VReg(src)],
2444 def: Some(dst),
2445 });
2446 };
2447
2448 // Iteratively emit safe narrow moves; break cycles via a scratch
2449 // vreg of the same class.
2450 let mut pending = pending_narrow;
2451 while !pending.is_empty() {
2452 let safe_idx = (0..pending.len()).find(|&i| {
2453 let (d, _) = pending[i];
2454 !pending
2455 .iter()
2456 .enumerate()
2457 .any(|(j, &(_, s))| j != i && s == d)
2458 });
2459
2460 if let Some(idx) = safe_idx {
2461 let (d, s) = pending.remove(idx);
2462 emit_move(mf, mb, d, s);
2463 } else {
2464 let (d, s) = pending[0];
let class = machine_vreg_class(mf, s);
2466 let temp = mf.new_vreg(class);
2467 emit_move(mf, mb, temp, s);
2468 pending[0] = (d, temp);
2469 }
2470 }
2471
2472 // Wide i128 block params stay stack-backed, so the same parallel-copy
2473 // algorithm runs on stack slots instead of vregs.
2474 let mut pending = pending_wide;
2475 let mut scratch_slot: Option<i32> = None;
2476 while !pending.is_empty() {
2477 let safe_idx = (0..pending.len()).find(|&i| {
2478 let (d, _) = pending[i];
2479 !pending
2480 .iter()
2481 .enumerate()
2482 .any(|(j, &(_, s))| j != i && s == d)
2483 });
2484
2485 if let Some(idx) = safe_idx {
2486 let (d, s) = pending.remove(idx);
2487 emit_copy_wide_slot(mf, mb, s, d);
2488 } else {
2489 let (d, s) = pending[0];
2490 let temp = if let Some(slot) = scratch_slot {
2491 slot
2492 } else {
2493 let slot = mf.alloc_local(16);
2494 scratch_slot = Some(slot);
2495 slot
2496 };
2497 emit_copy_wide_slot(mf, mb, s, temp);
2498 pending[0] = (d, temp);
2499 }
2500 }
2501 }
2502
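/// Copy one 16-byte (i128) frame slot to another through the x16/x17
/// scratch register pair.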
2503 fn emit_copy_wide_slot(mf: &mut MachineFunction, mb: MBlockId, src_slot: i32, dst_slot: i32) {
2504 emit_load_phys_i128_pair(
2505 mf,
2506 mb,
2507 MachineOperand::PhysReg(PhysReg::FP),
2508 src_slot as i64,
2509 PhysReg::Gp(16),
2510 PhysReg::Gp(17),
2511 );
2512 emit_store_phys_i128_pair(
2513 mf,
2514 mb,
2515 MachineOperand::PhysReg(PhysReg::FP),
2516 dst_slot as i64,
2517 PhysReg::Gp(16),
2518 PhysReg::Gp(17),
2519 );
2520 }
2521
2522 // ---- Helpers ----
2523
2524 /// Emit function prologue:
2525 /// stp x29, x30, [sp, #-FRAME_SIZE]!
2526 /// add x29, sp, #FRAME_SIZE - 16
2527 /// FP points at the saved FP/LR pair at the top of the frame.
2528 fn emit_prologue(mf: &mut MachineFunction, mb: MBlockId) {
2529 // STP x29, x30, [sp, #-FRAME_SIZE]!
2530 mf.block_mut(mb).insts.push(MachineInst {
2531 opcode: ArmOpcode::StpPre,
2532 operands: vec![
2533 MachineOperand::PhysReg(PhysReg::FP),
2534 MachineOperand::PhysReg(PhysReg::LR),
2535 MachineOperand::PhysReg(PhysReg::Sp),
2536 ],
2537 def: None,
2538 });
2539 // ADD x29, sp, #FRAME_SIZE - 16
2540 // (frame_size - 16 computed during emission when final size is known)
2541 mf.block_mut(mb).insts.push(MachineInst {
2542 opcode: ArmOpcode::AddImm,
2543 operands: vec![
2544 MachineOperand::PhysReg(PhysReg::FP),
2545 MachineOperand::PhysReg(PhysReg::Sp),
2546 MachineOperand::Imm(-1), // sentinel: replaced with frame_size-16 during emit
2547 ],
2548 def: None,
2549 });
2550 }
2551
2552 /// Emit function epilogue:
2553 /// ldp x29, x30, [sp, #FRAME_SIZE-16]
2554 /// add sp, sp, #FRAME_SIZE
2555 /// ret
2556 fn emit_epilogue(mf: &mut MachineFunction, mb: MBlockId) {
2557 // LDP + ADD emitted as a single LdpPost pseudo-op, expanded during emit.
2558 mf.block_mut(mb).insts.push(MachineInst {
2559 opcode: ArmOpcode::LdpPost,
2560 operands: vec![
2561 MachineOperand::PhysReg(PhysReg::FP),
2562 MachineOperand::PhysReg(PhysReg::LR),
2563 MachineOperand::PhysReg(PhysReg::Sp),
2564 ],
2565 def: None,
2566 });
2567 mf.block_mut(mb).insts.push(MachineInst {
2568 opcode: ArmOpcode::Ret,
2569 operands: vec![],
2570 def: None,
2571 });
2572 }
2573
2574 fn split_i128_words(value: i128) -> (u64, u64) {
2575 let bits = value as u128;
2576 (bits as u64, (bits >> 64) as u64)
2577 }
2578
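/// Materialize a 64-bit constant into a physical register with a
/// movz/movk chunk sequence; zero short-circuits to `mov dest, xzr`.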
2579 fn emit_const_u64_phys(mf: &mut MachineFunction, mb: MBlockId, dest: PhysReg, value: u64) {
2580 if value == 0 {
2581 mf.block_mut(mb).insts.push(MachineInst {
2582 opcode: ArmOpcode::MovReg,
2583 operands: vec![
2584 MachineOperand::PhysReg(dest),
2585 MachineOperand::PhysReg(PhysReg::Xzr),
2586 ],
2587 def: None,
2588 });
2589 return;
2590 }
2591
2592 let mut first = true;
2593 for i in 0..4 {
2594 let shift = i * 16;
2595 let chunk = ((value >> shift) & 0xFFFF) as u16;
2596 if chunk != 0 || (first && i == 3) {
2597 let opcode = if first {
2598 ArmOpcode::Movz
2599 } else {
2600 ArmOpcode::Movk
2601 };
2602 mf.block_mut(mb).insts.push(MachineInst {
2603 opcode,
2604 operands: vec![
2605 MachineOperand::PhysReg(dest),
2606 MachineOperand::Imm(chunk as i64),
2607 MachineOperand::Shift(shift as u8),
2608 ],
2609 def: None,
2610 });
2611 first = false;
2612 }
2613 }
2614 }
2615
2616 fn emit_const_i128_to_phys_pair(
2617 mf: &mut MachineFunction,
2618 mb: MBlockId,
2619 value: i128,
2620 lo: PhysReg,
2621 hi: PhysReg,
2622 ) {
2623 let (low_word, high_word) = split_i128_words(value);
2624 emit_const_u64_phys(mf, mb, lo, low_word);
2625 emit_const_u64_phys(mf, mb, hi, high_word);
2626 }
2627
2628 fn emit_store_phys_i128_pair(
2629 mf: &mut MachineFunction,
2630 mb: MBlockId,
2631 base: MachineOperand,
2632 offset: i64,
2633 lo: PhysReg,
2634 hi: PhysReg,
2635 ) {
2636 mf.block_mut(mb).insts.push(MachineInst {
2637 opcode: ArmOpcode::StpOffset,
2638 operands: vec![
2639 MachineOperand::PhysReg(lo),
2640 MachineOperand::PhysReg(hi),
2641 base,
2642 MachineOperand::Imm(offset),
2643 ],
2644 def: None,
2645 });
2646 }
2647
2648 fn emit_load_phys_u64(
2649 mf: &mut MachineFunction,
2650 mb: MBlockId,
2651 base: MachineOperand,
2652 offset: i64,
2653 dest: PhysReg,
2654 ) {
2655 mf.block_mut(mb).insts.push(MachineInst {
2656 opcode: ArmOpcode::LdrImm,
2657 operands: vec![
2658 MachineOperand::PhysReg(dest),
2659 base,
2660 MachineOperand::Imm(offset),
2661 ],
2662 def: None,
2663 });
2664 }
2665
2666 fn emit_load_phys_i128_pair(
2667 mf: &mut MachineFunction,
2668 mb: MBlockId,
2669 base: MachineOperand,
2670 offset: i64,
2671 lo: PhysReg,
2672 hi: PhysReg,
2673 ) {
2674 mf.block_mut(mb).insts.push(MachineInst {
2675 opcode: ArmOpcode::LdpOffset,
2676 operands: vec![
2677 MachineOperand::PhysReg(lo),
2678 MachineOperand::PhysReg(hi),
2679 base,
2680 MachineOperand::Imm(offset),
2681 ],
2682 def: None,
2683 });
2684 }
2685
2686 fn emit_load_stack_arg_into_vreg(
2687 mf: &mut MachineFunction,
2688 mb: MBlockId,
2689 dest: VRegId,
2690 class: RegClass,
2691 ty: &IrType,
2692 offset: i64,
2693 ) {
2694 let opcode = load_opcode_for(ty, class);
2695 mf.block_mut(mb).insts.push(MachineInst {
2696 opcode,
2697 operands: vec![
2698 MachineOperand::VReg(dest),
2699 MachineOperand::PhysReg(PhysReg::FP),
2700 MachineOperand::Imm(offset),
2701 ],
2702 def: Some(dest),
2703 });
2704 }
2705
2706 fn emit_store_stack_arg_from_vreg(
2707 mf: &mut MachineFunction,
2708 mb: MBlockId,
2709 src: VRegId,
2710 class: RegClass,
2711 ty: &IrType,
2712 offset: i64,
2713 ) {
2714 let opcode = store_opcode_for(Some(ty), class);
2715 mf.block_mut(mb).insts.push(MachineInst {
2716 opcode,
2717 operands: vec![
2718 MachineOperand::VReg(src),
2719 MachineOperand::PhysReg(PhysReg::Sp),
2720 MachineOperand::Imm(offset),
2721 ],
2722 def: None,
2723 });
2724 }
2725
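/// Two-limb i128 add: `adds` the low words (setting the carry flag),
/// then `adc` folds the carry into the high words. A sketch of the
/// emitted sequence, using the scratch assignments that
/// `emit_i128_binop_via_slots` passes in (x16/x17 limbs, x8 scratch):
///
/// ```text
/// ldr  x8,  [fp, #rhs]      // rhs low word
/// adds x16, x16, x8         // lo += rhs.lo, sets C
/// ldr  x8,  [fp, #rhs+8]    // rhs high word
/// adc  x17, x17, x8         // hi += rhs.hi + C
/// ```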
2726 fn emit_i128_add_from_slot(
2727 mf: &mut MachineFunction,
2728 mb: MBlockId,
2729 rhs_base: MachineOperand,
2730 rhs_offset: i64,
2731 lo: PhysReg,
2732 hi: PhysReg,
2733 scratch: PhysReg,
2734 ) {
2735 emit_load_phys_u64(mf, mb, rhs_base.clone(), rhs_offset, scratch);
2736 mf.block_mut(mb).insts.push(MachineInst {
2737 opcode: ArmOpcode::AddsReg,
2738 operands: vec![
2739 MachineOperand::PhysReg(lo),
2740 MachineOperand::PhysReg(lo),
2741 MachineOperand::PhysReg(scratch),
2742 ],
2743 def: None,
2744 });
2745 emit_load_phys_u64(mf, mb, rhs_base, rhs_offset + 8, scratch);
2746 mf.block_mut(mb).insts.push(MachineInst {
2747 opcode: ArmOpcode::AdcReg,
2748 operands: vec![
2749 MachineOperand::PhysReg(hi),
2750 MachineOperand::PhysReg(hi),
2751 MachineOperand::PhysReg(scratch),
2752 ],
2753 def: None,
2754 });
2755 }
2756
2757 fn emit_i128_sub_from_slot(
2758 mf: &mut MachineFunction,
2759 mb: MBlockId,
2760 rhs_base: MachineOperand,
2761 rhs_offset: i64,
2762 lo: PhysReg,
2763 hi: PhysReg,
2764 scratch: PhysReg,
2765 ) {
2766 emit_load_phys_u64(mf, mb, rhs_base.clone(), rhs_offset, scratch);
2767 mf.block_mut(mb).insts.push(MachineInst {
2768 opcode: ArmOpcode::SubsReg,
2769 operands: vec![
2770 MachineOperand::PhysReg(lo),
2771 MachineOperand::PhysReg(lo),
2772 MachineOperand::PhysReg(scratch),
2773 ],
2774 def: None,
2775 });
2776 emit_load_phys_u64(mf, mb, rhs_base, rhs_offset + 8, scratch);
2777 mf.block_mut(mb).insts.push(MachineInst {
2778 opcode: ArmOpcode::SbcReg,
2779 operands: vec![
2780 MachineOperand::PhysReg(hi),
2781 MachineOperand::PhysReg(hi),
2782 MachineOperand::PhysReg(scratch),
2783 ],
2784 def: None,
2785 });
2786 }
2787
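/// Two's-complement negate of an i128 limb pair in place:
/// `subs lo, xzr, lo` then `sbc hi, xzr, hi`, i.e. 0 - value with
/// borrow propagation.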
2788 fn emit_i128_neg(mf: &mut MachineFunction, mb: MBlockId, lo: PhysReg, hi: PhysReg) {
2789 mf.block_mut(mb).insts.push(MachineInst {
2790 opcode: ArmOpcode::SubsReg,
2791 operands: vec![
2792 MachineOperand::PhysReg(lo),
2793 MachineOperand::PhysReg(PhysReg::Xzr),
2794 MachineOperand::PhysReg(lo),
2795 ],
2796 def: None,
2797 });
2798 mf.block_mut(mb).insts.push(MachineInst {
2799 opcode: ArmOpcode::SbcReg,
2800 operands: vec![
2801 MachineOperand::PhysReg(hi),
2802 MachineOperand::PhysReg(PhysReg::Xzr),
2803 MachineOperand::PhysReg(hi),
2804 ],
2805 def: None,
2806 });
2807 }
2808
/// Emit a constant integer using a movz/movk sequence.
/// Respects width: 32-bit values are masked to 32 bits and use only shifts 0/16.
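/// A short sketch of the decomposition for a hypothetical value:
///
/// ```text
/// // val = 0x1234_0000_ABCD
/// movz dest, #0xABCD, lsl #0    // first non-zero 16-bit chunk
/// movk dest, #0x1234, lsl #32   // the zero chunk at lsl #16 is skipped
/// ```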
2811 fn emit_const_int(
2812 mf: &mut MachineFunction,
2813 mb: MBlockId,
2814 dest: VRegId,
2815 val: i128,
2816 width: IntWidth,
2817 ) {
2818 debug_assert!(
2819 width != IntWidth::I128,
2820 "backend should reject i128 before isel"
2821 );
2822 let is_32 = matches!(width, IntWidth::I8 | IntWidth::I16 | IntWidth::I32);
2823 // Mask to the appropriate width to prevent sign-extension artifacts.
2824 let uval = if is_32 {
2825 (val as u32) as u64
2826 } else {
2827 val as u64
2828 };
2829 let max_shift = if is_32 { 2 } else { 4 }; // 2 chunks for 32-bit, 4 for 64-bit
2830
2831 if uval == 0 {
2832 let zr = if is_32 { PhysReg::Wzr } else { PhysReg::Xzr };
2833 mf.block_mut(mb).insts.push(MachineInst {
2834 opcode: ArmOpcode::MovReg,
2835 operands: vec![MachineOperand::VReg(dest), MachineOperand::PhysReg(zr)],
2836 def: Some(dest),
2837 });
2838 return;
2839 }
2840
2841 // MOVZ for the first non-zero 16-bit chunk, MOVK for the rest.
2842 let mut first = true;
2843 for i in 0..max_shift {
2844 let shift = i * 16;
2845 let chunk = ((uval >> shift) & 0xFFFF) as u16;
2846 if chunk != 0 || (first && i == max_shift - 1) {
2847 let opcode = if first {
2848 ArmOpcode::Movz
2849 } else {
2850 ArmOpcode::Movk
2851 };
2852 mf.block_mut(mb).insts.push(MachineInst {
2853 opcode,
2854 operands: vec![
2855 MachineOperand::VReg(dest),
2856 MachineOperand::Imm(chunk as i64),
2857 MachineOperand::Shift(shift as u8),
2858 ],
2859 def: Some(dest),
2860 });
2861 first = false;
2862 }
2863 }
2864
2865 if first {
2866 let zr = if is_32 { PhysReg::Wzr } else { PhysReg::Xzr };
2867 mf.block_mut(mb).insts.push(MachineInst {
2868 opcode: ArmOpcode::MovReg,
2869 operands: vec![MachineOperand::VReg(dest), MachineOperand::PhysReg(zr)],
2870 def: Some(dest),
2871 });
2872 }
2873 }
2874
2875 /// Emit a register-register binary op.
2876 fn emit_binop(
2877 mf: &mut MachineFunction,
2878 ctx: &mut ISelCtx,
2879 mb: MBlockId,
2880 inst: &Inst,
2881 opcode: ArmOpcode,
2882 a: ValueId,
2883 b: ValueId,
2884 ) {
2885 let class = type_to_reg_class(&inst.ty);
2886 let dest = ctx.get_vreg(mf, inst.id, class);
2887 let va = ctx.lookup_vreg(a);
2888 let vb = ctx.lookup_vreg(b);
2889 mf.block_mut(mb).insts.push(MachineInst {
2890 opcode,
2891 operands: vec![
2892 MachineOperand::VReg(dest),
2893 MachineOperand::VReg(va),
2894 MachineOperand::VReg(vb),
2895 ],
2896 def: Some(dest),
2897 });
2898 }
2899
2900 /// Emit a NEON vector binary op. The `pick` closure resolves the
2901 /// concrete `ArmOpcode` from the result vector's lane shape — that
2902 /// keeps the per-op InstKind arms one-line.
2903 fn emit_vbinop(
2904 mf: &mut MachineFunction,
2905 ctx: &mut ISelCtx,
2906 mb: MBlockId,
2907 inst: &Inst,
2908 a: ValueId,
2909 b: ValueId,
2910 pick: impl FnOnce(VShape) -> ArmOpcode,
2911 ) {
2912 let dest = ctx.get_vreg(mf, inst.id, RegClass::V128);
2913 let va = ctx.lookup_vreg(a);
2914 let vb = ctx.lookup_vreg(b);
2915 let opcode = match VShape::from_ir(&inst.ty) {
2916 Some(s) => pick(s),
2917 None => ArmOpcode::Nop,
2918 };
2919 mf.block_mut(mb).insts.push(MachineInst {
2920 opcode,
2921 operands: vec![
2922 MachineOperand::VReg(dest),
2923 MachineOperand::VReg(va),
2924 MachineOperand::VReg(vb),
2925 ],
2926 def: Some(dest),
2927 });
2928 }
2929
2930 /// Emit a NEON vector unary op (one source, one result, both V128).
2931 fn emit_vunop(
2932 mf: &mut MachineFunction,
2933 ctx: &mut ISelCtx,
2934 mb: MBlockId,
2935 inst: &Inst,
2936 a: ValueId,
2937 pick: impl FnOnce(VShape) -> ArmOpcode,
2938 ) {
2939 let dest = ctx.get_vreg(mf, inst.id, RegClass::V128);
2940 let va = ctx.lookup_vreg(a);
2941 let opcode = match VShape::from_ir(&inst.ty) {
2942 Some(s) => pick(s),
2943 None => ArmOpcode::Nop,
2944 };
2945 mf.block_mut(mb).insts.push(MachineInst {
2946 opcode,
2947 operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(va)],
2948 def: Some(dest),
2949 });
2950 }
2951
2952 /// Emit a float binary op, selecting single or double precision.
2953 #[allow(clippy::too_many_arguments)]
2954 fn emit_float_binop(
2955 mf: &mut MachineFunction,
2956 ctx: &mut ISelCtx,
2957 mb: MBlockId,
2958 inst: &Inst,
2959 ty: &IrType,
2960 a: ValueId,
2961 b: ValueId,
2962 op_s: ArmOpcode,
2963 op_d: ArmOpcode,
2964 ) {
2965 let (class, opcode) = match ty {
2966 IrType::Float(FloatWidth::F32) => (RegClass::Fp32, op_s),
2967 _ => (RegClass::Fp64, op_d),
2968 };
2969 let dest = ctx.get_vreg(mf, inst.id, class);
2970 let va = ctx.lookup_vreg(a);
2971 let vb = ctx.lookup_vreg(b);
2972 mf.block_mut(mb).insts.push(MachineInst {
2973 opcode,
2974 operands: vec![
2975 MachineOperand::VReg(dest),
2976 MachineOperand::VReg(va),
2977 MachineOperand::VReg(vb),
2978 ],
2979 def: Some(dest),
2980 });
2981 }
2982
/// Pick the load opcode for a value of the given IR type and reg class.
2985 /// Narrow integer types use the sign-extending byte/half loads; floats
2986 /// route to the FP-imm load; everything else falls through to `LdrImm`
2987 /// or `LdrFpImm` per reg class. The reg-class fallback matters when
2988 /// `ty` is a generic pointer or aggregate (e.g., a stack-arg copy that
2989 /// only knows the destination's register kind).
2990 fn load_opcode_for(ty: &IrType, class: RegClass) -> ArmOpcode {
2991 match ty {
2992 IrType::Int(IntWidth::I8) | IrType::Bool => ArmOpcode::LdrsbImm,
2993 IrType::Int(IntWidth::I16) => ArmOpcode::LdrshImm,
2994 IrType::Float(_) => ArmOpcode::LdrFpImm,
2995 _ => match class {
2996 RegClass::Fp64 | RegClass::Fp32 => ArmOpcode::LdrFpImm,
2997 RegClass::V128 => ArmOpcode::LdrQ,
2998 RegClass::Gp32 | RegClass::Gp64 => ArmOpcode::LdrImm,
2999 },
3000 }
3001 }
3002
3003 /// Mirror of `load_opcode_for` for stores. Audit CRITICAL-2: the
3004 /// `ty` here must be the *value's* declared IR type, not the pointer
3005 /// or pointee — byte-level GEPs reuse `ptr<i8>` as a generic offset
3006 /// cursor, so dispatching by pointee width would silently truncate
3007 /// non-byte stores. Pass `None` for `ty` when only the reg class is
3008 /// available; in that case the helper falls through to the class-only
3009 /// branch.
3010 fn store_opcode_for(ty: Option<&IrType>, class: RegClass) -> ArmOpcode {
3011 match ty {
3012 Some(IrType::Int(IntWidth::I8)) | Some(IrType::Bool) => ArmOpcode::StrbImm,
3013 Some(IrType::Int(IntWidth::I16)) => ArmOpcode::StrhImm,
3014 Some(IrType::Float(_)) => ArmOpcode::StrFpImm,
3015 _ => match class {
3016 RegClass::Fp64 | RegClass::Fp32 => ArmOpcode::StrFpImm,
3017 RegClass::V128 => ArmOpcode::StrQ,
3018 RegClass::Gp32 | RegClass::Gp64 => ArmOpcode::StrImm,
3019 },
3020 }
3021 }
3022
3023 /// Resolve an IR address value to the (base, offset) operand pair
3024 /// expected by `LdrImm`/`StrImm`-family instructions. Alloca addresses
3025 /// fold to `(FP, FrameSlot(offset))` so the assembler can pick the
3026 /// final stack-relative form; everything else becomes
3027 /// `(VReg(addr_vreg), Imm(0))`. Used by both narrow-width Load/Store
3028 /// arms in `select_inst`. The wide-i128 paths build their own operand
3029 /// pairs directly because they target the `emit_*_phys_i128_pair`
3030 /// helpers, which take `i64` offsets and only need a base operand.
3031 fn narrow_load_store_addr(
3032 ctx: &ISelCtx,
3033 addr: ValueId,
3034 ) -> (MachineOperand, MachineOperand) {
3035 if let Some(&offset) = ctx.alloca_offsets.get(&addr) {
3036 (
3037 MachineOperand::PhysReg(PhysReg::FP),
3038 MachineOperand::FrameSlot(offset),
3039 )
3040 } else {
3041 let base = ctx.lookup_vreg(addr);
3042 (MachineOperand::VReg(base), MachineOperand::Imm(0))
3043 }
3044 }
3045
3046 /// Operation tag for `emit_i128_binop_via_slots`. Add and Sub share a
3047 /// load-binop-store skeleton that differs only in which intermediate
3048 /// helper does the arithmetic.
3049 #[derive(Clone, Copy)]
3050 enum I128BinOp {
3051 Add,
3052 Sub,
3053 }
3054
3055 /// Lower an i128 IAdd/ISub: load `lhs_id`'s slot into x16/x17, run the
3056 /// matching `emit_i128_<op>_from_slot` against `rhs_id`, then store
/// the result to `dest_id`'s slot. Replaces the near-identical
/// load/op/store blocks in the i128 dispatch (IAdd / ISub).
3059 fn emit_i128_binop_via_slots(
3060 mf: &mut MachineFunction,
3061 ctx: &ISelCtx,
3062 mb: MBlockId,
3063 op: I128BinOp,
3064 dest_id: ValueId,
3065 lhs_id: ValueId,
3066 rhs_id: ValueId,
3067 ) {
3068 let dest_slot = ctx.lookup_wide_slot(dest_id);
3069 let lhs_slot = ctx.lookup_wide_slot(lhs_id);
3070 let rhs_slot = ctx.lookup_wide_slot(rhs_id);
3071 let fp = || MachineOperand::PhysReg(PhysReg::FP);
3072 emit_load_phys_i128_pair(mf, mb, fp(), lhs_slot as i64, PhysReg::Gp(16), PhysReg::Gp(17));
3073 match op {
3074 I128BinOp::Add => emit_i128_add_from_slot(
3075 mf,
3076 mb,
3077 fp(),
3078 rhs_slot as i64,
3079 PhysReg::Gp(16),
3080 PhysReg::Gp(17),
3081 PhysReg::Gp(8),
3082 ),
3083 I128BinOp::Sub => emit_i128_sub_from_slot(
3084 mf,
3085 mb,
3086 fp(),
3087 rhs_slot as i64,
3088 PhysReg::Gp(16),
3089 PhysReg::Gp(17),
3090 PhysReg::Gp(8),
3091 ),
3092 }
3093 emit_store_phys_i128_pair(mf, mb, fp(), dest_slot as i64, PhysReg::Gp(16), PhysReg::Gp(17));
3094 }
3095
/// Map IR type to register class.
fn type_to_reg_class(ty: &IrType) -> RegClass {
3097 match ty {
3098 IrType::Float(FloatWidth::F32) => RegClass::Fp32,
3099 IrType::Float(FloatWidth::F64) => RegClass::Fp64,
3100 IrType::Vector { .. } => RegClass::V128,
3101 IrType::Int(IntWidth::I8)
3102 | IrType::Int(IntWidth::I16)
3103 | IrType::Int(IntWidth::I32)
3104 | IrType::Bool => RegClass::Gp32,
3105 _ => RegClass::Gp64,
3106 }
3107 }
3108
3109 /// Vector lane shape for NEON opcode dispatch.
3110 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
3111 enum VShape {
3112 /// 4 × i32
3113 V4S,
3114 /// 2 × i64
3115 V2D,
3116 /// 4 × f32
3117 F4S,
3118 /// 2 × f64
3119 F2D,
3120 }
3121
3122 impl VShape {
3123 fn from_ir(ty: &IrType) -> Option<Self> {
3124 let (lanes, elem) = ty.vector_shape()?;
3125 match (lanes, elem) {
3126 (4, IrType::Int(IntWidth::I32)) => Some(Self::V4S),
3127 (2, IrType::Int(IntWidth::I64)) => Some(Self::V2D),
3128 (4, IrType::Float(FloatWidth::F32)) => Some(Self::F4S),
3129 (2, IrType::Float(FloatWidth::F64)) => Some(Self::F2D),
3130 _ => None,
3131 }
3132 }
3133
3134 fn is_float(self) -> bool {
3135 matches!(self, Self::F4S | Self::F2D)
3136 }
3137 }
3138
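/// True when either compare operand is an i64 or a pointer, in which
/// case the narrower operand must be widened to Gp64 before the CMP
/// (see `icmp_operand_vreg`).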
3139 fn needs_wide_icmp_operand(ty: Option<&IrType>, other_ty: Option<&IrType>) -> bool {
3140 matches!(
3141 (ty, other_ty),
3142 (
3143 Some(IrType::Int(IntWidth::I64) | IrType::Ptr(_) | IrType::FuncPtr(_)),
3144 Some(_)
3145 ) | (
3146 Some(_),
3147 Some(IrType::Int(IntWidth::I64) | IrType::Ptr(_) | IrType::FuncPtr(_))
3148 )
3149 )
3150 }
3151
3152 fn zero_extend_cmp_type(ty: Option<&IrType>) -> bool {
3153 matches!(ty, Some(IrType::Bool))
3154 }
3155
3156 fn icmp_operand_vreg(
3157 mf: &mut MachineFunction,
3158 ctx: &mut ISelCtx,
3159 mb: MBlockId,
3160 func: &Function,
3161 value: ValueId,
3162 other: ValueId,
3163 ) -> VRegId {
3164 let value_ty = func.value_type(value);
3165 let other_ty = func.value_type(other);
3166 let src = ctx.lookup_vreg(value);
3167
3168 if !needs_wide_icmp_operand(value_ty.as_ref(), other_ty.as_ref()) {
3169 return src;
3170 }
3171
3172 if matches!(
3173 value_ty,
3174 Some(IrType::Int(IntWidth::I64) | IrType::Ptr(_) | IrType::FuncPtr(_))
3175 ) {
3176 return src;
3177 }
3178
3179 let dest = mf.new_vreg(RegClass::Gp64);
3180 let opcode = if zero_extend_cmp_type(value_ty.as_ref()) {
3181 ArmOpcode::MovReg
3182 } else {
3183 ArmOpcode::Sxtw
3184 };
3185 mf.block_mut(mb).insts.push(MachineInst {
3186 opcode,
3187 operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)],
3188 def: Some(dest),
3189 });
3190 dest
3191 }
3192
3193 fn machine_vreg_class(mf: &MachineFunction, vreg: VRegId) -> RegClass {
3194 mf.vregs
3195 .iter()
3196 .find(|r| r.id == vreg)
3197 .map(|r| r.class)
3198 .expect("isel: vreg not registered")
3199 }
3200
3201 fn coerce_select_operand_vreg(
3202 mf: &mut MachineFunction,
3203 ctx: &mut ISelCtx,
3204 mb: MBlockId,
3205 func: &Function,
3206 value: ValueId,
3207 target_ty: &IrType,
3208 ) -> VRegId {
3209 let src = ctx.lookup_vreg(value);
3210 let src_class = machine_vreg_class(mf, src);
3211 let target_class = type_to_reg_class(target_ty);
3212 if src_class == target_class {
3213 return src;
3214 }
3215
3216 let dest = mf.new_vreg(target_class);
3217 let src_ty = func.value_type(value);
3218 let opcode = match (src_class, target_class) {
3219 (RegClass::Gp32, RegClass::Gp64) => {
3220 if matches!(target_ty, IrType::Ptr(_) | IrType::FuncPtr(_))
3221 || zero_extend_cmp_type(src_ty.as_ref())
3222 {
3223 ArmOpcode::MovReg
3224 } else {
3225 match src_ty.as_ref() {
3226 Some(IrType::Int(IntWidth::I8)) => ArmOpcode::Sxtb,
3227 Some(IrType::Int(IntWidth::I16)) => ArmOpcode::Sxth,
3228 Some(IrType::Int(IntWidth::I32)) | Some(IrType::Bool) => ArmOpcode::Sxtw,
3229 _ => ArmOpcode::MovReg,
3230 }
3231 }
3232 }
3233 (RegClass::Gp64, RegClass::Gp32) => ArmOpcode::MovReg,
3234 (RegClass::Fp32, RegClass::Fp64) => ArmOpcode::FcvtDS,
3235 (RegClass::Fp64, RegClass::Fp32) => ArmOpcode::FcvtSD,
3236 _ => ArmOpcode::MovReg,
3237 };
3238
3239 mf.block_mut(mb).insts.push(MachineInst {
3240 opcode,
3241 operands: vec![MachineOperand::VReg(dest), MachineOperand::VReg(src)],
3242 def: Some(dest),
3243 });
3244 dest
3245 }
3246
3247 fn int_width_class(w: &IntWidth) -> RegClass {
3248 match w {
3249 IntWidth::I64 => RegClass::Gp64,
3250 _ => RegClass::Gp32,
3251 }
3252 }
3253
3254 fn float_width_class(w: &FloatWidth) -> RegClass {
3255 match w {
3256 FloatWidth::F32 => RegClass::Fp32,
3257 FloatWidth::F64 => RegClass::Fp64,
3258 }
3259 }
3260
3261 /// Map IR comparison op to ARM64 condition code (for integer CMP).
3262 /// Pre-scan a function to find ICmp/FCmp → Select fusion candidates.
3263 ///
3264 /// An ICmp/FCmp is a fusion candidate when:
3265 /// 1. Its result is used exactly once in the entire function.
3266 /// 2. That single use is a `Select` instruction in the same block.
3267 /// 3. No intervening instruction between the ICmp and the Select in
3268 /// that block clobbers NZCV flags (another ICmp/FCmp or a Call).
3269 ///
3270 /// For candidates, we suppress CSET during ICmp lowering and store
3271 /// the ARM condition in `ctx.fused_arm_cond` so the Select can pick
3272 /// it up and emit `CSEL dest, tv, fv, <cond>` directly.
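///
/// A sketch of the intended lowering (hypothetical names):
///
/// ```text
/// %c = icmp lt %x, %y      ; sole use is the select below
/// %s = select %c, %a, %b
///
/// cmp  wX, wY              ; sets NZCV once
/// csel wS, wA, wB, lt      ; no cset, no second cmp
/// ```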
3273 fn compute_csel_fusion(func: &Function, ctx: &mut ISelCtx) {
3274 // Build global use counts.
3275 let mut use_count: HashMap<ValueId, u32> = HashMap::new();
3276 for block in &func.blocks {
3277 for inst in &block.insts {
3278 for vid in crate::ir::walk::inst_uses(&inst.kind) {
3279 *use_count.entry(vid).or_insert(0) += 1;
3280 }
3281 }
3282 if let Some(term) = &block.terminator {
3283 for vid in crate::ir::walk::terminator_uses(term) {
3284 *use_count.entry(vid).or_insert(0) += 1;
3285 }
3286 }
3287 }
3288
3289 // Build a map of ValueId → the block that defines it (instruction defs only).
3290 let mut def_block: HashMap<ValueId, BlockId> = HashMap::new();
3291 for block in &func.blocks {
3292 for inst in &block.insts {
3293 def_block.insert(inst.id, block.id);
3294 }
3295 }
3296
3297 // Per-block scan: walk instructions in order, tracking the most
3298 // recent ICmp/FCmp that hasn't been consumed by a Select yet.
3299 // Any flag-clobbering instruction (another ICmp/FCmp, a call)
3300 // resets the pending set.
3301 for block in &func.blocks {
3302 // The most recently emitted CMP that hasn't been consumed.
// An Option suffices: at most one CMP's flags are live at a time.
3304 let mut pending: Option<ValueId> = None;
3305
3306 for inst in &block.insts {
3307 match &inst.kind {
3308 InstKind::ICmp(op, _, _) => {
3309 if crate::ir::walk::inst_uses(&inst.kind)
3310 .into_iter()
3311 .filter_map(|vid| func.value_type(vid))
3312 .any(|ty| matches!(ty, IrType::Int(IntWidth::I128)))
3313 {
3314 pending = None;
3315 ctx.fused_arm_cond.remove(&inst.id);
3316 continue;
3317 }
3318 // New CMP overwrites NZCV — previous pending is no longer valid.
3319 pending = Some(inst.id);
3320 // Temporarily store the arm cond so we can retrieve it when
3321 // we confirm the Select is the sole user.
3322 ctx.fused_arm_cond.insert(inst.id, cmp_to_arm_cond(*op));
3323 }
3324 InstKind::FCmp(op, _, _) => {
3325 pending = Some(inst.id);
3326 ctx.fused_arm_cond.insert(inst.id, fcmp_to_arm_cond(*op));
3327 }
3328 InstKind::Select(cond, _, _) => {
3329 if let Some(p) = pending {
3330 if p == *cond
3331 && use_count.get(cond) == Some(&1)
3332 && def_block.get(cond) == Some(&block.id)
3333 {
3334 // Confirmed: fuse this ICmp into the Select.
3335 ctx.select_fused.insert(*cond);
3336 pending = None;
3337 } else {
3338 // The Select isel for an unfused cond emits
3339 // its own `cmp cond_reg, #0` to set NZCV,
3340 // which clobbers any pending fused ICmp's
3341 // flags. Drop the pending so a later Select
3342 // doesn't try to read stale flags.
3343 pending = None;
3344 }
3345 }
3346 }
3347 // Calls may clobber NZCV (per AAPCS64, flags are not preserved).
3348 InstKind::Call(_, _) | InstKind::RuntimeCall(_, _) => {
3349 pending = None;
3350 }
3351 _ => {}
3352 }
3353 }
3354
// After this scan, fused_arm_cond still holds entries for ICmps
// that turned out NOT to be fused (use_count > 1, or never
// consumed by a Select); the retain below keeps only the
// confirmed ones. Deferring the cleanup until every block has
// been scanned is safe: SSA gives each ValueId a single defining
// block, so a later block can never resurrect a dropped entry.
3362 }
3363
3364 // Remove arm_cond entries for non-fused ICmps.
3365 ctx.fused_arm_cond
3366 .retain(|vid, _| ctx.select_fused.contains(vid));
3367 }
3368
/// Map IR comparison op to ARM64 condition code (for integer CMP).
fn cmp_to_arm_cond(op: CmpOp) -> ArmCond {
3370 match op {
3371 CmpOp::Eq => ArmCond::Eq,
3372 CmpOp::Ne => ArmCond::Ne,
3373 CmpOp::Lt => ArmCond::Lt,
3374 CmpOp::Le => ArmCond::Le,
3375 CmpOp::Gt => ArmCond::Gt,
3376 CmpOp::Ge => ArmCond::Ge,
3377 }
3378 }
3379
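/// Condition pair for an ordered i128 compare: the first condition is
/// applied to the signed high-limb compare, the second (unsigned) to
/// the low limbs when the high limbs are equal.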
3380 fn i128_ordered_conds(op: CmpOp) -> (ArmCond, ArmCond) {
3381 match op {
3382 CmpOp::Lt => (ArmCond::Lt, ArmCond::Lo),
3383 CmpOp::Le => (ArmCond::Lt, ArmCond::Ls),
3384 CmpOp::Gt => (ArmCond::Gt, ArmCond::Hi),
3385 CmpOp::Ge => (ArmCond::Gt, ArmCond::Hs),
3386 _ => panic!("ordered i128 compare requires lt/le/gt/ge, got {:?}", op),
3387 }
3388 }
3389
3390 /// Map IR comparison op to ARM64 condition code (for float FCMP).
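/// After a scalar FCMP the NZCV flags encode: less sets N; equal sets
/// Z and C; greater sets C; unordered sets C and V. MI and LS are
/// therefore the Lt/Le picks that stay false on NaN inputs.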
3391 fn fcmp_to_arm_cond(op: CmpOp) -> ArmCond {
3392 match op {
3393 CmpOp::Eq => ArmCond::Eq,
3394 CmpOp::Ne => ArmCond::Ne,
CmpOp::Lt => ArmCond::Mi, // N set only when the compare is "less"
CmpOp::Le => ArmCond::Ls, // C clear or Z set ⇔ less-or-equal after FCMP
3397 CmpOp::Gt => ArmCond::Gt,
3398 CmpOp::Ge => ArmCond::Ge,
3399 }
3400 }
3401
3402 /// Compute allocation size for an IR type.
3403 fn alloca_size(ty: &IrType) -> u32 {
3404 match ty {
3405 IrType::Void => 0,
3406 IrType::Bool => 4, // use 4 bytes for alignment
3407 IrType::Int(w) => w.bytes(),
3408 IrType::Float(w) => w.bytes(),
3409 IrType::Ptr(_) => 8,
3410 IrType::Array(elem, count) => {
3411 // Stack storage uses ABI-sized elements. Fortran LOGICAL arrays are
3412 // stored as default-kind 4-byte elements, even though Bool SSA
3413 // values themselves remain byte-sized.
3414 let elem_size = match elem.as_ref() {
3415 IrType::Bool => 4,
3416 IrType::Struct(_) => alloca_size(elem),
3417 _ => elem.size_bytes() as u32,
3418 };
3419 elem_size * (*count as u32)
3420 }
3421 IrType::FuncPtr(_) => 8,
3422 IrType::Struct(_) => 8, // placeholder
3423 IrType::Vector { .. } => 16, // 128-bit NEON
3424 }
3425 }
3426
/// Get the C-level symbol name for a runtime function.
3429 /// The emitter adds the Mach-O `_` prefix when emitting assembly.
3430 fn runtime_func_symbol(rf: &RuntimeFunc, args: &[(ValueId, AbiArgLoc, IrType)]) -> String {
3431 match rf {
3432 RuntimeFunc::PrintInt => {
3433 if args
3434 .first()
3435 .is_some_and(|(_, _, ty)| matches!(ty, IrType::Int(IntWidth::I128)))
3436 {
3437 "afs_print_int128".into()
3438 } else if args
3439 .first()
3440 .is_some_and(|(_, _, ty)| matches!(ty, IrType::Int(IntWidth::I64)))
3441 {
3442 "afs_print_int64".into()
3443 } else {
3444 "afs_print_int".into()
3445 }
3446 }
3447 RuntimeFunc::PrintReal => "afs_print_real".into(),
3448 RuntimeFunc::PrintString => "afs_print_string".into(),
3449 RuntimeFunc::PrintLogical => "afs_print_logical".into(),
3450 RuntimeFunc::PrintNewline => "afs_print_newline".into(),
3451 RuntimeFunc::Allocate => "afs_allocate".into(),
3452 RuntimeFunc::Deallocate => "afs_deallocate".into(),
3453 RuntimeFunc::StringConcat => "afs_string_concat".into(),
3454 RuntimeFunc::StringCopy => "afs_string_copy".into(),
3455 RuntimeFunc::StringCompare => "afs_string_compare".into(),
3456 RuntimeFunc::Stop => "afs_stop".into(),
3457 RuntimeFunc::ErrorStop => "afs_error_stop".into(),
3458 RuntimeFunc::CheckBounds => "afs_check_bounds".into(),
3459 }
3460 }
3461
3462 #[cfg(test)]
3463 mod tests {
3464 use super::*;
3465 use crate::ir::builder::FuncBuilder;
3466
3467 fn select_simple(build: impl FnOnce(&mut FuncBuilder)) -> MachineFunction {
3468 let mut func = Function::new("test".into(), vec![], IrType::Void);
3469 {
3470 let mut b = FuncBuilder::new(&mut func);
3471 build(&mut b);
3472 }
3473 select_function(&func)
3474 }
3475
3476 #[test]
3477 fn select_const_int() {
3478 let mf = select_simple(|b| {
3479 b.const_i32(42);
3480 b.ret_void();
3481 });
3482 let insts = &mf.blocks[0].insts;
// Should have: prologue (STP, ADD), MOVZ #42, epilogue (LDP, RET).
3484 assert!(insts.iter().any(|i| i.opcode == ArmOpcode::Movz));
3485 }
3486
3487 #[test]
3488 fn select_iadd() {
3489 let mf = select_simple(|b| {
3490 let x = b.const_i32(10);
3491 let y = b.const_i32(20);
3492 let _z = b.iadd(x, y);
3493 b.ret_void();
3494 });
3495 assert!(mf.blocks[0]
3496 .insts
3497 .iter()
3498 .any(|i| i.opcode == ArmOpcode::AddReg));
3499 }
3500
3501 #[test]
3502 fn select_icmp() {
3503 // ICmp whose result is NOT fed into a Select → CSET must appear.
3504 let mf = select_simple(|b| {
3505 let x = b.const_i32(5);
3506 let y = b.const_i32(10);
3507 let _c = b.icmp(CmpOp::Lt, x, y);
3508 b.ret_void();
3509 });
3510 assert!(mf.blocks[0]
3511 .insts
3512 .iter()
3513 .any(|i| i.opcode == ArmOpcode::CmpReg));
3514 assert!(mf.blocks[0]
3515 .insts
3516 .iter()
3517 .any(|i| i.opcode == ArmOpcode::Cset));
3518 }
3519
3520 #[test]
3521 fn select_i128_icmp_eq_combines_limb_results() {
3522 let mf = select_simple(|b| {
3523 let x = b.const_i128(1);
3524 let y = b.const_i128(1);
3525 let _c = b.icmp(CmpOp::Eq, x, y);
3526 b.ret_void();
3527 });
3528 let insts = &mf.blocks[0].insts;
3529 assert!(
3530 insts
3531 .iter()
3532 .filter(|i| i.opcode == ArmOpcode::CmpReg)
3533 .count()
3534 >= 2
3535 );
3536 assert!(insts.iter().filter(|i| i.opcode == ArmOpcode::Cset).count() >= 2);
3537 assert!(insts.iter().any(|i| i.opcode == ArmOpcode::AndReg));
3538 }
3539
    #[test]
    fn select_i128_icmp_lt_uses_high_signed_and_low_unsigned_conds() {
        let mf = select_simple(|b| {
            let x = b.const_i128(1);
            let y = b.const_i128(2);
            let _c = b.icmp(CmpOp::Lt, x, y);
            b.ret_void();
        });
        let insts = &mf.blocks[0].insts;
        assert!(
            insts
                .iter()
                .filter(|i| i.opcode == ArmOpcode::CmpReg)
                .count()
                >= 2
        );
        assert!(insts.iter().filter(|i| i.opcode == ArmOpcode::Cset).count() >= 3);
        assert!(insts.iter().any(|i| i.opcode == ArmOpcode::AndReg));
        assert!(insts.iter().any(|i| i.opcode == ArmOpcode::OrrReg));
    }

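    // A scalar sketch (not compiler code) of the limb-wise rule the test
    // above pins down: compare the high limbs signed and the low limbs
    // unsigned; the composite must agree with native i128 ordering.
    #[test]
    fn limbwise_lt_sketch_matches_native_i128_ordering() {
        fn lt128(a: i128, b: i128) -> bool {
            let (a_hi, a_lo) = ((a >> 64) as i64, a as u64);
            let (b_hi, b_lo) = ((b >> 64) as i64, b as u64);
            // The high limb decides unless equal; then the low limb is
            // compared as an unsigned quantity.
            a_hi < b_hi || (a_hi == b_hi && a_lo < b_lo)
        }
        let samples = [i128::MIN, -2, -1, 0, 1, 2, i128::MAX];
        for &a in &samples {
            for &b in &samples {
                assert_eq!(lt128(a, b), a < b, "a={}, b={}", a, b);
            }
        }
    }
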
    #[test]
    fn select_i128_uses_pair_csel_ops() {
        let mf = select_simple(|b| {
            let cond = b.const_bool(true);
            let x = b.const_i128(1);
            let y = b.const_i128(2);
            let _s = b.select(cond, x, y);
            b.ret_void();
        });
        let insts = &mf.blocks[0].insts;
        assert!(insts.iter().any(|i| i.opcode == ArmOpcode::CmpImm));
        assert_eq!(
            insts
                .iter()
                .filter(|i| i.opcode == ArmOpcode::CselReg)
                .count(),
            2,
            "wide i128 selects should lower with one CSEL per limb"
        );
    }

    #[test]
    fn select_coerces_mixed_gp_widths_before_csel() {
        let mf = select_simple(|b| {
            let cond = b.const_bool(true);
            let wide = b.const_i64(7);
            let narrow = b.const_i32(-1);
            let _s = b.select(cond, wide, narrow);
            b.ret_void();
        });
        let csel = mf.blocks[0]
            .insts
            .iter()
            .find(|i| i.opcode == ArmOpcode::CselReg)
            .expect("expected CSEL for mixed-width select");
        for operand in csel.operands.iter().take(3) {
            let MachineOperand::VReg(vreg) = operand else {
                continue;
            };
            assert_eq!(
                machine_vreg_class(&mf, *vreg),
                RegClass::Gp64,
                "mixed-width select operands should be coerced to the result width before CSEL"
            );
        }
    }

    #[test]
    fn csel_fusion_eliminates_cset_and_extra_cmp() {
        // ICmp used solely by a Select → CSET and CMP cond, #0 must NOT appear.
        // Only CmpReg + CselReg should be present.
        let mf = select_simple(|b| {
            let x = b.const_i32(5);
            let y = b.const_i32(10);
            let c = b.icmp(CmpOp::Le, x, y); // use_count[c] = 1, only in Select
            let _s = b.select(c, x, y);
            b.ret_void();
        });
        let insts = &mf.blocks[0].insts;
        // Must have a CMP to set flags.
        assert!(
            insts.iter().any(|i| i.opcode == ArmOpcode::CmpReg),
            "expected CmpReg for ICmp"
        );
        // Must have CSEL to select the value.
        assert!(
            insts.iter().any(|i| i.opcode == ArmOpcode::CselReg),
            "expected CselReg for Select"
        );
        // Must NOT have CSET (ICmp boolean materialization is suppressed).
        assert!(
            !insts.iter().any(|i| i.opcode == ArmOpcode::Cset),
            "CSET should be suppressed when ICmp feeds only a Select"
        );
        // Must NOT have a second CmpImm (CMP cond, #0 is suppressed).
        assert!(
            !insts.iter().any(|i| i.opcode == ArmOpcode::CmpImm),
            "CMP cond,#0 should be suppressed when CSEL uses flags directly"
        );
    }

    #[test]
    fn csel_no_fusion_when_icmp_has_multiple_uses() {
        // ICmp used by both a Select and another instruction → CSET is kept.
        let mf = select_simple(|b| {
            let x = b.const_i32(5);
            let y = b.const_i32(10);
            let c = b.icmp(CmpOp::Le, x, y); // use_count[c] = 2
            let _s = b.select(c, x, y);
            // Also use `c` in a logical NOT to force a second use.
            let _n = b.not(c);
            b.ret_void();
        });
        let insts = &mf.blocks[0].insts;
        // CSET must still be emitted because `c` has multiple uses.
        assert!(
            insts.iter().any(|i| i.opcode == ArmOpcode::Cset),
            "CSET should remain when ICmp has multiple uses"
        );
    }

    #[test]
    fn select_fadd() {
        let mf = select_simple(|b| {
            let x = b.const_f64(1.0);
            let y = b.const_f64(2.0);
            let _z = b.fadd(x, y);
            b.ret_void();
        });
        assert!(mf.blocks[0]
            .insts
            .iter()
            .any(|i| i.opcode == ArmOpcode::FaddD));
    }

    #[test]
    fn select_alloca_and_store() {
        let mf = select_simple(|b| {
            let addr = b.alloca(IrType::Int(IntWidth::I32));
            let val = b.const_i32(42);
            b.store(val, addr);
            b.ret_void();
        });
        // Should have SubImm (address materialization from FP) and StrImm.
        assert!(mf.blocks[0]
            .insts
            .iter()
            .any(|i| i.opcode == ArmOpcode::SubImm));
        assert!(mf.blocks[0]
            .insts
            .iter()
            .any(|i| i.opcode == ArmOpcode::StrImm));
    }

    #[test]
    fn select_branch() {
        let mf = select_simple(|b| {
            let cond = b.const_bool(true);
            let bb_t = b.create_block("then");
            let bb_f = b.create_block("else");
            b.cond_branch(cond, bb_t, vec![], bb_f, vec![]);

            b.set_block(bb_t);
            b.ret_void();
            b.set_block(bb_f);
            b.ret_void();
        });
        // Entry block should have CmpImm + BCond + B.
        assert!(mf.blocks[0]
            .insts
            .iter()
            .any(|i| i.opcode == ArmOpcode::BCond));
    }

    #[test]
    fn select_call() {
        let mf = select_simple(|b| {
            b.runtime_call(crate::ir::inst::RuntimeFunc::PrintInt, vec![], IrType::Void);
            b.ret_void();
        });
        assert!(mf.blocks[0].insts.iter().any(|i| i.opcode == ArmOpcode::Bl));
    }

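    // A small companion check on `runtime_func_symbol` itself, grounded
    // directly in the match arms above: with no argument type information,
    // PrintInt falls back to the 32-bit symbol, and fixed-name runtime
    // functions map straight through. No ABI state is involved.
    #[test]
    fn runtime_symbols_dispatch_on_arg_width() {
        use crate::ir::inst::RuntimeFunc;
        assert_eq!(runtime_func_symbol(&RuntimeFunc::PrintInt, &[]), "afs_print_int");
        assert_eq!(runtime_func_symbol(&RuntimeFunc::PrintReal, &[]), "afs_print_real");
    }
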
    #[test]
    fn select_call_arg_from_later_block_alloca_has_preallocated_vreg() {
        let mut func = Function::new("test".into(), vec![], IrType::Void);
        {
            let mut b = FuncBuilder::new(&mut func);
            let use_block = b.create_block("use");
            let def_block = b.create_block("def");

            b.branch(def_block, vec![]);

            b.set_block(use_block);
            let dummy = b.const_i64(7);
            b.call(
                FuncRef::External("_callee".into()),
                vec![dummy],
                IrType::Void,
            );
            b.ret_void();

            b.set_block(def_block);
            let slot = b.alloca(IrType::Ptr(Box::new(IrType::Int(IntWidth::I8))));
            b.call(
                FuncRef::External("_callee".into()),
                vec![slot],
                IrType::Void,
            );
            b.branch(use_block, vec![]);
        }

        let mf = select_function(&func);
        assert!(
            mf.blocks.iter().any(|block| {
                block.insts.iter().any(|inst| {
                    inst.opcode == ArmOpcode::SubImm
                        && matches!(inst.operands.first(), Some(MachineOperand::VReg(_)))
                })
            }),
            "alloca address should materialize into a preallocated vreg",
        );
        assert!(
            mf.blocks
                .iter()
                .flat_map(|block| block.insts.iter())
                .filter(|inst| inst.opcode == ArmOpcode::Bl)
                .count()
                >= 2,
            "both calls should lower successfully without an unmapped alloca arg vreg",
        );
    }

    #[test]
    fn select_i128_runtime_print_uses_wide_symbol_and_pair_regs() {
        let mf = select_simple(|b| {
            let wide = b.const_i128(170141183460469231731687303715884105727i128);
            b.runtime_call(
                crate::ir::inst::RuntimeFunc::PrintInt,
                vec![wide],
                IrType::Void,
            );
            b.ret_void();
        });
        let asm = crate::codegen::emit::emit_function(&mf);
        assert!(
            asm.contains("bl _afs_print_int128"),
            "runtime i128 print should call the wide symbol:\n{}",
            asm
        );
        assert!(
            asm.contains("ldp x0, x1"),
            "runtime i128 print should marshal the value through the pair-register ABI:\n{}",
            asm
        );
    }

    #[test]
    fn prologue_and_epilogue() {
        let mf = select_simple(|b| {
            b.ret_void();
        });
        let insts = &mf.blocks[0].insts;
        assert_eq!(
            insts[0].opcode,
            ArmOpcode::StpPre,
            "first inst should be STP (prologue)"
        );
        assert_eq!(
            insts[1].opcode,
            ArmOpcode::AddImm,
            "second inst should be ADD FP, SP, #offset"
        );
        assert!(
            insts.iter().any(|i| i.opcode == ArmOpcode::Ret),
            "should have RET"
        );
    }

    #[test]
    fn const_zero_uses_zr() {
        let mf = select_simple(|b| {
            b.const_i32(0);
            b.ret_void();
        });
        // const_i32(0) should use MOV dest, WZR (32-bit zero register).
        let insts = &mf.blocks[0].insts;
        let has_mov_zr = insts.iter().any(|i| {
            i.opcode == ArmOpcode::MovReg
                && i.operands.iter().any(|o| {
                    matches!(
                        o,
                        MachineOperand::PhysReg(PhysReg::Xzr)
                            | MachineOperand::PhysReg(PhysReg::Wzr)
                    )
                })
        });
        assert!(has_mov_zr, "const 0 should use MOV from XZR or WZR");
    }

    // ---- Parallel-copy / branch arg copy tests ----
    //
    // The branch arg copy resolver in `emit_branch_arg_copies` handles
    // cross-edge moves into block params. When the source/destination
    // graph contains a cycle, the resolver routes one copy through a
    // scratch vreg. These tests construct minimal IR functions that
    // exercise each topology, run isel, and inspect the resulting move
    // count in the source machine block. A standalone sketch of the
    // cycle-breaking strategy follows below.

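    // A minimal, self-contained sketch of that strategy (hypothetical
    // helper, not the production resolver): `pending` holds (dest, src)
    // vreg ids. Emit any copy whose dest is not also a pending source;
    // when only cycles remain, break one by routing its source through
    // a scratch id.
    fn resolve_parallel_copy_sketch(pending: &mut Vec<(u32, u32)>) -> Vec<(u32, u32)> {
        const SCRATCH: u32 = u32::MAX;
        let mut moves = Vec::new();
        pending.retain(|&(dst, src)| dst != src); // self-moves are free
        while !pending.is_empty() {
            if let Some(i) = (0..pending.len())
                .find(|&i| !pending.iter().any(|&(_, src)| src == pending[i].0))
            {
                // Safe copy: nothing still reads this destination.
                moves.push(pending.remove(i));
            } else {
                // Pure cycle: save one source in the scratch, retarget.
                let (dst, src) = pending[0];
                moves.push((SCRATCH, src));
                pending[0] = (dst, SCRATCH);
            }
        }
        moves
    }

    #[test]
    fn parallel_copy_sketch_matches_expected_move_counts() {
        // 2-cycle → 3 moves; 3-cycle → 4 moves; cycle + tail → 4 moves,
        // mirroring the isel-level expectations in the tests below.
        let mut two_cycle = vec![(0u32, 1u32), (1, 0)];
        assert_eq!(resolve_parallel_copy_sketch(&mut two_cycle).len(), 3);
        let mut three_cycle = vec![(0u32, 1u32), (1, 2), (2, 0)];
        assert_eq!(resolve_parallel_copy_sketch(&mut three_cycle).len(), 4);
        let mut cycle_plus_tail = vec![(0u32, 1u32), (1, 0), (2, 3)];
        assert_eq!(resolve_parallel_copy_sketch(&mut cycle_plus_tail).len(), 4);
    }
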
    /// Helper: count vreg→vreg moves of the given opcode in a block,
    /// excluding moves that target a physical register (those are
    /// epilogue/return marshaling, not parallel copies).
    fn count_vreg_moves(block: &MachineBlock, opcode: ArmOpcode) -> usize {
        block
            .insts
            .iter()
            .filter(|i| i.opcode == opcode)
            .filter(|i| {
                // True parallel copies are VReg → VReg.
                matches!(i.operands.first(), Some(MachineOperand::VReg(_)))
                    && matches!(i.operands.get(1), Some(MachineOperand::VReg(_)))
            })
            .count()
    }

    fn find_block<'a>(mf: &'a MachineFunction, contains: &str) -> &'a MachineBlock {
        mf.blocks
            .iter()
            .find(|b| b.label.contains(contains))
            .unwrap_or_else(|| {
                panic!(
                    "no machine block containing '{}' (have: {:?})",
                    contains,
                    mf.blocks.iter().map(|b| &b.label).collect::<Vec<_>>(),
                )
            })
    }

    #[test]
    fn branch_arg_2_cycle_routes_through_scratch() {
        // body branches to header swapping the two int params:
        //   br header(pb, pa)
        // pending = [(pa,pb), (pb,pa)] — pure 2-cycle, requires:
        //   tmp = pb; pb = pa; pa = tmp (3 moves)
        let mut func = Function::new("test".into(), vec![], IrType::Void);
        {
            let mut b = FuncBuilder::new(&mut func);
            let header = b.create_block("header");
            let pa = b.add_block_param(header, IrType::Int(IntWidth::I32));
            let pb = b.add_block_param(header, IrType::Int(IntWidth::I32));
            let body = b.create_block("body");
            let exit = b.create_block("exit");

            let v0 = b.const_i32(1);
            let v1 = b.const_i32(2);
            b.branch(header, vec![v0, v1]);

            b.set_block(header);
            b.cond_branch(pa, body, vec![], exit, vec![]);

            b.set_block(body);
            b.branch(header, vec![pb, pa]);

            b.set_block(exit);
            b.ret_void();
        }
        let mf = select_function(&func);
        let body_mb = find_block(&mf, "body");
        let moves = count_vreg_moves(body_mb, ArmOpcode::MovReg);
        assert_eq!(
            moves, 3,
            "2-cycle should emit 3 vreg→vreg moves (scratch + 2 swaps), got {}: {:#?}",
            moves, body_mb.insts,
        );
    }

    #[test]
    fn branch_arg_3_cycle_routes_through_scratch() {
        // br header(pb, pc, pa) — rotate three params left.
        // pending = [(pa,pb),(pb,pc),(pc,pa)]
        // Resolution: tmp = pb; pb = pc; pc = pa; pa = tmp (4 moves)
        let mut func = Function::new("test".into(), vec![], IrType::Void);
        {
            let mut b = FuncBuilder::new(&mut func);
            let header = b.create_block("header");
            let pa = b.add_block_param(header, IrType::Int(IntWidth::I32));
            let pb = b.add_block_param(header, IrType::Int(IntWidth::I32));
            let pc = b.add_block_param(header, IrType::Int(IntWidth::I32));
            let body = b.create_block("body");
            let exit = b.create_block("exit");

            let v0 = b.const_i32(1);
            let v1 = b.const_i32(2);
            let v2 = b.const_i32(3);
            b.branch(header, vec![v0, v1, v2]);

            b.set_block(header);
            b.cond_branch(pa, body, vec![], exit, vec![]);

            b.set_block(body);
            b.branch(header, vec![pb, pc, pa]);

            b.set_block(exit);
            b.ret_void();
        }
        let mf = select_function(&func);
        let body_mb = find_block(&mf, "body");
        let moves = count_vreg_moves(body_mb, ArmOpcode::MovReg);
        assert_eq!(
            moves, 4,
            "3-cycle should emit 4 vreg→vreg moves (scratch + 3 rotates), got {}: {:#?}",
            moves, body_mb.insts,
        );
    }

    #[test]
    fn branch_arg_cycle_plus_independent_tail() {
        // 2-cycle on (pa,pb) plus an independent (pc <- v_extra) tail.
        //   br header(pb, pa, v_extra)
        // The tail (pc, v_extra) is always safe and emits as a single
        // move; the 2-cycle adds 3 moves for a total of 4.
        let mut func = Function::new("test".into(), vec![], IrType::Void);
        {
            let mut b = FuncBuilder::new(&mut func);
            let header = b.create_block("header");
            let pa = b.add_block_param(header, IrType::Int(IntWidth::I32));
            let pb = b.add_block_param(header, IrType::Int(IntWidth::I32));
            let _pc = b.add_block_param(header, IrType::Int(IntWidth::I32));
            let body = b.create_block("body");
            let exit = b.create_block("exit");

            let v0 = b.const_i32(1);
            let v1 = b.const_i32(2);
            let v2 = b.const_i32(3);
            b.branch(header, vec![v0, v1, v2]);

            b.set_block(header);
            b.cond_branch(pa, body, vec![], exit, vec![]);

            b.set_block(body);
            // Give pc a value defined in `body` itself so the tail copy
            // stays independent of the pa/pb cycle.
            let v3 = b.const_i32(99);
            b.branch(header, vec![pb, pa, v3]);

            b.set_block(exit);
            b.ret_void();
        }
        let mf = select_function(&func);
        let body_mb = find_block(&mf, "body");
        let moves = count_vreg_moves(body_mb, ArmOpcode::MovReg);
        assert_eq!(
            moves, 4,
            "cycle+tail should emit 4 vreg→vreg moves (3 for cycle + 1 for tail), got {}: {:#?}",
            moves, body_mb.insts,
        );
    }

    #[test]
    fn branch_arg_mixed_gp_fp_classes() {
        // Two int params and two float params, all swapped pairwise.
        // pending splits into a GP 2-cycle and an FP 2-cycle, each of
        // which independently needs a scratch.
        // Expected: 3 GP MovReg + 3 FP FmovReg = 6 total moves.
        let mut func = Function::new("test".into(), vec![], IrType::Void);
        {
            let mut b = FuncBuilder::new(&mut func);
            let header = b.create_block("header");
            let ia = b.add_block_param(header, IrType::Int(IntWidth::I32));
            let ib = b.add_block_param(header, IrType::Int(IntWidth::I32));
            let fa = b.add_block_param(header, IrType::Float(FloatWidth::F64));
            let fb = b.add_block_param(header, IrType::Float(FloatWidth::F64));
            let body = b.create_block("body");
            let exit = b.create_block("exit");

            let v0 = b.const_i32(1);
            let v1 = b.const_i32(2);
            let f0 = b.const_f64(1.0);
            let f1 = b.const_f64(2.0);
            b.branch(header, vec![v0, v1, f0, f1]);

            b.set_block(header);
            b.cond_branch(ia, body, vec![], exit, vec![]);

            b.set_block(body);
            // Swap both pairs: ints (ib, ia) and floats (fb, fa).
            b.branch(header, vec![ib, ia, fb, fa]);

            b.set_block(exit);
            b.ret_void();
        }
        let mf = select_function(&func);
        let body_mb = find_block(&mf, "body");
        let gp_moves = count_vreg_moves(body_mb, ArmOpcode::MovReg);
        let fp_moves = count_vreg_moves(body_mb, ArmOpcode::FmovReg);
        assert_eq!(
            gp_moves, 3,
            "GP 2-cycle should emit 3 MovReg, got {}: {:#?}",
            gp_moves, body_mb.insts,
        );
        assert_eq!(
            fp_moves, 3,
            "FP 2-cycle should emit 3 FmovReg, got {}: {:#?}",
            fp_moves, body_mb.insts,
        );
    }

    #[test]
    fn logical_arrays_use_default_kind_storage_for_stack_slots() {
        // Default-kind logicals occupy 4 bytes each, the same stack
        // footprint as default-kind integers: 3 elements → 12 bytes.
        assert_eq!(alloca_size(&IrType::Array(Box::new(IrType::Bool), 3)), 12);
        assert_eq!(
            alloca_size(&IrType::Array(Box::new(IrType::Int(IntWidth::I32)), 3)),
            12
        );
    }

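    #[test]
    fn vector_allocas_reserve_a_full_neon_quadword() {
        // Straight from the alloca_size table above: any supported vector
        // shape occupies one 128-bit (16-byte) NEON register's worth.
        let ty = IrType::Vector {
            lanes: 4,
            elem: Box::new(IrType::Int(IntWidth::I32)),
        };
        assert_eq!(alloca_size(&ty), 16);
    }
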
    // ---- VShape mapping tests (Sprint 12 Stage 2 isel hookup) ----

    #[test]
    fn vshape_recognizes_4xi32() {
        let ty = IrType::Vector {
            lanes: 4,
            elem: Box::new(IrType::Int(IntWidth::I32)),
        };
        assert_eq!(VShape::from_ir(&ty), Some(VShape::V4S));
        assert!(!VShape::V4S.is_float());
    }

    #[test]
    fn vshape_recognizes_2xf64() {
        let ty = IrType::Vector {
            lanes: 2,
            elem: Box::new(IrType::Float(FloatWidth::F64)),
        };
        assert_eq!(VShape::from_ir(&ty), Some(VShape::F2D));
        assert!(VShape::F2D.is_float());
    }

    #[test]
    fn vshape_rejects_unsupported_shape() {
        // 3 lanes is not a NEON shape; we already verified that
        // verify.rs rejects it. VShape::from_ir simply returns None
        // and the isel arm falls back to Nop.
        let ty = IrType::Vector {
            lanes: 3,
            elem: Box::new(IrType::Int(IntWidth::I32)),
        };
        assert_eq!(VShape::from_ir(&ty), None);
    }

    #[test]
    fn vector_type_to_reg_class_returns_v128() {
        let ty = IrType::Vector {
            lanes: 4,
            elem: Box::new(IrType::Float(FloatWidth::F32)),
        };
        assert_eq!(type_to_reg_class(&ty), RegClass::V128);
    }

    /// End-to-end: build a tiny IR function that adds two 4×f32
    /// vectors and walk it through isel. The resulting MachineFunction
    /// must contain at least one `FaddV4S` opcode.
    #[test]
    fn isel_lowers_vadd_4xf32_to_faddv4s() {
        use crate::codegen::mir::ArmOpcode;

        let v_ty = IrType::Vector {
            lanes: 4,
            elem: Box::new(IrType::Float(FloatWidth::F32)),
        };
        let mut func = Function::new("vadd_test".into(), vec![], IrType::Void);
        {
            let mut b = FuncBuilder::new(&mut func);
            // Pointer operands synthesized via allocas for the
            // smoke test — keeps the body small but exercises the
            // VLoad / VAdd / VStore chain.
            let p_a = b.alloca(v_ty.clone());
            let p_b = b.alloca(v_ty.clone());
            let p_dst = b.alloca(v_ty.clone());
            let va = b.vload(p_a, v_ty.clone());
            let vb = b.vload(p_b, v_ty.clone());
            let vc = b.vadd(va, vb);
            b.vstore(vc, p_dst);
            b.ret_void();
        }

        let mf = select_function(&func);
        let opcodes: Vec<ArmOpcode> = mf
            .blocks
            .iter()
            .flat_map(|b| b.insts.iter())
            .map(|i| i.opcode)
            .collect();
        assert!(
            opcodes.contains(&ArmOpcode::FaddV4S),
            "expected FaddV4S in MIR, got {:?}",
            opcodes
        );
        assert!(
            opcodes.contains(&ArmOpcode::LdrQ),
            "expected LdrQ in MIR, got {:?}",
            opcodes
        );
        assert!(
            opcodes.contains(&ArmOpcode::StrQ),
            "expected StrQ in MIR, got {:?}",
            opcodes
        );
    }

    #[test]
    fn vector_abi_arg_uses_v0_to_v7() {
        // First 8 vector args should land in v0-v7. The 9th should
        // overflow to the stack at the next 16-byte slot.
        let mut state = AbiArgState::default();
        let v_ty = IrType::Vector {
            lanes: 4,
            elem: Box::new(IrType::Float(FloatWidth::F32)),
        };
        for expected in 0u8..8 {
            assert_eq!(
                classify_abi_arg(&v_ty, &mut state),
                AbiArgLoc::V128(expected),
                "vector arg #{} should be v{}",
                expected,
                expected
            );
        }
        // 9th vector arg overflows to stack.
        match classify_abi_arg(&v_ty, &mut state) {
            AbiArgLoc::Stack(_) => {}
            other => panic!("expected Stack overflow, got {:?}", other),
        }
    }

    #[test]
    fn vector_args_share_idx_with_float_args() {
        // AAPCS64: vector and float args draw from the same v0-v7
        // pool. A float arg should bump fp_idx, then a vector arg
        // should land at the next slot.
        let mut state = AbiArgState::default();
        let f_ty = IrType::Float(FloatWidth::F64);
        let v_ty = IrType::Vector {
            lanes: 2,
            elem: Box::new(IrType::Int(IntWidth::I64)),
        };
        assert_eq!(classify_abi_arg(&f_ty, &mut state), AbiArgLoc::Fp(0));
        assert_eq!(classify_abi_arg(&v_ty, &mut state), AbiArgLoc::V128(1));
        assert_eq!(classify_abi_arg(&f_ty, &mut state), AbiArgLoc::Fp(2));
    }

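    // Companion sketch to the two ABI tests above, assuming float args
    // draw down the same eight-slot v0-v7 pool before spilling: eight
    // F64 args should classify as Fp(0)..Fp(7), the ninth as Stack.
    #[test]
    fn float_abi_args_exhaust_v0_to_v7_then_spill() {
        let mut state = AbiArgState::default();
        let f_ty = IrType::Float(FloatWidth::F64);
        for expected in 0u8..8 {
            assert_eq!(classify_abi_arg(&f_ty, &mut state), AbiArgLoc::Fp(expected));
        }
        match classify_abi_arg(&f_ty, &mut state) {
            AbiArgLoc::Stack(_) => {}
            other => panic!("expected Stack overflow for the 9th float arg, got {:?}", other),
        }
    }
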
    #[test]
    fn isel_lowers_vfma_2xf64_to_fmlav2d() {
        use crate::codegen::mir::ArmOpcode;

        let v_ty = IrType::Vector {
            lanes: 2,
            elem: Box::new(IrType::Float(FloatWidth::F64)),
        };
        let mut func = Function::new("vfma_test".into(), vec![], IrType::Void);
        {
            let mut b = FuncBuilder::new(&mut func);
            let p_a = b.alloca(v_ty.clone());
            let p_b = b.alloca(v_ty.clone());
            let p_c = b.alloca(v_ty.clone());
            let va = b.vload(p_a, v_ty.clone());
            let vb = b.vload(p_b, v_ty.clone());
            let vc = b.vload(p_c, v_ty.clone());
            let _ = b.vfma(va, vb, vc);
            b.ret_void();
        }

        let mf = select_function(&func);
        let opcodes: Vec<ArmOpcode> = mf
            .blocks
            .iter()
            .flat_map(|b| b.insts.iter())
            .map(|i| i.opcode)
            .collect();
        assert!(
            opcodes.contains(&ArmOpcode::FmlaV2D),
            "expected FmlaV2D, got {:?}",
            opcodes
        );
    }
}