//! Assembly text emission — converts Machine IR to ARM64 assembly text.
//!
//! Produces output compatible with both afs-as and Apple's system assembler.

use super::mir::*;
use std::fmt::Write;

fn split_i128_words(value: i128) -> (u64, u64) {
    let bits = value as u128;
    (bits as u64, (bits >> 64) as u64)
}

fn emit_i128_words(out: &mut String, value: i128) {
    let (lo, hi) = split_i128_words(value);
    writeln!(out, " .quad 0x{:016x}", lo).unwrap();
    writeln!(out, " .quad 0x{:016x}", hi).unwrap();
}

fn emit_byte_values(out: &mut String, bytes: &[u8]) {
    if bytes.is_empty() {
        return;
    }
    let joined = bytes
        .iter()
        .map(|b| b.to_string())
        .collect::<Vec<String>>()
        .join(", ");
    writeln!(out, " .byte {}", joined).unwrap();
}

fn byte_array_align_log2(byte_count: u64) -> u8 {
    if byte_count >= 8 {
        3
    } else if byte_count >= 4 {
        2
    } else if byte_count >= 2 {
        1
    } else {
        0
    }
}

/// Emit module-level globals as a `.section __DATA,__data` block.
/// Each global gets a label and a directive matching its type
/// (`.long`, `.quad`, `.single`, `.double`, etc.) plus the
/// initializer value. Zero-initialized globals still emit an
/// explicit zero so the symbol resolves at link time.
///
/// Array-typed globals: the IR type is `IrType::Array(elem_ty, count)`;
/// the directive comes from the element type, while the emitted values
/// (and how many of them) come from `IntArray`/`FloatArray`
/// initializers, which carry the elements explicitly. Zero-initialized
/// arrays fall back to `.space byte_size`.
///
/// Module globals (`afs_mod_*` and `afs_common_*`) are emitted as
/// `.globl` so other translation units can reference them via USE.
/// Non-module globals (SAVE-promoted locals) stay `.private_extern`
/// to prevent cross-TU collisions (audit Maj-1).
pub fn emit_globals(globals: &[crate::ir::inst::Global]) -> String {
    use crate::ir::inst::GlobalInit;
    use crate::ir::types::{FloatWidth, IntWidth, IrType};
    let mut out = String::new();
    if globals.is_empty() {
        return out;
    }
    writeln!(out, ".section __DATA,__data").unwrap();
    for g in globals {
        let symbol = if g.name.starts_with('_') {
            g.name.clone()
        } else {
            format!("_{}", g.name)
        };
        // Module globals need external linkage for multi-file.
        let is_module_global =
            g.name.starts_with("afs_mod_") || g.name.starts_with("afs_common_");
        if is_module_global {
            writeln!(out, ".globl {}", symbol).unwrap();
        } else {
            writeln!(out, ".private_extern {}", symbol).unwrap();
        }
        // Array globals carry `IrType::Array(elem_ty, count)`. Pick the
        // directive from the element type so `.long` / `.quad` /
        // `.single` / `.double` all work correctly.
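        // Illustrative expansion (directive choice only; values come
        // from the initializer): a 3-element i32 array named `tbl`
        // with IntArray([1, 2, 3]) emits
        //     .p2align 2
        //     _tbl:
        //      .long 1
        //      .long 2
        //      .long 3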
if let IrType::Array(elem_ty, count) = &g.ty { let (align, directive, _elem_bytes, is_float) = match elem_ty.as_ref() { IrType::Int(IntWidth::I8) | IrType::Bool => { (byte_array_align_log2(*count), ".byte", 1, false) } IrType::Int(IntWidth::I16) => (1, ".short", 2, false), IrType::Int(IntWidth::I32) => (2, ".long", 4, false), IrType::Int(IntWidth::I64) => (3, ".quad", 8, false), IrType::Int(IntWidth::I128) => (4, ".quad", 16, false), IrType::Float(FloatWidth::F32) => (2, ".single", 4, true), IrType::Float(FloatWidth::F64) => (3, ".double", 8, true), _ => (3, ".quad", 8, false), }; if align > 0 { writeln!(out, ".p2align {}", align).unwrap(); } writeln!(out, "{}:", symbol).unwrap(); match &g.initializer { Some(GlobalInit::IntArray(vs)) if matches!(elem_ty.as_ref(), IrType::Int(IntWidth::I128)) => { for v in vs { emit_i128_words(&mut out, *v); } } Some(GlobalInit::IntArray(vs)) if !is_float => { for v in vs { writeln!(out, " {} {}", directive, v).unwrap(); } } Some(GlobalInit::FloatArray(vs)) if is_float => { for v in vs { writeln!(out, " {} {}", directive, v).unwrap(); } } Some(GlobalInit::String(bytes)) => { emit_byte_values(&mut out, bytes); let total_bytes = g.ty.size_bytes() as usize; if bytes.len() < total_bytes { writeln!(out, " .space {}", total_bytes - bytes.len()).unwrap(); } } _ => { // Nested arrays (for example arrays of byte-packed derived // values) don't have a scalar element directive. Emit their // zero-initialized storage using the full IR type size // instead of falling back to a bogus ".quad * count" size. let byte_size = g.ty.size_bytes(); writeln!(out, " .space {}", byte_size).unwrap(); } } continue; } if matches!(g.ty, IrType::Int(IntWidth::I128)) { writeln!(out, ".p2align 4").unwrap(); writeln!(out, "{}:", symbol).unwrap(); match &g.initializer { Some(GlobalInit::Int(v)) => emit_i128_words(&mut out, *v), Some(GlobalInit::Zero) | None => emit_i128_words(&mut out, 0), _ => writeln!(out, " .space 16").unwrap(), } continue; } // Scalar globals: pick alignment + storage directive. // Audit Med-5: NaN/Inf must round-trip portably across // assemblers. Apple's `as` accepts `.single NaN` but GNU // binutils does not. Emit non-finite floats as their // bit-pattern via `.long` / `.quad` so the same .s file // assembles cleanly on both. let is_nonfinite_float = matches!( (&g.ty, &g.initializer), (IrType::Float(_), Some(GlobalInit::Float(v))) if !v.is_finite() ); let (align, directive, default_zero) = if is_nonfinite_float { match &g.ty { IrType::Float(FloatWidth::F32) => (2, ".long", "0"), _ => (3, ".quad", "0"), } } else { match &g.ty { IrType::Int(IntWidth::I8) | IrType::Bool => (0, ".byte", "0"), IrType::Int(IntWidth::I16) => (1, ".short", "0"), IrType::Int(IntWidth::I32) => (2, ".long", "0"), IrType::Int(IntWidth::I64) => (3, ".quad", "0"), IrType::Float(FloatWidth::F32) => (2, ".single", "0.0"), IrType::Float(FloatWidth::F64) => (3, ".double", "0.0"), _ => (3, ".quad", "0"), // pointers and aggregates: 8-byte slot } }; if align > 0 { writeln!(out, ".p2align {}", align).unwrap(); } writeln!(out, "{}:", symbol).unwrap(); let value = match &g.initializer { Some(GlobalInit::Int(v)) => v.to_string(), Some(GlobalInit::Float(v)) => { if v.is_finite() { format!("{}", v) } else { // Bit-pattern emission for NaN / ±Inf. 
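                    // e.g. f32 +Inf → `.long 0x7f800000`; f64 +Inf →
                    // `.quad 0x7ff0000000000000` (exact IEEE-754 bit
                    // patterns, so both assemblers agree).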
match &g.ty { IrType::Float(FloatWidth::F32) => { format!("0x{:08x}", (*v as f32).to_bits()) } _ => format!("0x{:016x}", v.to_bits()), } } } Some(GlobalInit::Zero) | None => default_zero.into(), Some(GlobalInit::String(bytes)) if matches!(g.ty, IrType::Int(IntWidth::I8) | IrType::Bool) => { bytes.first().copied().unwrap_or(0).to_string() } Some(GlobalInit::String(_)) => default_zero.into(), Some(GlobalInit::IntArray(_)) | Some(GlobalInit::FloatArray(_)) => { // Array initializer on a scalar-typed global — // shouldn't happen, but emit zero as a safe fallback. default_zero.into() } }; writeln!(out, " {} {}", directive, value).unwrap(); } out } /// Emit a machine function as ARM64 assembly text. pub fn emit_function(mf: &MachineFunction) -> String { let mut out = String::new(); // Function directive. if mf.internal_only { writeln!(out, ".private_extern _{}", mf.name).unwrap(); } else { writeln!(out, ".globl _{}", mf.name).unwrap(); } writeln!(out, ".p2align 2").unwrap(); writeln!(out, "_{}:", mf.name).unwrap(); for block in &mf.blocks { // Don't re-emit entry label (it's the function label). if block.id != MBlockId(0) { writeln!(out, "{}:", block.label).unwrap(); } for inst in &block.insts { writeln!(out, " {}", emit_inst(inst, mf)).unwrap(); } } // Constant pool. if !mf.const_pool.is_empty() { writeln!(out).unwrap(); writeln!(out, ".section __DATA,__const").unwrap(); for (i, entry) in mf.const_pool.iter().enumerate() { let label = const_pool_label(&mf.name, i as u32); match entry { ConstPoolEntry::F32(v) => { writeln!(out, ".p2align 2").unwrap(); writeln!(out, "{}:", label).unwrap(); // Emit as hex integer to avoid decimal expansion issues // with large/small floats that the assembler can't parse. writeln!(out, " .long 0x{:08x}", v.to_bits()).unwrap(); } ConstPoolEntry::F64(v) => { writeln!(out, ".p2align 3").unwrap(); writeln!(out, "{}:", label).unwrap(); writeln!(out, " .quad 0x{:016x}", v.to_bits()).unwrap(); } ConstPoolEntry::I64(v) => { writeln!(out, ".p2align 3").unwrap(); writeln!(out, "{}:", label).unwrap(); writeln!(out, " .quad {}", v).unwrap(); } ConstPoolEntry::Bytes(b) => { writeln!(out, ".p2align 3").unwrap(); writeln!(out, "{}:", label).unwrap(); write!(out, " .ascii \"").unwrap(); for &byte in b { match byte { b'\\' => write!(out, "\\\\").unwrap(), b'"' => write!(out, "\\\"").unwrap(), b'\n' => write!(out, "\\n").unwrap(), b'\t' => write!(out, "\\t").unwrap(), b if b.is_ascii_graphic() || b == b' ' => { write!(out, "{}", b as char).unwrap(); } b => write!(out, "\\x{:02x}", b).unwrap(), } } writeln!(out, "\"").unwrap(); } } } } out } /// Format `OP sp, sp, #N` (or `add x29, sp, #N`), falling back /// to a 2-3 instruction synthesized sequence via the AAPCS64 /// scratch register x16 (IP0) when N exceeds the 12-bit /// immediate range. x16 is free in the prologue/epilogue per /// AAPCS64 — it has no caller-saved value at function entry /// and can be clobbered before/after the FP/LR save. /// /// Audit6 BLOCKING-5 (related to BLOCKING-4): functions whose /// frame size exceeds 4095 bytes used to emit raw /// `sub sp, sp, #4144` and the assembler rejected the /// immediate. This came up after audit6 BLOCKING-4 added /// per-allocate descriptor buffers, but it's a latent bug that /// any large-frame function would hit. fn fmt_sp_imm(op: &str, dest: &str, base: &str, n: i64) -> String { if (0..=4095).contains(&n) { return format!("{} {}, {}, #{}", op, dest, base, n); } // Synthesize the immediate in x16 then use the register form. 
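    // Worked example: n = 5616 fits in 16 bits, so `movz x16, #5616`
    // then `sub sp, sp, x16`; n = 74565 (0x12345) splits into
    // `movz x16, #9029` + `movk x16, #1, lsl #16`. Frame sizes are
    // assumed to fit in 32 bits, since only two 16-bit chunks are built.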
let lo = n & 0xFFFF; let hi = (n >> 16) & 0xFFFF; let mov = if hi == 0 { format!("movz x16, #{}", lo) } else { format!("movz x16, #{}\n movk x16, #{}, lsl #16", lo, hi) }; format!("{}\n {} {}, {}, x16", mov, op, dest, base) } fn fmt_stack_alloc(frame_size: i64) -> String { // Apple Silicon uses large guard pages, so jumping the stack pointer // down by a huge frame in one shot can skip the guard and fault on the // first real touch. Probe the stack one chunk at a time for large // frames to keep growth fault-safe. const STACK_PROBE_STRIDE: i64 = 16 * 1024; if frame_size <= STACK_PROBE_STRIDE { return fmt_sp_imm("sub", "sp", "sp", frame_size); } let mut lines = Vec::new(); let mut remaining = frame_size; while remaining > 0 { let step = remaining.min(STACK_PROBE_STRIDE); lines.push(fmt_sp_imm("sub", "sp", "sp", step)); lines.push("str xzr, [sp]".to_string()); remaining -= step; } lines.join("\n ") } fn fmt_u64_imm(reg: &str, value: u64) -> String { let mut parts = Vec::new(); for shift in [0u32, 16, 32, 48] { let chunk = ((value >> shift) & 0xFFFF) as u16; if chunk == 0 && !parts.is_empty() { continue; } if parts.is_empty() { parts.push(format!("movz {}, #{}", reg, chunk)); } else { parts.push(format!("movk {}, #{}, lsl #{}", reg, chunk, shift)); } } if parts.is_empty() { format!("movz {}, #0", reg) } else { parts.join("\n ") } } fn fmt_addr_with_offset(dest: &str, base: &str, offset: i64, scratch: &str) -> String { if offset == 0 { return format!("mov {}, {}", dest, base); } if (0..=4095).contains(&offset) { return format!("add {}, {}, #{}", dest, base, offset); } if (-4095..=-1).contains(&offset) { return format!("sub {}, {}, #{}", dest, base, -offset); } let imm = fmt_u64_imm(scratch, offset.unsigned_abs()); let op = if offset.is_negative() { "sub" } else { "add" }; format!("{}\n {} {}, {}, {}", imm, op, dest, base, scratch) } /// Emit a single machine instruction as assembly text. Public so the /// branch-relaxation pass can count emit-time instruction bytes /// directly rather than re-deriving each opcode's expansion rules. pub fn emit_inst_text(inst: &MachineInst, mf: &MachineFunction) -> String { emit_inst(inst, mf) } /// Emit a single machine instruction as assembly text. fn emit_inst(inst: &MachineInst, mf: &MachineFunction) -> String { match inst.opcode { ArmOpcode::AddReg => format!( "add {}, {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]), op_str(&inst.operands[2]) ), ArmOpcode::AddsReg => format!( "adds {}, {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]), op_str(&inst.operands[2]) ), ArmOpcode::AdcReg => format!( "adc {}, {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]), op_str(&inst.operands[2]) ), ArmOpcode::AddImm => { let dest = op_str(&inst.operands[0]); let base = op_str(&inst.operands[1]); let imm: i64 = match &inst.operands[2] { MachineOperand::FrameSlot(off) => *off as i64, MachineOperand::Imm(-1) => { // Sentinel: prologue FP setup → frame_size - 16 mf.frame.size.saturating_sub(16) as i64 } MachineOperand::Imm(v) => *v, _ => return format!("add {}, {}, {}", dest, base, op_str(&inst.operands[2])), }; // Both `add x29, sp, #N` (FP setup) and `add Xd, Xn, #N` // need the > 4095 fallback. Use the same scratch // synthesis since x16 is safe in the prologue. 
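            // e.g. the audit6 BLOCKING-5 case: a 4144-byte frame sets up
            // FP as `movz x16, #4128` + `add x29, sp, x16`, since #4128
            // no longer fits the 12-bit immediate.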
fmt_sp_imm("add", &dest, &base, imm) } ArmOpcode::SubReg => format!( "sub {}, {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]), op_str(&inst.operands[2]) ), ArmOpcode::SubsReg => format!( "subs {}, {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]), op_str(&inst.operands[2]) ), ArmOpcode::SbcReg => format!( "sbc {}, {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]), op_str(&inst.operands[2]) ), ArmOpcode::SubImm => { let imm: i64 = match &inst.operands[2] { MachineOperand::Imm(-1) => { // Sentinel: epilogue SP restore → frame_size - 16 mf.frame.size.saturating_sub(16) as i64 } MachineOperand::Imm(v) => *v, _ => 0, }; let dest = op_str(&inst.operands[0]); let base = op_str(&inst.operands[1]); fmt_sp_imm("sub", &dest, &base, imm) } ArmOpcode::Mul => format!( "mul {}, {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]), op_str(&inst.operands[2]) ), ArmOpcode::Sdiv => format!( "sdiv {}, {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]), op_str(&inst.operands[2]) ), ArmOpcode::Madd => format!( "madd {}, {}, {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]), op_str(&inst.operands[2]), op_str(&inst.operands[3]) ), ArmOpcode::Msub => format!( "msub {}, {}, {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]), op_str(&inst.operands[2]), op_str(&inst.operands[3]) ), ArmOpcode::Neg => format!( "neg {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]) ), ArmOpcode::AndReg => format!( "and {}, {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]), op_str(&inst.operands[2]) ), ArmOpcode::OrrReg => format!( "orr {}, {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]), op_str(&inst.operands[2]) ), ArmOpcode::EorReg => format!( "eor {}, {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]), op_str(&inst.operands[2]) ), ArmOpcode::OrnReg => format!( "orn {}, {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]), op_str(&inst.operands[2]) ), ArmOpcode::LslReg => format!( "lsl {}, {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]), op_str(&inst.operands[2]) ), ArmOpcode::LsrReg => format!( "lsr {}, {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]), op_str(&inst.operands[2]) ), ArmOpcode::AsrReg => format!( "asr {}, {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]), op_str(&inst.operands[2]) ), ArmOpcode::Mvn => format!( "mvn {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]) ), ArmOpcode::Clz => format!( "clz {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]) ), ArmOpcode::Rbit => format!( "rbit {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]) ), ArmOpcode::CmpReg => format!( "cmp {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]) ), ArmOpcode::CmpImm => format!( "cmp {}, #{}", op_str(&inst.operands[0]), if let MachineOperand::Imm(v) = &inst.operands[1] { *v } else { 0 } ), ArmOpcode::Cset | ArmOpcode::FCset => { let cond = if let MachineOperand::Cond(c) = &inst.operands[1] { cond_str(*c) } else { "eq" }; format!("cset {}, {}", op_str(&inst.operands[0]), cond) } ArmOpcode::CselReg => { let cond = if let MachineOperand::Cond(c) = &inst.operands[3] { cond_str(*c) } else { "eq" }; format!( "csel {}, {}, {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]), op_str(&inst.operands[2]), cond ) } ArmOpcode::FCmpReg => format!( "fcmp {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]) ), ArmOpcode::FcselReg => { let cond = if let MachineOperand::Cond(c) = &inst.operands[3] { cond_str(*c) } else { 
"eq" }; format!( "fcsel {}, {}, {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]), op_str(&inst.operands[2]), cond ) } ArmOpcode::FaddS | ArmOpcode::FaddD => format!( "fadd {}, {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]), op_str(&inst.operands[2]) ), ArmOpcode::FsubS | ArmOpcode::FsubD => format!( "fsub {}, {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]), op_str(&inst.operands[2]) ), ArmOpcode::FmulS | ArmOpcode::FmulD => format!( "fmul {}, {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]), op_str(&inst.operands[2]) ), ArmOpcode::FdivS | ArmOpcode::FdivD => format!( "fdiv {}, {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]), op_str(&inst.operands[2]) ), ArmOpcode::FnegS | ArmOpcode::FnegD => format!( "fneg {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]) ), ArmOpcode::FabsS | ArmOpcode::FabsD => format!( "fabs {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]) ), ArmOpcode::FsqrtS | ArmOpcode::FsqrtD => format!( "fsqrt {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]) ), // Fused multiply-add/subtract: 4-operand (dest, Sn, Sm, Sa). // FMADD Sd, Sn, Sm, Sa → Sd = Sa + Sn*Sm // FMSUB Sd, Sn, Sm, Sa → Sd = Sa - Sn*Sm // FNMSUB Sd, Sn, Sm, Sa → Sd = Sn*Sm - Sa ArmOpcode::FmaddS | ArmOpcode::FmaddD => format!( "fmadd {}, {}, {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]), op_str(&inst.operands[2]), op_str(&inst.operands[3]) ), ArmOpcode::FmsubS | ArmOpcode::FmsubD => format!( "fmsub {}, {}, {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]), op_str(&inst.operands[2]), op_str(&inst.operands[3]) ), ArmOpcode::FnmsubS | ArmOpcode::FnmsubD => format!( "fnmsub {}, {}, {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]), op_str(&inst.operands[2]), op_str(&inst.operands[3]) ), ArmOpcode::ScvtfSW | ArmOpcode::ScvtfDW | ArmOpcode::ScvtfSX | ArmOpcode::ScvtfDX => { format!( "scvtf {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]) ) } ArmOpcode::FcvtzsWS | ArmOpcode::FcvtzsWD | ArmOpcode::FcvtzsXS | ArmOpcode::FcvtzsXD => { format!( "fcvtzs {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]) ) } ArmOpcode::FcvtSD => format!( "fcvt {}, {}", fp_reg_str(&inst.operands[0], false), fp_reg_str(&inst.operands[1], true) ), ArmOpcode::FcvtDS => format!( "fcvt {}, {}", fp_reg_str(&inst.operands[0], true), fp_reg_str(&inst.operands[1], false) ), ArmOpcode::Movz => { let imm = if let MachineOperand::Imm(v) = &inst.operands[1] { *v } else { 0 }; let shift = if let MachineOperand::Shift(s) = &inst.operands[2] { *s } else { 0 }; if shift == 0 { format!("movz {}, #{}", op_str(&inst.operands[0]), imm) } else { format!( "movz {}, #{}, lsl #{}", op_str(&inst.operands[0]), imm, shift ) } } ArmOpcode::Movk => { let imm = if let MachineOperand::Imm(v) = &inst.operands[1] { *v } else { 0 }; let shift = if let MachineOperand::Shift(s) = &inst.operands[2] { *s } else { 0 }; format!( "movk {}, #{}, lsl #{}", op_str(&inst.operands[0]), imm, shift ) } ArmOpcode::Movn => { let imm = if let MachineOperand::Imm(v) = &inst.operands[1] { *v } else { 0 }; let shift = if let MachineOperand::Shift(s) = &inst.operands[2] { *s } else { 0 }; format!( "movn {}, #{}, lsl #{}", op_str(&inst.operands[0]), imm, shift ) } ArmOpcode::MovReg => { let dest = op_str(&inst.operands[0]); let src = op_str(&inst.operands[1]); // Handle width mismatch: w→x extend or x→w truncate. 
let dest_is_x = dest.starts_with('x'); let dest_is_w = dest.starts_with('w'); let src_is_w = src.starts_with('w'); let src_is_x = src.starts_with('x'); // Cross-register-class move: AArch64 `mov` only encodes GP↔GP // (and FP↔FP via FmovReg). When register-allocation hands us // a MovReg straddling classes, emit `fmov` which transfers // bits between an integer GPR and an SIMD/FP register. let dest_is_gp = dest_is_x || dest_is_w; let src_is_gp = src_is_x || src_is_w; let dest_is_fp = dest.starts_with('s') || dest.starts_with('d'); let src_is_fp = src.starts_with('s') || src.starts_with('d'); if dest_is_gp && src_is_fp { // GPR ← FPR: pick GPR width to match FPR (s→w, d→x). let gp = if src.starts_with('d') { if dest_is_x { dest.clone() } else { format!("x{}", &dest[1..]) } } else { if dest_is_w { dest.clone() } else { format!("w{}", &dest[1..]) } }; return format!("fmov {}, {}", gp, src); } if dest_is_fp && src_is_gp { let gp = if dest.starts_with('d') { if src_is_x { src.clone() } else { format!("x{}", &src[1..]) } } else { if src_is_w { src.clone() } else { format!("w{}", &src[1..]) } }; return format!("fmov {}, {}", dest, gp); } if dest_is_x && src_is_w { // Zero-extend 32→64: use uxtw. format!("uxtw {}, {}", dest, src) } else if dest_is_w && src_is_x { // Truncate 64→32 by reading the source register through its // 32-bit view. `mov wN, xM` is not a valid AArch64 encoding. format!("mov {}, w{}", dest, &src[1..]) } else { format!("mov {}, {}", dest, src) } } ArmOpcode::FmovReg => format!( "fmov {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]) ), ArmOpcode::Mov16B => format!( "mov.16b {}, {}", v_reg_bare(&inst.operands[0]), v_reg_bare(&inst.operands[1]), ), ArmOpcode::AddpV2D => format!( "addp.2d {}, {}, {}", v_reg_bare(&inst.operands[0]), v_reg_bare(&inst.operands[1]), v_reg_bare(&inst.operands[2]), ), ArmOpcode::FaddpV4S => format!( "faddp.4s {}, {}, {}", v_reg_bare(&inst.operands[0]), v_reg_bare(&inst.operands[1]), v_reg_bare(&inst.operands[2]), ), ArmOpcode::LdrImm | ArmOpcode::LdrFpImm | ArmOpcode::LdrsbImm | ArmOpcode::LdrshImm => { let dest = op_str(&inst.operands[0]); let base = op_str(&inst.operands[1]); let offset_val = match &inst.operands[2] { MachineOperand::FrameSlot(off) => *off as i64, MachineOperand::Imm(v) => *v, _ => 0, }; // Pick the mnemonic by opcode. LDRSB / LDRSH expect a // Wt destination (sign-extended into the lower 32 bits); // the dest operand is already a Gp32 vreg in those // cases, so the formatted register name is `w_`. let mnemonic = match inst.opcode { ArmOpcode::LdrsbImm => "ldrsb", ArmOpcode::LdrshImm => "ldrsh", _ => "ldr", }; if (-256..=255).contains(&offset_val) { format!("{} {}, [{}, #{}]", mnemonic, dest, base, offset_val) } else { // Large offset: compute address in x8, then load. format!( "{}\n {} {}, [x8]", fmt_addr_with_offset("x8", &base, offset_val, "x16"), mnemonic, dest ) } } ArmOpcode::StrImm | ArmOpcode::StrFpImm | ArmOpcode::StrbImm | ArmOpcode::StrhImm => { let src = op_str(&inst.operands[0]); let base = op_str(&inst.operands[1]); let offset_val = match &inst.operands[2] { MachineOperand::FrameSlot(off) => *off as i64, MachineOperand::Imm(v) => *v, _ => 0, }; let mnemonic = match inst.opcode { ArmOpcode::StrbImm => "strb", ArmOpcode::StrhImm => "strh", _ => "str", }; if (-256..=255).contains(&offset_val) { format!("{} {}, [{}, #{}]", mnemonic, src, base, offset_val) } else { // Large offset: compute address in x8, then store. 
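                // (The [-256, 255] window is the signed-9-bit LDUR/STUR
                // range, valid for any alignment. Illustrative: offset
                // 2048 from fp becomes `add x8, x29, #2048` followed by
                // `str x0, [x8]`.)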
                format!(
                    "{}\n {} {}, [x8]",
                    fmt_addr_with_offset("x8", &base, offset_val, "x16"),
                    mnemonic,
                    src
                )
            }
        }
        // Sprint 05: scaled-register-offset addressing. Operands are
        // [dest, base, idx, Imm(shift)]. Shift 0 elides the `, lsl #0`
        // suffix per the assembler convention.
        ArmOpcode::LdrReg | ArmOpcode::LdrFpReg | ArmOpcode::StrReg | ArmOpcode::StrFpReg => {
            let dest = op_str(&inst.operands[0]);
            let base = op_str(&inst.operands[1]);
            let idx = op_str(&inst.operands[2]);
            let shift = match &inst.operands[3] {
                MachineOperand::Imm(v) => *v,
                _ => 0,
            };
            let mnemonic = match inst.opcode {
                ArmOpcode::LdrReg | ArmOpcode::LdrFpReg => "ldr",
                ArmOpcode::StrReg | ArmOpcode::StrFpReg => "str",
                _ => unreachable!(),
            };
            if shift == 0 {
                format!("{} {}, [{}, {}]", mnemonic, dest, base, idx)
            } else {
                format!("{} {}, [{}, {}, lsl #{}]", mnemonic, dest, base, idx, shift)
            }
        }
        ArmOpcode::StpPre => {
            let frame_size = mf.frame.size as i64;
            let stp_offset = frame_size - 16;
            // The `sub sp, sp, #N` portion handles N > 4095 via x16
            // synthesis (audit6 BLOCKING-5 root cause), and probes very
            // large frames so macOS guard pages aren't skipped in one
            // jump. The `stp ... [sp, #stp_offset]` form is also bounded
            // (signed 7-bit immediate × 8 → [-512, 504]), so we fall
            // back to two `str` instructions when over. For very large
            // frames (stp_offset > 32760, the unsigned 12-bit scaled
            // max for 64-bit ldr/str), even that fails, so we synthesize
            // the address in x9 and store relative to it; that path is
            // not yet exercised by any test.
            let sub_sp = fmt_stack_alloc(frame_size);
            if stp_offset <= 504 {
                format!("{}\n stp x29, x30, [sp, #{}]", sub_sp, stp_offset)
            } else if stp_offset <= 32760 {
                format!(
                    "{}\n str x29, [sp, #{}]\n str x30, [sp, #{}]",
                    sub_sp, stp_offset, stp_offset + 8
                )
            } else {
                // Frame too large for any ldr/str unsigned immediate.
                // Synthesize the address in x9 (caller-saved scratch)
                // then store relative to x9.
                let x9_addr = fmt_sp_imm("add", "x9", "sp", stp_offset);
                format!(
                    "{}\n {}\n str x29, [x9]\n str x30, [x9, #8]",
                    sub_sp, x9_addr
                )
            }
        }
        ArmOpcode::LdpPost => {
            let frame_size = mf.frame.size as i64;
            let ldp_offset = frame_size - 16;
            let add_sp = fmt_sp_imm("add", "sp", "sp", frame_size);
            if ldp_offset <= 504 {
                format!("ldp x29, x30, [sp, #{}]\n {}", ldp_offset, add_sp)
            } else if ldp_offset <= 32760 {
                format!(
                    "ldr x29, [sp, #{}]\n ldr x30, [sp, #{}]\n {}",
                    ldp_offset,
                    ldp_offset + 8,
                    add_sp
                )
            } else {
                // Frame too large for unsigned immediate ldr.
                // Synthesize the address in x9 then reload relative to it.
                let x9_addr = fmt_sp_imm("add", "x9", "sp", ldp_offset);
                format!(
                    "{}\n ldr x29, [x9]\n ldr x30, [x9, #8]\n {}",
                    x9_addr, add_sp
                )
            }
        }
        // Non-preindex STP/LDP for callee-save pairs.
        // Operands: [src1/dst1, src2/dst2, base, imm].
        ArmOpcode::StpOffset => {
            let r1 = op_str(&inst.operands[0]);
            let r2 = op_str(&inst.operands[1]);
            let base = op_str(&inst.operands[2]);
            let off = match &inst.operands[3] {
                MachineOperand::Imm(v) => *v,
                MachineOperand::FrameSlot(v) => *v as i64,
                _ => 0,
            };
            // STP signed-offset range: 7-bit signed × 8 → [-512, 504].
            // Fall back to two individual STR instructions if out of range.
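            // e.g. off = -544 (just past -512) expands to
            //     sub x9, x29, #544
            //     str x0, [x9]
            //     str x1, [x9, #8]
            // as pinned by emit_large_negative_pair_offsets_use_scratch_addressing.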
if (-512..=504).contains(&off) { format!("stp {}, {}, [{}, #{}]", r1, r2, base, off) } else { format!( "{}\n str {}, [x9]\n str {}, [x9, #8]", fmt_addr_with_offset("x9", &base, off, "x16"), r1, r2 ) } } ArmOpcode::LdpOffset => { let r1 = op_str(&inst.operands[0]); let r2 = op_str(&inst.operands[1]); let base = op_str(&inst.operands[2]); let off = match &inst.operands[3] { MachineOperand::Imm(v) => *v, MachineOperand::FrameSlot(v) => *v as i64, _ => 0, }; // LDP signed-offset range: 7-bit signed × 8 → [-512, 504]. // Fall back to two individual LDR instructions if out of range. if (-512..=504).contains(&off) { format!("ldp {}, {}, [{}, #{}]", r1, r2, base, off) } else { format!( "{}\n ldr {}, [x9]\n ldr {}, [x9, #8]", fmt_addr_with_offset("x9", &base, off, "x16"), r1, r2 ) } } ArmOpcode::AdrpLdr => { if let MachineOperand::ConstPool(idx) = &inst.operands[1] { let label = const_pool_label(&mf.name, *idx); let dest = op_str(&inst.operands[0]); // ADRP requires a GP register. If dest is FP (s/d), use x8 as scratch. let is_fp = dest.starts_with('s') || dest.starts_with('d'); if is_fp { format!( "adrp x8, {1}@PAGE\n ldr {0}, [x8, {1}@PAGEOFF]", dest, label ) } else { format!( "adrp {0}, {1}@PAGE\n ldr {0}, [{0}, {1}@PAGEOFF]", dest, label ) } } else { "nop ; bad adrp+ldr".into() } } ArmOpcode::AdrpAdd => { let dest = op_str(&inst.operands[0]); match &inst.operands[1] { MachineOperand::ConstPool(idx) => { let label = const_pool_label(&mf.name, *idx); format!( "adrp {0}, {1}@PAGE\n add {0}, {0}, {1}@PAGEOFF", dest, label ) } MachineOperand::GlobalLabel(name) => { // Mach-O convention: globals get an underscore prefix. let sym = if name.starts_with('_') { name.clone() } else { format!("_{}", name) }; format!( "adrp {0}, {1}@PAGE\n add {0}, {0}, {1}@PAGEOFF", dest, sym ) } _ => "nop ; bad adrp+add".into(), } } ArmOpcode::B => { match &inst.operands[0] { MachineOperand::BlockRef(id) => format!("b {}", mf.block(*id).label), // Tail call to an external symbol (TCO): B _callee MachineOperand::Extern(name) => { if name.starts_with('_') { format!("b {}", name) } else { format!("b _{}", name) } } _ => "b ???".into(), } } ArmOpcode::BCond => { let cond = if let MachineOperand::Cond(c) = &inst.operands[0] { cond_str(*c) } else { "eq" }; let target = if let MachineOperand::BlockRef(id) = &inst.operands[1] { mf.block(*id).label.clone() } else { "???".into() }; format!("b.{} {}", cond, target) } ArmOpcode::Cbz | ArmOpcode::Cbnz => { let mnemonic = match inst.opcode { ArmOpcode::Cbz => "cbz", _ => "cbnz", }; let target = if let MachineOperand::BlockRef(id) = &inst.operands[1] { mf.block(*id).label.clone() } else { "???".into() }; format!("{} {}, {}", mnemonic, op_str(&inst.operands[0]), target) } ArmOpcode::Tbz | ArmOpcode::Tbnz => { let mnemonic = match inst.opcode { ArmOpcode::Tbz => "tbz", _ => "tbnz", }; let bit = if let MachineOperand::Imm(v) = &inst.operands[1] { *v } else { 0 }; let target = if let MachineOperand::BlockRef(id) = &inst.operands[2] { mf.block(*id).label.clone() } else { "???".into() }; format!( "{} {}, #{}, {}", mnemonic, op_str(&inst.operands[0]), bit, target ) } ArmOpcode::Bl => { if let MachineOperand::Extern(name) = &inst.operands[0] { // Mach-O convention: C symbols get a _ prefix. 
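                // e.g. `printf` → `bl _printf`; a name that already
                // starts with `_` passes through unchanged.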
if name.starts_with('_') { format!("bl {}", name) // already prefixed } else { format!("bl _{}", name) // add Mach-O prefix } } else { "bl ???".into() } } ArmOpcode::Blr => format!("blr {}", op_str(&inst.operands[0])), ArmOpcode::Sxtw => format!( "sxtw {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]) ), ArmOpcode::Sxth => format!( "sxth {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]) ), ArmOpcode::Sxtb => format!( "sxtb {}, {}", op_str(&inst.operands[0]), op_str(&inst.operands[1]) ), ArmOpcode::Ret => "ret".into(), ArmOpcode::Nop => "nop".into(), ArmOpcode::Brk => { let imm = if let MachineOperand::Imm(v) = &inst.operands[0] { *v } else { 1 }; format!("brk #{}", imm) } // ---- NEON SIMD vector ops (Sprint 12 Stage 2) ---- // // Each op forwards to a small helper so the lane-shape suffix // (.4s / .2d / .s[n] / .d[n]) lives in one place. ArmOpcode::AddV4S => fmt_vbinop(inst, "add", "4s"), ArmOpcode::AddV2D => fmt_vbinop(inst, "add", "2d"), ArmOpcode::SubV4S => fmt_vbinop(inst, "sub", "4s"), ArmOpcode::SubV2D => fmt_vbinop(inst, "sub", "2d"), ArmOpcode::MulV4S => fmt_vbinop(inst, "mul", "4s"), ArmOpcode::NegV4S => fmt_vunop(inst, "neg", "4s"), ArmOpcode::NegV2D => fmt_vunop(inst, "neg", "2d"), ArmOpcode::FaddV4S => fmt_vbinop(inst, "fadd", "4s"), ArmOpcode::FaddV2D => fmt_vbinop(inst, "fadd", "2d"), ArmOpcode::FsubV4S => fmt_vbinop(inst, "fsub", "4s"), ArmOpcode::FsubV2D => fmt_vbinop(inst, "fsub", "2d"), ArmOpcode::FmulV4S => fmt_vbinop(inst, "fmul", "4s"), ArmOpcode::FmulV2D => fmt_vbinop(inst, "fmul", "2d"), ArmOpcode::FdivV4S => fmt_vbinop(inst, "fdiv", "4s"), ArmOpcode::FdivV2D => fmt_vbinop(inst, "fdiv", "2d"), ArmOpcode::FnegV4S => fmt_vunop(inst, "fneg", "4s"), ArmOpcode::FnegV2D => fmt_vunop(inst, "fneg", "2d"), ArmOpcode::FabsV4S => fmt_vunop(inst, "fabs", "4s"), ArmOpcode::FabsV2D => fmt_vunop(inst, "fabs", "2d"), ArmOpcode::FsqrtV4S => fmt_vunop(inst, "fsqrt", "4s"), ArmOpcode::FsqrtV2D => fmt_vunop(inst, "fsqrt", "2d"), ArmOpcode::BslV16B => fmt_vbinop(inst, "bsl", "16b"), ArmOpcode::FcmgtV4S => fmt_vbinop(inst, "fcmgt", "4s"), ArmOpcode::FcmgtV2D => fmt_vbinop(inst, "fcmgt", "2d"), ArmOpcode::FcmgeV4S => fmt_vbinop(inst, "fcmge", "4s"), ArmOpcode::FcmgeV2D => fmt_vbinop(inst, "fcmge", "2d"), ArmOpcode::FcmeqV4S => fmt_vbinop(inst, "fcmeq", "4s"), ArmOpcode::FcmeqV2D => fmt_vbinop(inst, "fcmeq", "2d"), ArmOpcode::CmgtV4S => fmt_vbinop(inst, "cmgt", "4s"), ArmOpcode::CmgeV4S => fmt_vbinop(inst, "cmge", "4s"), ArmOpcode::CmeqV4S => fmt_vbinop(inst, "cmeq", "4s"), ArmOpcode::FmlaV4S => fmt_vbinop(inst, "fmla", "4s"), ArmOpcode::FmlaV2D => fmt_vbinop(inst, "fmla", "2d"), ArmOpcode::FminV4S => fmt_vbinop(inst, "fmin", "4s"), ArmOpcode::FminV2D => fmt_vbinop(inst, "fmin", "2d"), ArmOpcode::FmaxV4S => fmt_vbinop(inst, "fmax", "4s"), ArmOpcode::FmaxV2D => fmt_vbinop(inst, "fmax", "2d"), ArmOpcode::SminV4S => fmt_vbinop(inst, "smin", "4s"), ArmOpcode::SmaxV4S => fmt_vbinop(inst, "smax", "4s"), ArmOpcode::UminV4S => fmt_vbinop(inst, "umin", "4s"), ArmOpcode::UmaxV4S => fmt_vbinop(inst, "umax", "4s"), // afs-as dialect: cross-lane reductions encode the shape in // the mnemonic suffix; the destination is a scalar `s/d` and // the source is the bare vector register. 
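        // Illustrative: `faddp.2s s0, v1` adds v1's two f32 lanes into
        // s0, and `fmaxv.4s s0, v1` reduces the max across four lanes.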
ArmOpcode::FaddpV2S => format!( "faddp.2s {}, {}", fp32_scalar(&inst.operands[0]), v_reg_bare(&inst.operands[1]), ), ArmOpcode::FaddpV2D => format!( "faddp.2d {}, {}", fp64_scalar(&inst.operands[0]), v_reg_bare(&inst.operands[1]), ), ArmOpcode::Faddv4S => format!( "faddv.4s {}, {}", fp32_scalar(&inst.operands[0]), v_reg_bare(&inst.operands[1]), ), ArmOpcode::Sminv4S => format!( "sminv.4s {}, {}", fp32_scalar(&inst.operands[0]), v_reg_bare(&inst.operands[1]), ), ArmOpcode::Smaxv4S => format!( "smaxv.4s {}, {}", fp32_scalar(&inst.operands[0]), v_reg_bare(&inst.operands[1]), ), ArmOpcode::FmaxvV4S => format!( "fmaxv.4s {}, {}", fp32_scalar(&inst.operands[0]), v_reg_bare(&inst.operands[1]), ), ArmOpcode::FminvV4S => format!( "fminv.4s {}, {}", fp32_scalar(&inst.operands[0]), v_reg_bare(&inst.operands[1]), ), ArmOpcode::FmaxpV2DScalar => format!( "fmaxp.2d {}, {}", fp64_scalar(&inst.operands[0]), v_reg_bare(&inst.operands[1]), ), ArmOpcode::FminpV2DScalar => format!( "fminp.2d {}, {}", fp64_scalar(&inst.operands[0]), v_reg_bare(&inst.operands[1]), ), ArmOpcode::Uminv4S => format!( "uminv.4s {}, {}", fp32_scalar(&inst.operands[0]), v_reg_bare(&inst.operands[1]), ), ArmOpcode::Umaxv4S => format!( "umaxv.4s {}, {}", fp32_scalar(&inst.operands[0]), v_reg_bare(&inst.operands[1]), ), ArmOpcode::Addv4S => format!( "addv.4s {}, {}", fp32_scalar(&inst.operands[0]), v_reg_bare(&inst.operands[1]), ), ArmOpcode::DupGen4S => format!( "dup.4s {}, {}", v_reg_bare(&inst.operands[0]), op_str(&inst.operands[1]), ), ArmOpcode::DupGen2D => format!( "dup.2d {}, {}", v_reg_bare(&inst.operands[0]), op_str(&inst.operands[1]), ), ArmOpcode::DupEl4S => format!( "dup.4s {}, {}", v_reg_bare(&inst.operands[0]), v_lane_bare(&inst.operands[1], "s", 0), ), ArmOpcode::DupEl2D => format!( "dup.2d {}, {}", v_reg_bare(&inst.operands[0]), v_lane_bare(&inst.operands[1], "d", 0), ), ArmOpcode::Ins4S => { let lane = imm_u8(&inst.operands[1]); format!( "ins.s {}, {}", v_lane_bare(&inst.operands[0], "s", lane), op_str(&inst.operands[2]), ) } ArmOpcode::Ins2D => { let lane = imm_u8(&inst.operands[1]); format!( "ins.d {}, {}", v_lane_bare(&inst.operands[0], "d", lane), op_str(&inst.operands[2]), ) } ArmOpcode::Umov4S => { let lane = imm_u8(&inst.operands[2]); format!( "umov.s {}, {}", op_str(&inst.operands[0]), v_lane_bare(&inst.operands[1], "s", lane), ) } ArmOpcode::Umov2D => { let lane = imm_u8(&inst.operands[2]); format!( "umov.d {}, {}", op_str(&inst.operands[0]), v_lane_bare(&inst.operands[1], "d", lane), ) } ArmOpcode::FmovEl4S => { let lane = imm_u8(&inst.operands[2]); format!( "mov.s {}, {}", fp32_scalar(&inst.operands[0]), v_lane_bare(&inst.operands[1], "s", lane), ) } ArmOpcode::FmovEl2D => { let lane = imm_u8(&inst.operands[2]); format!( "mov.d {}, {}", fp64_scalar(&inst.operands[0]), v_lane_bare(&inst.operands[1], "d", lane), ) } ArmOpcode::LdrQ => format!( "ldr {}, [{}, {}]", q_reg(&inst.operands[0]), op_str(&inst.operands[1]), op_str(&inst.operands[2]), ), ArmOpcode::StrQ => format!( "str {}, [{}, {}]", q_reg(&inst.operands[0]), op_str(&inst.operands[1]), op_str(&inst.operands[2]), ), } } // ---- NEON formatting helpers ---- fn v_reg(op: &MachineOperand, shape: &str) -> String { match op { MachineOperand::VReg(id) => format!("v{}.{}", id.0, shape), MachineOperand::PhysReg(PhysReg::Fp(n)) | MachineOperand::PhysReg(PhysReg::Fp32(n)) => { format!("v{}.{}", n, shape) } _ => format!("{}.{}", op_str(op), shape), } } fn q_reg(op: &MachineOperand) -> String { match op { MachineOperand::VReg(id) => format!("q{}", id.0), 
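        // q / d / s / v are views of the same physical FP register, so
        // the register number carries over unchanged.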
MachineOperand::PhysReg(PhysReg::Fp(n)) | MachineOperand::PhysReg(PhysReg::Fp32(n)) => { format!("q{}", n) } _ => format!("q{}", op_str(op)), } } fn v_lane(op: &MachineOperand, lane_ty: &str, lane: u8) -> String { match op { MachineOperand::VReg(id) => format!("v{}.{}[{}]", id.0, lane_ty, lane), MachineOperand::PhysReg(PhysReg::Fp(n)) | MachineOperand::PhysReg(PhysReg::Fp32(n)) => { format!("v{}.{}[{}]", n, lane_ty, lane) } _ => format!("v{}.{}[{}]", op_str(op), lane_ty, lane), } } fn fp32_scalar(op: &MachineOperand) -> String { match op { MachineOperand::VReg(id) => format!("s{}", id.0), MachineOperand::PhysReg(PhysReg::Fp(n)) | MachineOperand::PhysReg(PhysReg::Fp32(n)) => { format!("s{}", n) } _ => op_str(op), } } fn fp64_scalar(op: &MachineOperand) -> String { match op { MachineOperand::VReg(id) => format!("d{}", id.0), MachineOperand::PhysReg(PhysReg::Fp(n)) | MachineOperand::PhysReg(PhysReg::Fp32(n)) => { format!("d{}", n) } _ => op_str(op), } } fn imm_u8(op: &MachineOperand) -> u8 { if let MachineOperand::Imm(v) = op { *v as u8 } else { 0 } } fn fmt_vbinop(inst: &MachineInst, mnemonic: &str, shape: &str) -> String { // afs-as dialect: shape suffix is part of the mnemonic, operand // registers are bare (`fadd.4s v0, v1, v2`). Encodes to the same // bytes as the Apple/GNU `fadd v0.4s, v1.4s, v2.4s` form. format!( "{}.{} {}, {}, {}", mnemonic, shape, v_reg_bare(&inst.operands[0]), v_reg_bare(&inst.operands[1]), v_reg_bare(&inst.operands[2]), ) } fn fmt_vunop(inst: &MachineInst, mnemonic: &str, shape: &str) -> String { format!( "{}.{} {}, {}", mnemonic, shape, v_reg_bare(&inst.operands[0]), v_reg_bare(&inst.operands[1]), ) } fn v_reg_bare(op: &MachineOperand) -> String { match op { MachineOperand::VReg(id) => format!("v{}", id.0), MachineOperand::PhysReg(PhysReg::Fp(n)) | MachineOperand::PhysReg(PhysReg::Fp32(n)) => { format!("v{}", n) } _ => op_str(op), } } fn v_lane_bare(op: &MachineOperand, _lane_ty: &str, lane: u8) -> String { // afs-as dialect for `umov.s w3, v0[2]` — bare reg with `[lane]` // suffix; the element-size width is encoded into the mnemonic // (`umov.s` / `umov.d`). match op { MachineOperand::VReg(id) => format!("v{}[{}]", id.0, lane), MachineOperand::PhysReg(PhysReg::Fp(n)) | MachineOperand::PhysReg(PhysReg::Fp32(n)) => { format!("v{}[{}]", n, lane) } _ => format!("{}[{}]", op_str(op), lane), } } /// Format a machine operand as assembly text. 
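/// e.g. `PhysReg::Gp(5)` → `x5`, `PhysReg::Fp32(2)` → `s2`,
/// `Imm(42)` → `#42`, `FrameSlot(-16)` → `[fp, #-16]`.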
fn op_str(op: &MachineOperand) -> String { match op { MachineOperand::VReg(id) => format!("v{}", id.0), // placeholder until regalloc MachineOperand::PhysReg(PhysReg::Sp) => "sp".into(), MachineOperand::PhysReg(PhysReg::Xzr) => "xzr".into(), MachineOperand::PhysReg(PhysReg::Wzr) => "wzr".into(), MachineOperand::PhysReg(PhysReg::Gp(n)) => format!("x{}", n), MachineOperand::PhysReg(PhysReg::Gp32(n)) => format!("w{}", n), MachineOperand::PhysReg(PhysReg::Fp(n)) => format!("d{}", n), MachineOperand::PhysReg(PhysReg::Fp32(n)) => format!("s{}", n), MachineOperand::Imm(v) => format!("#{}", v), MachineOperand::FrameSlot(off) => format!("[fp, #{}]", off), MachineOperand::Cond(c) => cond_str(*c).into(), MachineOperand::BlockRef(id) => format!("bb{}", id.0), MachineOperand::Extern(name) => name.clone(), MachineOperand::GlobalLabel(name) => { if name.starts_with('_') { name.clone() } else { format!("_{}", name) } } MachineOperand::ConstPool(idx) => format!("cp{}", idx), MachineOperand::Shift(s) => format!("lsl #{}", s), } } fn fp_reg_str(op: &MachineOperand, is_f64: bool) -> String { match op { MachineOperand::PhysReg(PhysReg::Fp(n)) | MachineOperand::PhysReg(PhysReg::Fp32(n)) => { if is_f64 { format!("d{}", n) } else { format!("s{}", n) } } _ => op_str(op), } } fn cond_str(c: ArmCond) -> &'static str { match c { ArmCond::Eq => "eq", ArmCond::Ne => "ne", ArmCond::Hs => "hs", ArmCond::Lo => "lo", ArmCond::Mi => "mi", ArmCond::Pl => "pl", ArmCond::Hi => "hi", ArmCond::Ls => "ls", ArmCond::Ge => "ge", ArmCond::Lt => "lt", ArmCond::Gt => "gt", ArmCond::Le => "le", } } /// Generate a constant pool label. fn const_pool_label(func: &str, idx: u32) -> String { format!("__{}_cp{}", func, idx) } #[cfg(test)] mod tests { use super::*; use crate::codegen::isel::select_function; use crate::ir::builder::FuncBuilder; use crate::ir::inst::*; use crate::ir::types::*; fn emit_simple(build: impl FnOnce(&mut FuncBuilder)) -> String { let mut func = Function::new("test".into(), vec![], IrType::Void); { let mut b = FuncBuilder::new(&mut func); build(&mut b); } let mf = select_function(&func); emit_function(&mf) } #[test] fn emit_prologue_epilogue() { let asm = emit_simple(|b| b.ret_void()); assert!( asm.contains("sub sp, sp,"), "missing frame allocation: {}", asm ); assert!( asm.contains("stp x29, x30, [sp,"), "missing prologue save: {}", asm ); assert!( asm.contains("ldp x29, x30, [sp,"), "missing epilogue restore: {}", asm ); assert!( asm.contains("add sp, sp,"), "missing frame deallocation: {}", asm ); assert!(asm.contains("ret"), "missing ret: {}", asm); } #[test] fn emit_integer_add() { let asm = emit_simple(|b| { let x = b.const_i32(10); let y = b.const_i32(20); let _z = b.iadd(x, y); b.ret_void(); }); assert!(asm.contains("add "), "missing add: {}", asm); } #[test] fn emit_function_label() { let asm = emit_simple(|b| b.ret_void()); assert!(asm.contains(".globl _test"), "missing .globl: {}", asm); assert!(asm.contains("_test:"), "missing function label: {}", asm); } /// Verify that functions with frame sizes > 4095 use x16 scratch /// synthesis for the `sub sp, sp, #N` prologue and `add sp, sp, #N` /// epilogue rather than an out-of-range immediate. #[test] fn emit_large_frame_prologue() { // 700 allocas of i64 = 700 * 8 = 5600 bytes, well over 4095. let asm = emit_simple(|b| { for _ in 0..700 { let _ = b.alloca(IrType::Int(IntWidth::I64)); } b.ret_void(); }); // The 12-bit immediate max is 4095, so the emitter must // synthesize the frame size via x16. 
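        // (700 × 8 = 5600 bytes of locals; even with the FP/LR pair the
        // frame stays well under 2^16, so a single movz with no movk
        // suffices.)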
assert!( asm.contains("movz x16,"), "large frame should use x16 synthesis: {}", asm ); assert!( asm.contains("sub sp, sp, x16"), "large frame sub should use register form: {}", asm ); assert!( asm.contains("add sp, sp, x16"), "large frame add should use register form: {}", asm ); // Must NOT contain a raw "sub sp, sp, #5" that exceeds 4095. assert!( !asm.contains("sub sp, sp, #5"), "should not emit out-of-range immediate: {}", asm ); } #[test] fn emit_huge_frame_with_stack_probes() { let asm = emit_simple(|b| { for _ in 0..3000 { let _ = b.alloca(IrType::Int(IntWidth::I64)); } b.ret_void(); }); assert!( asm.contains("str xzr, [sp]"), "huge frame should probe each chunk: {}", asm ); } #[test] fn emit_branch() { let asm = emit_simple(|b| { let cond = b.const_bool(true); let bb_t = b.create_block("then"); let bb_f = b.create_block("else"); b.cond_branch(cond, bb_t, vec![], bb_f, vec![]); b.set_block(bb_t); b.ret_void(); b.set_block(bb_f); b.ret_void(); }); assert!(asm.contains("b.ne"), "missing conditional branch: {}", asm); assert!(asm.contains("then_"), "missing then label: {}", asm); assert!(asm.contains("else_"), "missing else label: {}", asm); } #[test] fn emit_i128_scalar_global_as_two_quads() { let asm = emit_globals(&[Global { name: "big".into(), ty: IrType::Int(IntWidth::I128), initializer: Some(GlobalInit::Int(18_446_744_073_709_551_616i128)), }]); assert!( asm.contains(".section __DATA,__data"), "missing data section:\n{}", asm ); assert!( asm.contains(".private_extern _big"), "missing global symbol:\n{}", asm ); assert!( asm.contains(".p2align 4"), "i128 globals need 16-byte alignment:\n{}", asm ); assert_eq!( asm.matches(".quad").count(), 2, "scalar i128 should emit two quads:\n{}", asm ); assert!( asm.contains(".quad 0x0000000000000000\n .quad 0x0000000000000001"), "scalar i128 should emit low/high 64-bit words in memory order:\n{}", asm ); } #[test] fn emit_i128_array_global_as_word_pairs() { let asm = emit_globals(&[Global { name: "arr".into(), ty: IrType::Array(Box::new(IrType::Int(IntWidth::I128)), 2), initializer: Some(GlobalInit::IntArray(vec![1, -1])), }]); assert_eq!( asm.matches(".quad").count(), 4, "two i128 elements should emit four quads:\n{}", asm ); assert!( asm.contains(".quad 0x0000000000000001\n .quad 0x0000000000000000"), "positive i128 array element should preserve low/high word order:\n{}", asm ); assert!( asm.contains(".quad 0xffffffffffffffff\n .quad 0xffffffffffffffff"), "negative i128 array element should preserve two's-complement words:\n{}", asm ); } #[test] fn emit_byte_array_global_uses_natural_alignment() { let asm = emit_globals(&[Global { name: "history".into(), ty: IrType::Array(Box::new(IrType::Int(IntWidth::I8)), 400), initializer: Some(GlobalInit::Zero), }]); assert!( asm.contains(".p2align 3\n_history:"), "byte-array globals that model descriptors/derived storage need 8-byte alignment:\n{}", asm ); } #[test] fn emit_nested_byte_array_global_uses_full_storage_size() { let asm = emit_globals(&[Global { name: "command_cache".into(), ty: IrType::Array( Box::new(IrType::Array(Box::new(IrType::Int(IntWidth::I8)), 264)), 4, ), initializer: Some(GlobalInit::Zero), }]); assert!( asm.contains("_command_cache:\n .space 1056"), "nested byte-array globals should reserve their full storage size:\n{}", asm ); } #[test] fn emit_mov_reg_truncates_x_source_through_w_view() { let mf = MachineFunction::new("test".into()); let inst = MachineInst { opcode: ArmOpcode::MovReg, operands: vec![ MachineOperand::PhysReg(PhysReg::Gp32(21)), 
                MachineOperand::PhysReg(PhysReg::Gp(20)),
            ],
            def: None,
        };
        assert_eq!(emit_inst(&inst, &mf), "mov w21, w20");
    }

    #[test]
    fn emit_fcvt_uses_fp_register_widths() {
        let mf = MachineFunction::new("test".into());
        let to_single = MachineInst {
            opcode: ArmOpcode::FcvtSD,
            operands: vec![
                MachineOperand::PhysReg(PhysReg::Fp(0)),
                MachineOperand::PhysReg(PhysReg::Fp(1)),
            ],
            def: None,
        };
        let to_double = MachineInst {
            opcode: ArmOpcode::FcvtDS,
            operands: vec![
                MachineOperand::PhysReg(PhysReg::Fp32(2)),
                MachineOperand::PhysReg(PhysReg::Fp32(3)),
            ],
            def: None,
        };
        assert_eq!(emit_inst(&to_single, &mf), "fcvt s0, d1");
        assert_eq!(emit_inst(&to_double, &mf), "fcvt d2, s3");
    }

    #[test]
    fn emit_large_negative_pair_offsets_use_scratch_addressing() {
        let mf = MachineFunction::new("test".into());
        let stp = MachineInst {
            opcode: ArmOpcode::StpOffset,
            operands: vec![
                MachineOperand::PhysReg(PhysReg::Gp(0)),
                MachineOperand::PhysReg(PhysReg::Gp(1)),
                MachineOperand::PhysReg(PhysReg::FP),
                MachineOperand::Imm(-544),
            ],
            def: None,
        };
        let ldp = MachineInst {
            opcode: ArmOpcode::LdpOffset,
            operands: vec![
                MachineOperand::PhysReg(PhysReg::Gp(2)),
                MachineOperand::PhysReg(PhysReg::Gp(3)),
                MachineOperand::PhysReg(PhysReg::FP),
                MachineOperand::Imm(-544),
            ],
            def: None,
        };
        let stp_asm = emit_inst(&stp, &mf);
        let ldp_asm = emit_inst(&ldp, &mf);
        assert!(
            stp_asm.contains("sub x9, x29, #544"),
            "large negative stp offset should synthesize address: {}",
            stp_asm
        );
        assert!(
            ldp_asm.contains("sub x9, x29, #544"),
            "large negative ldp offset should synthesize address: {}",
            ldp_asm
        );
        assert!(
            !stp_asm.contains("[x29, #-544]"),
            "stp should not emit out-of-range raw offset: {}",
            stp_asm
        );
        assert!(
            !ldp_asm.contains("[x29, #-544]"),
            "ldp should not emit out-of-range raw offset: {}",
            ldp_asm
        );
    }

    #[test]
    fn emit_internal_only_function_as_private_extern() {
        let mut mf = MachineFunction::new("helper".into());
        mf.internal_only = true;
        let asm = emit_function(&mf);
        assert!(
            asm.contains(".private_extern _helper"),
            "internal-only functions should not be emitted as globals:\n{}",
            asm
        );
        assert!(
            !asm.contains(".globl _helper"),
            "internal-only functions should not keep external linkage:\n{}",
            asm
        );
    }

    // ---- NEON SIMD emit smoke tests (Sprint 12 Stage 2) ----
    //
    // The vectorizer doesn't generate any of these yet, but the emit
    // formatters can be exercised directly by hand-building a
    // MachineInst and feeding it through `emit_inst`. These tests
    // pin the assembly text form so future codegen wiring has a
    // golden reference.

    use crate::codegen::mir::{ArmOpcode, MachineFunction, MachineInst, MachineOperand, RegClass};

    fn emit_one(opcode: ArmOpcode, operands: Vec<MachineOperand>) -> String {
        let mut mf = MachineFunction::new("t".into());
        mf.new_block("entry");
        let inst = MachineInst {
            opcode,
            operands,
            def: None,
        };
        emit_inst(&inst, &mf)
    }

    #[test]
    fn emit_fadd_v_4s_form() {
        let mut mf = MachineFunction::new("t".into());
        let v0 = mf.new_vreg(RegClass::V128);
        let v1 = mf.new_vreg(RegClass::V128);
        let v2 = mf.new_vreg(RegClass::V128);
        let asm = emit_one(
            ArmOpcode::FaddV4S,
            vec![
                MachineOperand::VReg(v0),
                MachineOperand::VReg(v1),
                MachineOperand::VReg(v2),
            ],
        );
        let _ = mf;
        // afs-as dialect: shape suffix on mnemonic, bare regs.
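        // Encodes to the same bytes as the Apple/GNU spelling
        // `fadd v0.4s, v1.4s, v2.4s` (see fmt_vbinop).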
assert_eq!(asm, "fadd.4s v0, v1, v2"); } #[test] fn emit_fadd_v_2d_form() { let asm = emit_one( ArmOpcode::FaddV2D, vec![ MachineOperand::VReg(crate::codegen::mir::VRegId(0)), MachineOperand::VReg(crate::codegen::mir::VRegId(1)), MachineOperand::VReg(crate::codegen::mir::VRegId(2)), ], ); assert_eq!(asm, "fadd.2d v0, v1, v2"); } #[test] fn emit_fmla_v_4s_form() { let asm = emit_one( ArmOpcode::FmlaV4S, vec![ MachineOperand::VReg(crate::codegen::mir::VRegId(0)), MachineOperand::VReg(crate::codegen::mir::VRegId(1)), MachineOperand::VReg(crate::codegen::mir::VRegId(2)), ], ); assert_eq!(asm, "fmla.4s v0, v1, v2"); } #[test] fn emit_addv_4s_reduction_form() { let asm = emit_one( ArmOpcode::Addv4S, vec![ MachineOperand::VReg(crate::codegen::mir::VRegId(0)), MachineOperand::VReg(crate::codegen::mir::VRegId(1)), ], ); assert_eq!(asm, "addv.4s s0, v1"); } #[test] fn emit_dup_gen_4s_broadcasts_w_register() { let asm = emit_one( ArmOpcode::DupGen4S, vec![ MachineOperand::VReg(crate::codegen::mir::VRegId(0)), MachineOperand::PhysReg(crate::codegen::mir::PhysReg::Gp32(2)), ], ); assert_eq!(asm, "dup.4s v0, w2"); } #[test] fn emit_dup_el_4s_broadcasts_fp_lane_zero() { // Splatting an Fp32 scalar (which lives in v2's lane 0) into // a 4×f32 vector uses the lane-dup form. The gp form // `dup.4s v0, s2` is rejected by the assembler. afs-as // dialect: bare `vN[L]` (no `.s` suffix), with the lane // element width encoded into the `dup.4s` mnemonic. let asm = emit_one( ArmOpcode::DupEl4S, vec![ MachineOperand::VReg(crate::codegen::mir::VRegId(0)), MachineOperand::VReg(crate::codegen::mir::VRegId(2)), ], ); assert_eq!(asm, "dup.4s v0, v2[0]"); } #[test] fn emit_dup_el_2d_broadcasts_fp_lane_zero() { let asm = emit_one( ArmOpcode::DupEl2D, vec![ MachineOperand::VReg(crate::codegen::mir::VRegId(0)), MachineOperand::VReg(crate::codegen::mir::VRegId(2)), ], ); assert_eq!(asm, "dup.2d v0, v2[0]"); } #[test] fn emit_ldr_q_form() { let asm = emit_one( ArmOpcode::LdrQ, vec![ MachineOperand::VReg(crate::codegen::mir::VRegId(0)), MachineOperand::PhysReg(crate::codegen::mir::PhysReg::Gp(1)), MachineOperand::Imm(16), ], ); assert_eq!(asm, "ldr q0, [x1, #16]"); } #[test] fn emit_str_q_form() { let asm = emit_one( ArmOpcode::StrQ, vec![ MachineOperand::VReg(crate::codegen::mir::VRegId(0)), MachineOperand::PhysReg(crate::codegen::mir::PhysReg::Gp(1)), MachineOperand::Imm(0), ], ); assert_eq!(asm, "str q0, [x1, #0]"); } #[test] fn emit_umov_extracts_lane() { let asm = emit_one( ArmOpcode::Umov4S, vec![ MachineOperand::PhysReg(crate::codegen::mir::PhysReg::Gp32(3)), MachineOperand::VReg(crate::codegen::mir::VRegId(0)), MachineOperand::Imm(2), ], ); assert_eq!(asm, "umov.s w3, v0[2]"); } }
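
// ---- Editor's addition: non-finite float global smoke test ----
//
// A minimal sketch assuming the `Global` / `GlobalInit` shapes used by
// the tests above; the symbol name `huge` is illustrative. It pins the
// audit Med-5 path: ±Inf must be emitted as an exact bit pattern via
// `.quad`, never as a decimal `.double`.
#[cfg(test)]
mod nonfinite_global_tests {
    use super::*;
    use crate::ir::inst::{Global, GlobalInit};
    use crate::ir::types::{FloatWidth, IrType};

    #[test]
    fn emit_infinite_f64_global_as_bit_pattern_quad() {
        let asm = emit_globals(&[Global {
            name: "huge".into(),
            ty: IrType::Float(FloatWidth::F64),
            initializer: Some(GlobalInit::Float(f64::INFINITY)),
        }]);
        // IEEE-754 +Inf for f64 is exactly 0x7ff0000000000000.
        assert!(
            asm.contains(".quad 0x7ff0000000000000"),
            "non-finite float should emit its bit pattern:\n{}",
            asm
        );
        assert!(
            !asm.contains(".double"),
            "non-finite float must not use a decimal float directive:\n{}",
            asm
        );
    }
}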