//! ARM64 conditional-branch range relaxation.
//!
//! `B.cond` carries a 19-bit signed PC-relative immediate, so its
//! target must lie within ±1MB of the branch. Single-function bodies
//! that exceed that span (notably `stdlib_slaruv`, ~363KB of compiled
//! assembly with cross-body conditional jumps) trip the assembler's
//! "fixup value out of range" error.
//!
//! This pass runs after register allocation and tail-call optimization,
//! before assembly emission. It walks the function's blocks in linear
//! order, computes an approximate byte offset for every block label,
//! and for each `B.cond far_label` whose distance exceeds the
//! conditional-branch encoding window expands it to the trampoline
//! pattern:
//!
//! ```text
//! Before:                After (when target is too far):
//!   b.cond far_label       b.{!cond} skip   ; one branch over
//!                          b far_label      ; ±128MB unconditional
//!                        skip:              ; falls through
//! ```
//!
//! Inserting an extra block + branch shifts subsequent block offsets,
//! which can push other previously-in-range branches out of range,
//! so we iterate the pass until no further insertions are needed
//! (capped at four iterations — non-convergence triggers a panic that
//! survives release builds and produces a clear ICE for diagnostics).
//!
//! ARM64 unconditional `B` and `BL` carry a 26-bit immediate (±128MB
//! range), so the trampoline's unconditional `B` itself never needs
//! relaxation — no realistic single Fortran source produces a
//! function body larger than 128MB.
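//!
//! For reference, the arithmetic behind those ranges: branch immediates
//! count words, not bytes, because every ARM64 instruction is 4 bytes
//! and word-aligned. So a 19-bit signed immediate spans ±2^18 words =
//! ±1MB, a 14-bit one (`tbz`/`tbnz`) spans ±2^13 words = ±32KB, and a
//! 26-bit one (`b`/`bl`) spans ±2^25 words = ±128MB.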

use super::emit::emit_inst_text;
use super::mir::{
    ArmCond, ArmOpcode, MBlockId, MachineBlock, MachineFunction, MachineInst, MachineOperand,
};

/// Maximum signed offset (in bytes) reachable by a `B.cond` and by
/// `cbz/cbnz`. The 19-bit signed immediate is scaled by 4, giving
/// ±(2^20) bytes. Subtract a safety margin so we never sit right on
/// the edge: a downstream peephole insertion or fall-through fixup
/// that nudges the offset by a single instruction shouldn't push us
/// over.
const COND_BRANCH_LIMIT: i64 = (1 << 20) - 64;

/// Maximum signed offset reachable by `tbz`/`tbnz`. The 14-bit
/// signed immediate is scaled by 4, giving ±(2^15) bytes — much
/// tighter than the cond-branch / cbz limit. Same safety margin.
const TBZ_BRANCH_LIMIT: i64 = (1 << 15) - 64;
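
// Compile-time sanity guards (a cheap illustrative invariant check):
// the fudged limits must stay strictly inside the hardware windows.
const _: () = assert!(COND_BRANCH_LIMIT > 0 && COND_BRANCH_LIMIT < (1 << 20));
const _: () = assert!(TBZ_BRANCH_LIMIT > 0 && TBZ_BRANCH_LIMIT < (1 << 15));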

/// Iteration cap. In practice 1–2 passes suffice; if convergence
/// genuinely doesn't happen, something pathological is going on and
/// we want a loud failure rather than a silent infinite loop.
const MAX_ITERATIONS: u32 = 4;

/// Run branch relaxation on a machine function. Idempotent: when no
/// `B.cond` overflows, the function is returned unchanged.
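///
/// A sketch of the intended call site (the surrounding pipeline here is
/// illustrative, not part of this module):
///
/// ```ignore
/// let mut mf: MachineFunction = /* lowered + register-allocated */;
/// relax_branches(&mut mf);
/// // Every conditional branch is now encodable; hand off to emission.
/// ```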
pub fn relax_branches(mf: &mut MachineFunction) {
    for _ in 0..MAX_ITERATIONS {
        if !relax_once(mf) {
            return;
        }
    }
    // Convergence failure — extremely unlikely. Better to ICE here
    // than emit an assembly file the assembler will reject.
    panic!(
        "branch relaxation did not converge for function {} after {} iterations",
        mf.name, MAX_ITERATIONS
    );
}

/// Single relaxation pass. Returns `true` when at least one branch
/// was expanded (caller should iterate); `false` when every `B.cond`
/// is in range.
fn relax_once(mf: &mut MachineFunction) -> bool {
    // Compute byte offsets for every block label using the actual
    // emit-time instruction count. Several MIR opcodes lower to
    // multiple ARM64 instructions (AdrpLdr / AdrpAdd, prologue +
    // stack-probe sequences, large-immediate movz/movk chains, etc.),
    // so a flat 4-bytes-per-MachineInst estimate would systematically
    // undershoot the real offsets in functions like stdlib_slaruv
    // and miss far branches that need relaxation.
    let block_offsets = compute_block_offsets(mf);

    // Collect overflow sites before mutating anything. We also track a
    // running per-instruction prefix offset within each block, so the
    // branch's own offset accounts for any wide-emit insts that precede
    // it.
    let mut overflows: Vec<OverflowSite> = Vec::new();
    for block in &mf.blocks {
        let block_offset = block_offsets[&block.id];
        let mut running = 0i64;
        for (inst_idx, inst) in block.insts.iter().enumerate() {
            if let Some((target, limit, kind)) = relaxable_cond_branch(inst) {
                if let Some(&target_offset) = block_offsets.get(&target) {
                    let branch_offset = block_offset + running;
                    let delta = target_offset - branch_offset;
                    if delta.abs() > limit {
                        overflows.push(OverflowSite {
                            block_id: block.id,
                            inst_idx,
                            target,
                            kind,
                        });
                    }
                }
            }
            running += inst_emit_bytes(inst, mf) as i64;
        }
    }

    if overflows.is_empty() {
        return false;
    }

    // Expand each overflow, working from the end of the function
    // backwards: sort in descending (block position, inst_idx) order so
    // each expansion splices instructions and inserts its skip block at
    // positions after all still-pending sites, leaving their recorded
    // indices valid.
    overflows.sort_by(|a, b| {
        let a_pos = block_position(mf, a.block_id);
        let b_pos = block_position(mf, b.block_id);
        b_pos.cmp(&a_pos).then(b.inst_idx.cmp(&a.inst_idx))
    });
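    // Example: sites recorded at (block 3, inst 7) and (block 3, inst 2)
    // are expanded in that order, so draining the tail after inst 7
    // cannot shift the still-pending index 2.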

    for site in overflows {
        expand_to_trampoline(mf, site);
    }

    true
}

#[derive(Clone)]
struct OverflowSite {
    block_id: MBlockId,
    inst_idx: usize,
    target: MBlockId,
    kind: RelaxKind,
}

#[derive(Clone)]
enum RelaxKind {
    /// `b.cond far` → `b.{!cond} skip; b far`
    BCond { cond: ArmCond },
    /// `cbz/cbnz reg far` → `cbnz/cbz reg skip; b far`
    Cbz { invert_to: ArmOpcode, reg: MachineOperand },
    /// `tbz/tbnz reg, #bit far` → `tbnz/tbz reg, #bit skip; b far`
    Tbz {
        invert_to: ArmOpcode,
        reg: MachineOperand,
        bit: i64,
    },
}

/// Inspect a machine instruction; if it is a conditional branch that
/// might need relaxation, return the target, the per-opcode range
/// limit, and a kind descriptor capturing what the inverted form
/// looks like. Returns `None` for non-branch instructions or branches
/// whose targets are unrecoverable from operands.
fn relaxable_cond_branch(inst: &MachineInst) -> Option<(MBlockId, i64, RelaxKind)> {
    match inst.opcode {
        ArmOpcode::BCond => {
            let target = bcond_target(inst)?;
            let cond = bcond_cond(inst)?;
            Some((target, COND_BRANCH_LIMIT, RelaxKind::BCond { cond }))
        }
        ArmOpcode::Cbz | ArmOpcode::Cbnz => {
            // Operands: [reg, BlockRef]
            let target = match inst.operands.get(1)? {
                MachineOperand::BlockRef(id) => *id,
                _ => return None,
            };
            let reg = inst.operands.first()?.clone();
            let invert_to = match inst.opcode {
                ArmOpcode::Cbz => ArmOpcode::Cbnz,
                _ => ArmOpcode::Cbz,
            };
            Some((target, COND_BRANCH_LIMIT, RelaxKind::Cbz { invert_to, reg }))
        }
        ArmOpcode::Tbz | ArmOpcode::Tbnz => {
            // Operands: [reg, Imm(bit), BlockRef]
            let bit = match inst.operands.get(1)? {
                MachineOperand::Imm(v) => *v,
                _ => return None,
            };
            let target = match inst.operands.get(2)? {
                MachineOperand::BlockRef(id) => *id,
                _ => return None,
            };
            let reg = inst.operands.first()?.clone();
            let invert_to = match inst.opcode {
                ArmOpcode::Tbz => ArmOpcode::Tbnz,
                _ => ArmOpcode::Tbz,
            };
            Some((
                target,
                TBZ_BRANCH_LIMIT,
                RelaxKind::Tbz {
                    invert_to,
                    reg,
                    bit,
                },
            ))
        }
        _ => None,
    }
}

/// Compute byte offsets for every block label, summing the actual
/// emit-time instruction byte count for every MachineInst in linear
/// order. Each emitted ARM64 instruction is 4 bytes; we count the
/// `\n` separators in the emitted text to figure out how many real
/// instructions an opcode produces (most are 1, but pseudo-ops and
/// large-immediate forms emit 2+).
fn compute_block_offsets(mf: &MachineFunction) -> std::collections::HashMap<MBlockId, i64> {
    let mut offsets = std::collections::HashMap::with_capacity(mf.blocks.len());
    let mut running: i64 = 0;
    for block in &mf.blocks {
        offsets.insert(block.id, running);
        for inst in &block.insts {
            running += inst_emit_bytes(inst, mf) as i64;
        }
    }
    offsets
}

/// Number of bytes a single MachineInst emits at assembly time.
/// Counted from the rendered text — `emit_inst_text` already produces
/// exactly the lines `emit_function` would, so newline-count + 1
/// matches the real instruction count without re-deriving each
/// opcode's expansion rules here.
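/// For example, a plain `nop` renders as one line and yields 4, while a
/// pseudo like `AdrpLdr`, which presumably renders as an `adrp` line
/// plus an `ldr` line, yields 8.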
fn inst_emit_bytes(inst: &MachineInst, mf: &MachineFunction) -> u32 {
    let text = emit_inst_text(inst, mf);
    let lines = text.matches('\n').count() as u32 + 1;
    4 * lines
}

fn block_position(mf: &MachineFunction, id: MBlockId) -> usize {
    mf.blocks
        .iter()
        .position(|b| b.id == id)
        .expect("block id present in block list")
}

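// `BCond` operands are built as `[Cond, BlockRef]` (see the inverted
// branch in `expand_to_trampoline`), but these helpers scan the operand
// list rather than index into it, so they tolerate either order.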
fn bcond_target(inst: &MachineInst) -> Option<MBlockId> {
    inst.operands.iter().find_map(|op| match op {
        MachineOperand::BlockRef(id) => Some(*id),
        _ => None,
    })
}

fn bcond_cond(inst: &MachineInst) -> Option<ArmCond> {
    inst.operands.iter().find_map(|op| match op {
        MachineOperand::Cond(c) => Some(*c),
        _ => None,
    })
}

/// Replace an out-of-range conditional branch at `(block_id, inst_idx)`
/// with the trampoline pattern. Allocates a fresh skip block and
/// inserts it right after the original block in `mf.blocks`. The
/// inverted-condition branch jumps over the unconditional `b`, which
/// has the full ±128MB reach of the unconditional encoding — so the
/// trampoline itself never needs further relaxation regardless of
/// which conditional opcode we started from.
fn expand_to_trampoline(mf: &mut MachineFunction, site: OverflowSite) {
    let OverflowSite {
        block_id,
        inst_idx,
        target: far_target,
        kind,
    } = site;

    // Allocate a fresh skip-block id without disturbing any other
    // bookkeeping. We can't call `mf.new_block` directly because it
    // pushes onto the back of the vec; we want the skip block
    // physically adjacent to the original block, so control falls
    // through its label into whatever originally followed.
    let skip_id = MBlockId(mf.next_block_id());
    let skip_label = format!("{}_relax{}", mf.block(block_id).label, skip_id.0);

    let inverted = match kind {
        RelaxKind::BCond { cond } => MachineInst {
            opcode: ArmOpcode::BCond,
            operands: vec![
                MachineOperand::Cond(cond.inverse()),
                MachineOperand::BlockRef(skip_id),
            ],
            def: None,
        },
        RelaxKind::Cbz { invert_to, reg } => MachineInst {
            opcode: invert_to,
            operands: vec![reg, MachineOperand::BlockRef(skip_id)],
            def: None,
        },
        RelaxKind::Tbz {
            invert_to,
            reg,
            bit,
        } => MachineInst {
            opcode: invert_to,
            operands: vec![
                reg,
                MachineOperand::Imm(bit),
                MachineOperand::BlockRef(skip_id),
            ],
            def: None,
        },
    };
    let unconditional_b = MachineInst {
        opcode: ArmOpcode::B,
        operands: vec![MachineOperand::BlockRef(far_target)],
        def: None,
    };

    // Splice them in, preserving any instructions that followed the
    // original branch. Those trailing instructions (uncommon) move
    // into the skip block to keep program order.
    let block_pos = block_position(mf, block_id);
    let trailing = {
        let block = &mut mf.blocks[block_pos];
        let trailing: Vec<MachineInst> = block.insts.drain(inst_idx + 1..).collect();
        block.insts[inst_idx] = inverted;
        block.insts.push(unconditional_b);
        trailing
    };

    let mut skip_block = MachineBlock::new(skip_id, skip_label);
    skip_block.insts = trailing;
    mf.blocks.insert(block_pos + 1, skip_block);
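    // Resulting layout, e.g. for a block `bb3` that ended with
    // `b.eq far; <trailing>`:
    //   bb3:           ...; b.ne bb3_relax<id>; b far
    //   bb3_relax<id>: <trailing>  (falls through to what followed bb3)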
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Checks that every ArmCond's inverse is involutive (applying it
    /// twice round-trips). Easy to typo otherwise.
    #[test]
    fn arm_cond_inverse_is_involutive() {
        for c in [
            ArmCond::Eq,
            ArmCond::Ne,
            ArmCond::Hs,
            ArmCond::Lo,
            ArmCond::Mi,
            ArmCond::Pl,
            ArmCond::Hi,
            ArmCond::Ls,
            ArmCond::Ge,
            ArmCond::Lt,
            ArmCond::Gt,
            ArmCond::Le,
        ] {
            assert_eq!(c.inverse().inverse(), c, "{:?}", c);
        }
    }

    /// In-range branches stay as a single instruction.
    #[test]
    fn in_range_bcond_unchanged() {
        let mut mf = MachineFunction::new("test".into());
        let target = mf.new_block("after");
        mf.blocks[0].insts.push(MachineInst {
            opcode: ArmOpcode::BCond,
            operands: vec![
                MachineOperand::Cond(ArmCond::Ne),
                MachineOperand::BlockRef(target),
            ],
            def: None,
        });

        let block_count_before = mf.blocks.len();
        relax_branches(&mut mf);
        assert_eq!(mf.blocks.len(), block_count_before);
        assert_eq!(mf.blocks[0].insts.len(), 1);
        assert_eq!(mf.blocks[0].insts[0].opcode, ArmOpcode::BCond);
    }

    /// Out-of-range branch expands to a trampoline. We force
    /// overflow by stuffing one block with enough nops to push the
    /// target's offset past ±1MB.
    #[test]
    fn out_of_range_bcond_expands() {
        let mut mf = MachineFunction::new("test".into());
        let target = mf.new_block("after_padding");
        let padding = mf.new_block("padding");

        // Entry: BCond Ne -> target.
        mf.blocks[0].insts.push(MachineInst {
            opcode: ArmOpcode::BCond,
            operands: vec![
                MachineOperand::Cond(ArmCond::Ne),
                MachineOperand::BlockRef(target),
            ],
            def: None,
        });
        // Stuff the padding block with enough Nops to push the
        // `after_padding` label past ±1MB.
        let pad_inst_count = (COND_BRANCH_LIMIT as usize) / 4 + 16;
        let padding_pos = mf.blocks.iter().position(|b| b.id == padding).unwrap();
        mf.blocks[padding_pos].insts = (0..pad_inst_count)
            .map(|_| MachineInst {
                opcode: ArmOpcode::Nop,
                operands: vec![],
                def: None,
            })
            .collect();
        // Layout order: entry → padding → target. The padding block
        // pushes target's offset out of range from the entry's BCond.
        mf.blocks.swap(1, 2); // ensure entry, padding, target order

        let blocks_before = mf.blocks.len();
        relax_branches(&mut mf);
        // One new skip block was inserted right after entry.
        assert_eq!(mf.blocks.len(), blocks_before + 1);
        // Entry now ends with: BCond(inverted) -> skip; B target.
        let entry = &mf.blocks[0];
        assert_eq!(entry.insts.len(), 2);
        assert_eq!(entry.insts[0].opcode, ArmOpcode::BCond);
        assert!(matches!(
            entry.insts[0].operands[0],
            MachineOperand::Cond(ArmCond::Eq) // inverse of Ne
        ));
        assert_eq!(entry.insts[1].opcode, ArmOpcode::B);
        assert!(matches!(
            entry.insts[1].operands[0],
            MachineOperand::BlockRef(t) if t == target
        ));
    }

    /// In-range cbz stays as a single instruction.
    #[test]
    fn in_range_cbz_unchanged() {
        let mut mf = MachineFunction::new("test".into());
        let target = mf.new_block("after");
        let test_reg = mf.new_vreg(crate::codegen::mir::RegClass::Gp32);
        mf.blocks[0].insts.push(MachineInst {
            opcode: ArmOpcode::Cbz,
            operands: vec![
                MachineOperand::VReg(test_reg),
                MachineOperand::BlockRef(target),
            ],
            def: None,
        });

        let block_count_before = mf.blocks.len();
        relax_branches(&mut mf);
        assert_eq!(mf.blocks.len(), block_count_before);
        assert_eq!(mf.blocks[0].insts.len(), 1);
        assert_eq!(mf.blocks[0].insts[0].opcode, ArmOpcode::Cbz);
    }

    /// Out-of-range cbz expands to: cbnz reg, skip; b far. Padding
    /// pushes the target past ±1MB so the original cbz can no longer
    /// reach it directly. The inverted opcode (Cbnz) jumps over the
    /// unconditional `b`, which has the full ±128MB reach and never
    /// itself needs further relaxation.
    #[test]
    fn out_of_range_cbz_expands_to_inverted_skip() {
        let mut mf = MachineFunction::new("test".into());
        let target = mf.new_block("after_padding");
        let padding = mf.new_block("padding");
        let test_reg = mf.new_vreg(crate::codegen::mir::RegClass::Gp32);

        mf.blocks[0].insts.push(MachineInst {
            opcode: ArmOpcode::Cbz,
            operands: vec![
                MachineOperand::VReg(test_reg),
                MachineOperand::BlockRef(target),
            ],
            def: None,
        });
        let pad_inst_count = (COND_BRANCH_LIMIT as usize) / 4 + 16;
        let padding_pos = mf.blocks.iter().position(|b| b.id == padding).unwrap();
        mf.blocks[padding_pos].insts = (0..pad_inst_count)
            .map(|_| MachineInst {
                opcode: ArmOpcode::Nop,
                operands: vec![],
                def: None,
            })
            .collect();
        mf.blocks.swap(1, 2);

        let blocks_before = mf.blocks.len();
        relax_branches(&mut mf);
        assert_eq!(mf.blocks.len(), blocks_before + 1);
        let entry = &mf.blocks[0];
        assert_eq!(entry.insts.len(), 2);
        assert_eq!(entry.insts[0].opcode, ArmOpcode::Cbnz, "cbz inverts to cbnz");
        assert_eq!(entry.insts[1].opcode, ArmOpcode::B);
        assert!(matches!(
            entry.insts[1].operands[0],
            MachineOperand::BlockRef(t) if t == target
        ));
    }

    /// Out-of-range cbnz inverts to cbz on the skip path.
    #[test]
    fn out_of_range_cbnz_expands_to_cbz_skip() {
        let mut mf = MachineFunction::new("test".into());
        let target = mf.new_block("after_padding");
        let padding = mf.new_block("padding");
        let test_reg = mf.new_vreg(crate::codegen::mir::RegClass::Gp64);

        mf.blocks[0].insts.push(MachineInst {
            opcode: ArmOpcode::Cbnz,
            operands: vec![
                MachineOperand::VReg(test_reg),
                MachineOperand::BlockRef(target),
            ],
            def: None,
        });
        let pad_inst_count = (COND_BRANCH_LIMIT as usize) / 4 + 16;
        let padding_pos = mf.blocks.iter().position(|b| b.id == padding).unwrap();
        mf.blocks[padding_pos].insts = (0..pad_inst_count)
            .map(|_| MachineInst {
                opcode: ArmOpcode::Nop,
                operands: vec![],
                def: None,
            })
            .collect();
        mf.blocks.swap(1, 2);

        relax_branches(&mut mf);
        let entry = &mf.blocks[0];
        assert_eq!(entry.insts[0].opcode, ArmOpcode::Cbz, "cbnz inverts to cbz");
    }

    /// In-range tbz stays as a single instruction.
    #[test]
    fn in_range_tbz_unchanged() {
        let mut mf = MachineFunction::new("test".into());
        let target = mf.new_block("after");
        let test_reg = mf.new_vreg(crate::codegen::mir::RegClass::Gp64);
        mf.blocks[0].insts.push(MachineInst {
            opcode: ArmOpcode::Tbz,
            operands: vec![
                MachineOperand::VReg(test_reg),
                MachineOperand::Imm(5),
                MachineOperand::BlockRef(target),
            ],
            def: None,
        });

        let block_count_before = mf.blocks.len();
        relax_branches(&mut mf);
        assert_eq!(mf.blocks.len(), block_count_before);
        assert_eq!(mf.blocks[0].insts.len(), 1);
        assert_eq!(mf.blocks[0].insts[0].opcode, ArmOpcode::Tbz);
    }

    /// Out-of-range tbz uses the tighter ±32KB limit and expands. Tbz
    /// and tbnz are the most range-restricted of the conditional
    /// branches we emit, so anything past 32K-ish bytes from the
    /// branch must trip relaxation.
    #[test]
    fn out_of_range_tbz_expands_to_tbnz_skip() {
        let mut mf = MachineFunction::new("test".into());
        let target = mf.new_block("after_padding");
        let padding = mf.new_block("padding");
        let test_reg = mf.new_vreg(crate::codegen::mir::RegClass::Gp64);

        mf.blocks[0].insts.push(MachineInst {
            opcode: ArmOpcode::Tbz,
            operands: vec![
                MachineOperand::VReg(test_reg),
                MachineOperand::Imm(7),
                MachineOperand::BlockRef(target),
            ],
            def: None,
        });
        // Tbz's ±32KB range is much tighter than the ±1MB cond-branch
        // range — only ~8K nops are needed to overflow.
        let pad_inst_count = (TBZ_BRANCH_LIMIT as usize) / 4 + 16;
        let padding_pos = mf.blocks.iter().position(|b| b.id == padding).unwrap();
        mf.blocks[padding_pos].insts = (0..pad_inst_count)
            .map(|_| MachineInst {
                opcode: ArmOpcode::Nop,
                operands: vec![],
                def: None,
            })
            .collect();
        mf.blocks.swap(1, 2);

        let blocks_before = mf.blocks.len();
        relax_branches(&mut mf);
        assert_eq!(mf.blocks.len(), blocks_before + 1);
        let entry = &mf.blocks[0];
        assert_eq!(entry.insts.len(), 2);
        assert_eq!(entry.insts[0].opcode, ArmOpcode::Tbnz, "tbz inverts to tbnz");
        // Bit operand survives the rewrite.
        assert!(matches!(entry.insts[0].operands[1], MachineOperand::Imm(7)));
        assert_eq!(entry.insts[1].opcode, ArmOpcode::B);
    }
}