| 1 | //! ARM64 conditional-branch range relaxation. |
| 2 | //! |
| 3 | //! `B.cond` carries a 19-bit signed PC-relative immediate, so its |
| 4 | //! target must lie within ±1MB of the branch. Single-function bodies |
| 5 | //! that exceed that span (notably `stdlib_slaruv`, ~363KB of compiled |
| 6 | //! assembly with cross-body conditional jumps) trip the assembler's |
| 7 | //! "fixup value out of range" error. |
| 8 | //! |
| 9 | //! This pass runs after register allocation and tail-call optimization, |
| 10 | //! before assembly emission. It walks the function's blocks in linear |
| 11 | //! order, computes an approximate byte offset for every block label, |
| 12 | //! and for each `B.cond far_label` whose distance exceeds the |
| 13 | //! conditional-branch encoding window expands it to the trampoline |
| 14 | //! pattern: |
| 15 | //! |
| 16 | //! ```text |
| 17 | //! Before: After (when target is too far): |
| 18 | //! b.cond far_label b.{!cond} skip ; one branch over |
| 19 | //! b far_label ; ±128MB unconditional |
| 20 | //! skip: ; falls through |
| 21 | //! ``` |
| 22 | //! |
| 23 | //! Inserting an extra block + branch shifts subsequent block offsets, |
| 24 | //! which can push other previously-in-range branches out of range, |
| 25 | //! so we iterate the pass until no further insertions are needed |
| 26 | //! (capped at four iterations — non-convergence triggers a panic that |
| 27 | //! survives release builds and produces a clear ICE for diagnostics). |
| 28 | //! |
| 29 | //! ARM64 unconditional `B` and `BL` carry a 26-bit immediate (±128MB |
| 30 | //! range), so the trampoline's unconditional `B` itself never needs |
| 31 | //! relaxation — no realistic single Fortran source produces a |
| 32 | //! function body larger than 128MB. |
| 33 | |
| 34 | use super::emit::emit_inst_text; |
| 35 | use super::mir::{ |
| 36 | ArmCond, ArmOpcode, MBlockId, MachineBlock, MachineFunction, MachineInst, MachineOperand, |
| 37 | }; |
| 38 | |
/// Maximum signed offset (in bytes) reachable by a `B.cond` and by
/// `cbz/cbnz`. The 19-bit signed immediate is scaled by 4, giving
/// ±(2^20) bytes. Subtract a safety margin (64 bytes = 16 emitted
/// instructions) so we never sit right on the edge: a downstream
/// peephole insertion or fall-through fixup that nudges the offset
/// by a single instruction shouldn't push us over.
const COND_BRANCH_LIMIT: i64 = (1 << 20) - 64;

/// Maximum signed offset reachable by `tbz`/`tbnz`. The 14-bit
/// signed immediate is scaled by 4, giving ±(2^15) bytes — much
/// tighter than the cond-branch / cbz limit. Same safety margin.
const TBZ_BRANCH_LIMIT: i64 = (1 << 15) - 64;

/// Iteration cap for the relax loop in `relax_branches`. In practice
/// 1–2 passes suffice; if convergence genuinely doesn't happen,
/// something pathological is going on and we want a loud failure
/// rather than a silent infinite loop.
const MAX_ITERATIONS: u32 = 4;
| 56 | |
| 57 | /// Run branch relaxation on a machine function. Idempotent: when no |
| 58 | /// `B.cond` overflows, the function is returned unchanged. |
| 59 | pub fn relax_branches(mf: &mut MachineFunction) { |
| 60 | for _ in 0..MAX_ITERATIONS { |
| 61 | if !relax_once(mf) { |
| 62 | return; |
| 63 | } |
| 64 | } |
| 65 | // Convergence failure — extremely unlikely. Better to ICE here |
| 66 | // than emit an assembly file the assembler will reject. |
| 67 | panic!( |
| 68 | "branch relaxation did not converge for function {} after {} iterations", |
| 69 | mf.name, MAX_ITERATIONS |
| 70 | ); |
| 71 | } |
| 72 | |
| 73 | /// Single relaxation pass. Returns `true` when at least one branch |
| 74 | /// was expanded (caller should iterate); `false` when every `B.cond` |
| 75 | /// is in range. |
| 76 | fn relax_once(mf: &mut MachineFunction) -> bool { |
| 77 | // Compute byte offsets for every block label using the actual |
| 78 | // emit-time instruction count. Several MIR opcodes lower to |
| 79 | // multiple ARM64 instructions (AdrpLdr / AdrpAdd, prologue + |
| 80 | // stack-probe sequences, large-immediate movz/movk chains, etc.), |
| 81 | // so a flat 4-bytes-per-MachineInst estimate would systematically |
| 82 | // under-shoot the real offsets in functions like stdlib_slaruv |
| 83 | // and miss far branches that need relaxation. |
| 84 | let block_offsets = compute_block_offsets(mf); |
| 85 | |
| 86 | // Collect overflow sites before mutating anything. We also record |
| 87 | // the per-instruction prefix offset within each block so we can |
| 88 | // skip past wide-emit insts that precede the conditional branch. |
| 89 | let mut overflows: Vec<OverflowSite> = Vec::new(); |
| 90 | for block in &mf.blocks { |
| 91 | let block_offset = block_offsets[&block.id]; |
| 92 | let mut running = 0i64; |
| 93 | for (inst_idx, inst) in block.insts.iter().enumerate() { |
| 94 | if let Some((target, limit, kind)) = relaxable_cond_branch(inst) { |
| 95 | if let Some(&target_offset) = block_offsets.get(&target) { |
| 96 | let branch_offset = block_offset + running; |
| 97 | let delta = target_offset - branch_offset; |
| 98 | if delta.abs() > limit { |
| 99 | overflows.push(OverflowSite { |
| 100 | block_id: block.id, |
| 101 | inst_idx, |
| 102 | target, |
| 103 | kind, |
| 104 | }); |
| 105 | } |
| 106 | } |
| 107 | } |
| 108 | running += inst_emit_bytes(inst, mf) as i64; |
| 109 | } |
| 110 | } |
| 111 | |
| 112 | if overflows.is_empty() { |
| 113 | return false; |
| 114 | } |
| 115 | |
| 116 | // Expand each overflow. Sort in reverse order of (block_id index, |
| 117 | // inst_idx) so earlier expansions don't invalidate later inst_idx |
| 118 | // values — block-vec insertions happen at the back of the affected |
| 119 | // block group. |
| 120 | overflows.sort_by(|a, b| { |
| 121 | let a_pos = block_position(mf, a.block_id); |
| 122 | let b_pos = block_position(mf, b.block_id); |
| 123 | b_pos.cmp(&a_pos).then(b.inst_idx.cmp(&a.inst_idx)) |
| 124 | }); |
| 125 | |
| 126 | for site in overflows { |
| 127 | expand_to_trampoline(mf, site); |
| 128 | } |
| 129 | |
| 130 | true |
| 131 | } |
| 132 | |
/// One out-of-range conditional branch discovered during a scan pass,
/// identified by its position in the block list plus everything needed
/// to rebuild the inverted form.
#[derive(Clone)]
struct OverflowSite {
    // Block containing the overflowing branch.
    block_id: MBlockId,
    // Index of the branch within that block's `insts`.
    inst_idx: usize,
    // The far-away block the branch wants to reach.
    target: MBlockId,
    // How to invert the branch for the trampoline's skip hop.
    kind: RelaxKind,
}
| 140 | |
/// Per-opcode description of the inverted branch used in the
/// trampoline expansion.
#[derive(Clone)]
enum RelaxKind {
    /// `b.cond far` → `b.{!cond} skip; b far`
    BCond { cond: ArmCond },
    /// `cbz/cbnz reg far` → `cbnz/cbz reg skip; b far`
    Cbz { invert_to: ArmOpcode, reg: MachineOperand },
    /// `tbz/tbnz reg, #bit far` → `tbnz/tbz reg, #bit skip; b far`
    Tbz {
        // Already-inverted opcode to emit for the skip hop.
        invert_to: ArmOpcode,
        // Register operand carried over unchanged.
        reg: MachineOperand,
        // Bit-number immediate carried over unchanged.
        bit: i64,
    },
}
| 154 | |
| 155 | /// Inspect a machine instruction; if it is a conditional branch that |
| 156 | /// might need relaxation, return the target, the per-opcode range |
| 157 | /// limit, and a kind descriptor capturing what the inverted form |
| 158 | /// looks like. Returns `None` for non-branch instructions or branches |
| 159 | /// whose targets are unrecoverable from operands. |
| 160 | fn relaxable_cond_branch(inst: &MachineInst) -> Option<(MBlockId, i64, RelaxKind)> { |
| 161 | match inst.opcode { |
| 162 | ArmOpcode::BCond => { |
| 163 | let target = bcond_target(inst)?; |
| 164 | let cond = bcond_cond(inst)?; |
| 165 | Some((target, COND_BRANCH_LIMIT, RelaxKind::BCond { cond })) |
| 166 | } |
| 167 | ArmOpcode::Cbz | ArmOpcode::Cbnz => { |
| 168 | // Operands: [reg, BlockRef] |
| 169 | let target = match inst.operands.get(1)? { |
| 170 | MachineOperand::BlockRef(id) => *id, |
| 171 | _ => return None, |
| 172 | }; |
| 173 | let reg = inst.operands.first()?.clone(); |
| 174 | let invert_to = match inst.opcode { |
| 175 | ArmOpcode::Cbz => ArmOpcode::Cbnz, |
| 176 | _ => ArmOpcode::Cbz, |
| 177 | }; |
| 178 | Some((target, COND_BRANCH_LIMIT, RelaxKind::Cbz { invert_to, reg })) |
| 179 | } |
| 180 | ArmOpcode::Tbz | ArmOpcode::Tbnz => { |
| 181 | // Operands: [reg, Imm(bit), BlockRef] |
| 182 | let bit = match inst.operands.get(1)? { |
| 183 | MachineOperand::Imm(v) => *v, |
| 184 | _ => return None, |
| 185 | }; |
| 186 | let target = match inst.operands.get(2)? { |
| 187 | MachineOperand::BlockRef(id) => *id, |
| 188 | _ => return None, |
| 189 | }; |
| 190 | let reg = inst.operands.first()?.clone(); |
| 191 | let invert_to = match inst.opcode { |
| 192 | ArmOpcode::Tbz => ArmOpcode::Tbnz, |
| 193 | _ => ArmOpcode::Tbz, |
| 194 | }; |
| 195 | Some(( |
| 196 | target, |
| 197 | TBZ_BRANCH_LIMIT, |
| 198 | RelaxKind::Tbz { |
| 199 | invert_to, |
| 200 | reg, |
| 201 | bit, |
| 202 | }, |
| 203 | )) |
| 204 | } |
| 205 | _ => None, |
| 206 | } |
| 207 | } |
| 208 | |
| 209 | /// Compute byte offsets for every block label, summing the actual |
| 210 | /// emit-time instruction byte count for every MachineInst in linear |
| 211 | /// order. Each emitted ARM64 instruction is 4 bytes; we count the |
| 212 | /// `\n` separators in the emitted text to figure out how many real |
| 213 | /// instructions an opcode produces (most are 1, but pseudo-ops and |
| 214 | /// large-immediate forms emit 2+). |
| 215 | fn compute_block_offsets(mf: &MachineFunction) -> std::collections::HashMap<MBlockId, i64> { |
| 216 | let mut offsets = std::collections::HashMap::with_capacity(mf.blocks.len()); |
| 217 | let mut running: i64 = 0; |
| 218 | for block in &mf.blocks { |
| 219 | offsets.insert(block.id, running); |
| 220 | for inst in &block.insts { |
| 221 | running += inst_emit_bytes(inst, mf) as i64; |
| 222 | } |
| 223 | } |
| 224 | offsets |
| 225 | } |
| 226 | |
| 227 | /// Number of bytes a single MachineInst emits at assembly time. |
| 228 | /// Counted from the rendered text — `emit_inst_text` already produces |
| 229 | /// exactly the lines `emit_function` would, so newline-count + 1 |
| 230 | /// matches the real instruction count without re-deriving each |
| 231 | /// opcode's expansion rules here. |
| 232 | fn inst_emit_bytes(inst: &MachineInst, mf: &MachineFunction) -> u32 { |
| 233 | let text = emit_inst_text(inst, mf); |
| 234 | let lines = text.matches('\n').count() as u32 + 1; |
| 235 | 4 * lines |
| 236 | } |
| 237 | |
| 238 | fn block_position(mf: &MachineFunction, id: MBlockId) -> usize { |
| 239 | mf.blocks |
| 240 | .iter() |
| 241 | .position(|b| b.id == id) |
| 242 | .expect("block id present in block list") |
| 243 | } |
| 244 | |
| 245 | fn bcond_target(inst: &MachineInst) -> Option<MBlockId> { |
| 246 | inst.operands.iter().find_map(|op| match op { |
| 247 | MachineOperand::BlockRef(id) => Some(*id), |
| 248 | _ => None, |
| 249 | }) |
| 250 | } |
| 251 | |
| 252 | fn bcond_cond(inst: &MachineInst) -> Option<ArmCond> { |
| 253 | inst.operands.iter().find_map(|op| match op { |
| 254 | MachineOperand::Cond(c) => Some(*c), |
| 255 | _ => None, |
| 256 | }) |
| 257 | } |
| 258 | |
| 259 | /// Replace an out-of-range conditional branch at `(block_id, inst_idx)` |
| 260 | /// with the trampoline pattern. Allocates a fresh skip block and |
| 261 | /// inserts it right after the original block in `mf.blocks`. The |
| 262 | /// inverted-condition branch jumps over the unconditional `b`, which |
| 263 | /// has the full ±128MB reach of the unconditional encoding — so the |
| 264 | /// trampoline itself never needs further relaxation regardless of |
| 265 | /// which conditional opcode we started from. |
| 266 | fn expand_to_trampoline(mf: &mut MachineFunction, site: OverflowSite) { |
| 267 | let OverflowSite { |
| 268 | block_id, |
| 269 | inst_idx, |
| 270 | target: far_target, |
| 271 | kind, |
| 272 | } = site; |
| 273 | |
| 274 | // Allocate a fresh skip-block id without disturbing any other |
| 275 | // bookkeeping. We can't call `mf.new_block` directly because it |
| 276 | // pushes onto the back of the vec; we want the skip block |
| 277 | // physically adjacent to the original block (so its label |
| 278 | // fall-through reaches the original successor). |
| 279 | let skip_id = MBlockId(mf.next_block_id()); |
| 280 | let skip_label = format!("{}_relax{}", mf.block(block_id).label, skip_id.0); |
| 281 | |
| 282 | let inverted = match kind { |
| 283 | RelaxKind::BCond { cond } => MachineInst { |
| 284 | opcode: ArmOpcode::BCond, |
| 285 | operands: vec![ |
| 286 | MachineOperand::Cond(cond.inverse()), |
| 287 | MachineOperand::BlockRef(skip_id), |
| 288 | ], |
| 289 | def: None, |
| 290 | }, |
| 291 | RelaxKind::Cbz { invert_to, reg } => MachineInst { |
| 292 | opcode: invert_to, |
| 293 | operands: vec![reg, MachineOperand::BlockRef(skip_id)], |
| 294 | def: None, |
| 295 | }, |
| 296 | RelaxKind::Tbz { |
| 297 | invert_to, |
| 298 | reg, |
| 299 | bit, |
| 300 | } => MachineInst { |
| 301 | opcode: invert_to, |
| 302 | operands: vec![ |
| 303 | reg, |
| 304 | MachineOperand::Imm(bit), |
| 305 | MachineOperand::BlockRef(skip_id), |
| 306 | ], |
| 307 | def: None, |
| 308 | }, |
| 309 | }; |
| 310 | let unconditional_b = MachineInst { |
| 311 | opcode: ArmOpcode::B, |
| 312 | operands: vec![MachineOperand::BlockRef(far_target)], |
| 313 | def: None, |
| 314 | }; |
| 315 | |
| 316 | // Splice them in, preserving any instructions that followed the |
| 317 | // original branch. Those trailing instructions (uncommon) move |
| 318 | // into the skip block to keep program order. |
| 319 | let block_pos = block_position(mf, block_id); |
| 320 | let trailing = { |
| 321 | let block = &mut mf.blocks[block_pos]; |
| 322 | let trailing: Vec<MachineInst> = block.insts.drain(inst_idx + 1..).collect(); |
| 323 | block.insts[inst_idx] = inverted; |
| 324 | block.insts.push(unconditional_b); |
| 325 | trailing |
| 326 | }; |
| 327 | |
| 328 | let mut skip_block = MachineBlock::new(skip_id, skip_label); |
| 329 | skip_block.insts = trailing; |
| 330 | mf.blocks.insert(block_pos + 1, skip_block); |
| 331 | } |
| 332 | |
#[cfg(test)]
mod tests {
    use super::*;

    /// Compile-time confirmation that every ArmCond's inverse is
    /// involutive. Easy to typo otherwise.
    #[test]
    fn arm_cond_inverse_is_involutive() {
        for c in [
            ArmCond::Eq,
            ArmCond::Ne,
            ArmCond::Hs,
            ArmCond::Lo,
            ArmCond::Mi,
            ArmCond::Pl,
            ArmCond::Hi,
            ArmCond::Ls,
            ArmCond::Ge,
            ArmCond::Lt,
            ArmCond::Gt,
            ArmCond::Le,
        ] {
            assert_eq!(c.inverse().inverse(), c, "{:?}", c);
        }
    }

    /// In-range branches stay as a single instruction.
    #[test]
    fn in_range_bcond_unchanged() {
        let mut mf = MachineFunction::new("test".into());
        let target = mf.new_block("after");
        let entry_id = mf.blocks[0].id;
        mf.blocks[0].insts.push(MachineInst {
            opcode: ArmOpcode::BCond,
            operands: vec![
                MachineOperand::Cond(ArmCond::Ne),
                MachineOperand::BlockRef(target),
            ],
            def: None,
        });
        // Silence the unused-variable warning; entry_id exists only
        // to document which block received the branch.
        let _ = entry_id;

        let block_count_before = mf.blocks.len();
        relax_branches(&mut mf);
        assert_eq!(mf.blocks.len(), block_count_before);
        assert_eq!(mf.blocks[0].insts.len(), 1);
        assert_eq!(mf.blocks[0].insts[0].opcode, ArmOpcode::BCond);
    }

    /// Out-of-range branch expands to a trampoline. We force
    /// overflow by stuffing one block with enough nops to push the
    /// target's offset past ±1MB.
    #[test]
    fn out_of_range_bcond_expands() {
        let mut mf = MachineFunction::new("test".into());
        let target = mf.new_block("after_padding");
        let padding = mf.new_block("padding");

        // Entry: BCond Ne -> target.
        mf.blocks[0].insts.push(MachineInst {
            opcode: ArmOpcode::BCond,
            operands: vec![
                MachineOperand::Cond(ArmCond::Ne),
                MachineOperand::BlockRef(target),
            ],
            def: None,
        });
        // Branch into padding so the entry block has at least one
        // successor, then padding stuffs enough Nops to push the
        // `after_padding` label past ±1MB.
        // (+16 nops of slack to clear the limit's safety margin.)
        let pad_inst_count = (COND_BRANCH_LIMIT as usize) / 4 + 16;
        let padding_pos = mf.blocks.iter().position(|b| b.id == padding).unwrap();
        mf.blocks[padding_pos].insts = (0..pad_inst_count)
            .map(|_| MachineInst {
                opcode: ArmOpcode::Nop,
                operands: vec![],
                def: None,
            })
            .collect();
        // Layout order: entry → padding → target. The padding block
        // pushes target's offset out of range from the entry's BCond.
        mf.blocks.swap(1, 2); // ensure entry, padding, target order

        let blocks_before = mf.blocks.len();
        relax_branches(&mut mf);
        // One new skip block was inserted right after entry.
        assert_eq!(mf.blocks.len(), blocks_before + 1);
        // Entry now ends with: BCond(inverted) -> skip; B target.
        let entry = &mf.blocks[0];
        assert_eq!(entry.insts.len(), 2);
        assert_eq!(entry.insts[0].opcode, ArmOpcode::BCond);
        assert!(matches!(
            entry.insts[0].operands[0],
            MachineOperand::Cond(ArmCond::Eq) // inverse of Ne
        ));
        assert_eq!(entry.insts[1].opcode, ArmOpcode::B);
        assert!(matches!(
            entry.insts[1].operands[0],
            MachineOperand::BlockRef(t) if t == target
        ));
    }

    /// In-range cbz stays as a single instruction.
    #[test]
    fn in_range_cbz_unchanged() {
        let mut mf = MachineFunction::new("test".into());
        let target = mf.new_block("after");
        let test_reg = mf.new_vreg(crate::codegen::mir::RegClass::Gp32);
        mf.blocks[0].insts.push(MachineInst {
            opcode: ArmOpcode::Cbz,
            operands: vec![
                MachineOperand::VReg(test_reg),
                MachineOperand::BlockRef(target),
            ],
            def: None,
        });

        let block_count_before = mf.blocks.len();
        relax_branches(&mut mf);
        assert_eq!(mf.blocks.len(), block_count_before);
        assert_eq!(mf.blocks[0].insts.len(), 1);
        assert_eq!(mf.blocks[0].insts[0].opcode, ArmOpcode::Cbz);
    }

    /// Out-of-range cbz expands to: cbnz reg, skip; b far. Padding
    /// pushes the target past ±1MB so the original cbz can no longer
    /// reach it directly. The inverted opcode (Cbnz) jumps over the
    /// unconditional `b`, which has the full ±128MB reach and never
    /// itself needs further relaxation.
    #[test]
    fn out_of_range_cbz_expands_to_inverted_skip() {
        let mut mf = MachineFunction::new("test".into());
        let target = mf.new_block("after_padding");
        let padding = mf.new_block("padding");
        let test_reg = mf.new_vreg(crate::codegen::mir::RegClass::Gp32);

        mf.blocks[0].insts.push(MachineInst {
            opcode: ArmOpcode::Cbz,
            operands: vec![
                MachineOperand::VReg(test_reg),
                MachineOperand::BlockRef(target),
            ],
            def: None,
        });
        // cbz shares the ±1MB window with b.cond, so the same nop
        // count overflows it.
        let pad_inst_count = (COND_BRANCH_LIMIT as usize) / 4 + 16;
        let padding_pos = mf.blocks.iter().position(|b| b.id == padding).unwrap();
        mf.blocks[padding_pos].insts = (0..pad_inst_count)
            .map(|_| MachineInst {
                opcode: ArmOpcode::Nop,
                operands: vec![],
                def: None,
            })
            .collect();
        mf.blocks.swap(1, 2);

        let blocks_before = mf.blocks.len();
        relax_branches(&mut mf);
        assert_eq!(mf.blocks.len(), blocks_before + 1);
        let entry = &mf.blocks[0];
        assert_eq!(entry.insts.len(), 2);
        assert_eq!(entry.insts[0].opcode, ArmOpcode::Cbnz, "cbz inverts to cbnz");
        assert_eq!(entry.insts[1].opcode, ArmOpcode::B);
        assert!(matches!(
            entry.insts[1].operands[0],
            MachineOperand::BlockRef(t) if t == target
        ));
    }

    /// Out-of-range cbnz inverts to cbz on the skip path.
    #[test]
    fn out_of_range_cbnz_expands_to_cbz_skip() {
        let mut mf = MachineFunction::new("test".into());
        let target = mf.new_block("after_padding");
        let padding = mf.new_block("padding");
        let test_reg = mf.new_vreg(crate::codegen::mir::RegClass::Gp64);

        mf.blocks[0].insts.push(MachineInst {
            opcode: ArmOpcode::Cbnz,
            operands: vec![
                MachineOperand::VReg(test_reg),
                MachineOperand::BlockRef(target),
            ],
            def: None,
        });
        let pad_inst_count = (COND_BRANCH_LIMIT as usize) / 4 + 16;
        let padding_pos = mf.blocks.iter().position(|b| b.id == padding).unwrap();
        mf.blocks[padding_pos].insts = (0..pad_inst_count)
            .map(|_| MachineInst {
                opcode: ArmOpcode::Nop,
                operands: vec![],
                def: None,
            })
            .collect();
        mf.blocks.swap(1, 2);

        relax_branches(&mut mf);
        let entry = &mf.blocks[0];
        assert_eq!(entry.insts[0].opcode, ArmOpcode::Cbz, "cbnz inverts to cbz");
    }

    /// In-range tbz stays as a single instruction.
    #[test]
    fn in_range_tbz_unchanged() {
        let mut mf = MachineFunction::new("test".into());
        let target = mf.new_block("after");
        let test_reg = mf.new_vreg(crate::codegen::mir::RegClass::Gp64);
        mf.blocks[0].insts.push(MachineInst {
            opcode: ArmOpcode::Tbz,
            operands: vec![
                MachineOperand::VReg(test_reg),
                MachineOperand::Imm(5),
                MachineOperand::BlockRef(target),
            ],
            def: None,
        });

        let block_count_before = mf.blocks.len();
        relax_branches(&mut mf);
        assert_eq!(mf.blocks.len(), block_count_before);
        assert_eq!(mf.blocks[0].insts.len(), 1);
        assert_eq!(mf.blocks[0].insts[0].opcode, ArmOpcode::Tbz);
    }

    /// Out-of-range tbz uses the tighter ±32KB limit and expands. Tbz
    /// is the most range-restricted of the four conditional branches
    /// we emit, so anything past 32K-ish bytes from the branch must
    /// trip relaxation.
    #[test]
    fn out_of_range_tbz_expands_to_tbnz_skip() {
        let mut mf = MachineFunction::new("test".into());
        let target = mf.new_block("after_padding");
        let padding = mf.new_block("padding");
        let test_reg = mf.new_vreg(crate::codegen::mir::RegClass::Gp64);

        mf.blocks[0].insts.push(MachineInst {
            opcode: ArmOpcode::Tbz,
            operands: vec![
                MachineOperand::VReg(test_reg),
                MachineOperand::Imm(7),
                MachineOperand::BlockRef(target),
            ],
            def: None,
        });
        // Tbz's ±32KB range is much tighter than the ±1MB cond-branch
        // range — only ~8K nops are needed to overflow.
        let pad_inst_count = (TBZ_BRANCH_LIMIT as usize) / 4 + 16;
        let padding_pos = mf.blocks.iter().position(|b| b.id == padding).unwrap();
        mf.blocks[padding_pos].insts = (0..pad_inst_count)
            .map(|_| MachineInst {
                opcode: ArmOpcode::Nop,
                operands: vec![],
                def: None,
            })
            .collect();
        mf.blocks.swap(1, 2);

        let blocks_before = mf.blocks.len();
        relax_branches(&mut mf);
        assert_eq!(mf.blocks.len(), blocks_before + 1);
        let entry = &mf.blocks[0];
        assert_eq!(entry.insts.len(), 2);
        assert_eq!(entry.insts[0].opcode, ArmOpcode::Tbnz, "tbz inverts to tbnz");
        // Bit operand survives the rewrite.
        assert!(matches!(entry.insts[0].operands[1], MachineOperand::Imm(7)));
        assert_eq!(entry.insts[1].opcode, ArmOpcode::B);
    }
}
| 600 |