fortrangoingonforty/armfortas / 77677e8

Browse files

Refuse to split phi-like vregs (defined in multiple blocks); remove env gates

Linearscan's live intervals are linear-position [start, end] ranges,
which can't represent the true CFG-aware live set for vregs that
receive values via parallel-copy at every predecessor (block params,
post-phi placeholders). When a call block is lexically wedged
between a phi-vreg's def edge and its use block, the position-based
range straddles the call point but no actual control-flow path
through the vreg traverses it. The splitter then thought the vreg
crossed a call, picked a pre/post split, and assigned the post-half
a different physreg — every predecessor's parallel-copy landed in
pre_phys but every use inside the loop read post_phys.

Track each vreg's set of defining blocks while building
vreg_actual_range; if it has more than one, force real_crosses
false (no splitting). The unit test exercising splitting still
fires because its synthetic vregs have single defs.

Removes:
- ARMFORTAS_SPLIT_INTERVALS env gate (no longer needed)
- detect_partial_unroll_loop's acc_param + store guard (the
underlying regalloc bug it worked around is now fixed)

Verified at all opt levels:
- realworld_seed_overwrite.f90: 4 19 23 (was infinite loop at -O2+)
- realworld_affine_shift.f90: 14 16 (was 14 7 at -O2+)
Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA
77677e8ce07ba75e97685183c6e2473cc3e7ca9a
Parents
f2e4e02
Tree
520cf8f

2 changed files

StatusFile+-
M src/codegen/linearscan.rs 32 23
M src/opt/unroll.rs 0 15
src/codegen/linearscan.rsmodified
@@ -223,13 +223,30 @@ pub fn linear_scan(mf: &mut MachineFunction) -> AllocResult {
223223
     // ways that surface latent bugs in arg-setup emission — but we
224224
     // do consult this tighter map before committing to a split.
225225
     let mut vreg_actual_range: HashMap<VRegId, (u32, u32)> = HashMap::new();
226
+    // Vregs defined in more than one MIR block — these are
227
+    // phi-like (block params receiving values via the parallel-copy
228
+    // sequence emit_branch_arg_copies inserts at the end of every
229
+    // predecessor). Linear-position liveness can't represent the
230
+    // true CFG-aware live set for these, so the splitter must not
231
+    // act on them: a call block lexically wedged between the def
232
+    // edge and the use block fakes a crossing point that no actual
233
+    // control-flow path traverses, and the resulting split assigns
234
+    // the post-half a different physreg than the pre-half — every
235
+    // predecessor's parallel-copy lands in pre_phys but every use
236
+    // inside the loop reads post_phys. (`realworld_affine_shift.f90`
237
+    // at -O2+ exhibits this: V_iv defined in `if_end_1`, used in
238
+    // `do_check_3`/`do_body_4`, but `if_then_2` carrying the
239
+    // recursive call is lexically between if_end_1 and do_check_3.)
240
+    let mut vreg_def_blocks: HashMap<VRegId, std::collections::HashSet<usize>> =
241
+        HashMap::new();
226242
     {
227243
         let mut p: u32 = 0;
228
-        for block in &mf.blocks {
244
+        for (block_idx, block) in mf.blocks.iter().enumerate() {
229245
             for inst in &block.insts {
230246
                 let mut touched: Vec<VRegId> = Vec::new();
231247
                 if let Some(d) = inst.def {
232248
                     touched.push(d);
249
+                    vreg_def_blocks.entry(d).or_default().insert(block_idx);
233250
                 }
234251
                 for op in &inst.operands {
235252
                     if let MachineOperand::VReg(v) = op {
@@ -249,6 +266,11 @@ pub fn linear_scan(mf: &mut MachineFunction) -> AllocResult {
249266
             }
250267
         }
251268
     }
269
+    let multi_def_vregs: std::collections::HashSet<VRegId> = vreg_def_blocks
270
+        .iter()
271
+        .filter(|(_, blocks)| blocks.len() > 1)
272
+        .map(|(v, _)| *v)
273
+        .collect();
252274
 
253275
     // Worklist of intervals to allocate. Starts as a clone of the
254276
     // pre-sorted liveness intervals; splitting inserts post-half
@@ -321,7 +343,14 @@ pub fn linear_scan(mf: &mut MachineFunction) -> AllocResult {
321343
         // range from the MIR walk: a vreg whose real range doesn't
322344
         // straddle the supposed crossing point is a false positive
323345
         // from inflated liveness, and must NOT be split.
324
-        let real_crosses = if let Some(&(real_start, real_end)) =
346
+        let real_crosses = if multi_def_vregs.contains(&interval.vreg) {
347
+            // Phi-like vreg: positions can't represent the true
348
+            // live set (see comment above where multi_def_vregs is
349
+            // built). Refuse to split — the parallel-copy at every
350
+            // predecessor must land in the same physreg as every
351
+            // use, and splitting fundamentally breaks that.
352
+            false
353
+        } else if let Some(&(real_start, real_end)) =
325354
             vreg_actual_range.get(&interval.vreg)
326355
         {
327356
             interval.call_crossings.len() == 1
@@ -336,19 +365,7 @@ pub fn linear_scan(mf: &mut MachineFunction) -> AllocResult {
336365
         // P — by the time the bridge str fires (right above the
337366
         // arg-setup block), P would no longer hold the pre-half's
338367
         // value and the bridge would capture garbage.
339
-        // Live-range splitting is gated behind an opt-in env var
340
-        // until the regalloc phi-resolution interaction is hardened.
341
-        // Without the gate, loops where mem2reg leaves the
342
-        // accumulator in memory but the body has duplicate loads of
343
-        // an invariant scalar can pick up split intervals that alias
344
-        // the loop's iv block param register on the preheader edge,
345
-        // miscompiling `realworld_affine_shift.f90` at -O2+ (iv ends
346
-        // up holding the invariant scalar, not the iv init).
347
-        let splitting_enabled = std::env::var_os("ARMFORTAS_SPLIT_INTERVALS").is_some();
348
-        let safe_pre_phys = if splitting_enabled
349
-            && !free_caller.is_empty()
350
-            && interval.call_crossings.len() == 1
351
-        {
368
+        let safe_pre_phys = if !free_caller.is_empty() && interval.call_crossings.len() == 1 {
352369
             let cp = interval.call_crossings[0];
353370
             let pre_end = cp.saturating_sub(1);
354371
             free_caller.iter().rposition(|&r| {
@@ -1982,14 +1999,6 @@ mod tests {
19821999
 
19832000
     #[test]
19842001
     fn linear_scan_splits_call_crossing_when_callee_saved_exhausted() {
1985
-        // Splitting is gated behind an env var in production until
1986
-        // the regalloc phi-resolution interaction is hardened. Set
1987
-        // the gate here so the unit test can exercise the splitting
1988
-        // path. (Unsetting it after the test would race with parallel
1989
-        // tests; instead, only this one test reads the var, and any
1990
-        // other tests that don't expect splitting use shapes that
1991
-        // wouldn't trigger it anyway.)
1992
-        std::env::set_var("ARMFORTAS_SPLIT_INTERVALS", "1");
19932002
         // 12 vregs all live across a single BL forces the
19942003
         // allocator past the 10-deep callee-saved pool. Without
19952004
         // splitting the overflow vregs full-spill; with splitting
src/opt/unroll.rsmodified
@@ -1223,18 +1223,6 @@ fn detect_partial_unroll_loop(
12231223
     // (e.g., `a(i) = ...; b(i) = ...; c(i) = ...;`) cloned U-way
12241224
     // create too much register pressure across the function and
12251225
     // expose latent regalloc fragility (Sprint 18 follow-up).
1226
-    //
1227
-    // Additional gate for 2-block-param loops with stores: when the
1228
-    // second block param is a counter mutated by an inlined
1229
-    // `intent(inout)` arg (e.g., `do i = 1, n; y(i) = ...;
1230
-    // call touch(token); end do`), the unroller treats the counter
1231
-    // as a reduction accumulator and chains it across iterations.
1232
-    // The IR is correct, but linearscan splits the chained register
1233
-    // across the latch back-edge inconsistently with the preheader
1234
-    // edge — phi resolution misses the move and the chained iadd
1235
-    // in the body uses a constant-holding register, producing an
1236
-    // infinite loop where iv never increments. Skip this combo
1237
-    // until regalloc phi-resolution is hardened.
12381226
     let store_count = latch_blk
12391227
         .insts
12401228
         .iter()
@@ -1243,9 +1231,6 @@ fn detect_partial_unroll_loop(
12431231
     if acc_param.is_none() && store_count > 1 {
12441232
         return None;
12451233
     }
1246
-    if acc_param.is_some() && store_count > 0 {
1247
-        return None;
1248
-    }
12491234
     // Pick the largest U ≤ MAX_FACTOR that divides trip and respects
12501235
     // the body-size budget.
12511236
     let mut chosen_u: Option<usize> = None;