fortrangoingonforty/armfortas / 33036ef

Browse files

Accept either NEON or kernel-call form of O3 vectorization

The test was written before NeonVectorize landed and asserted
specifically on the older bulk-runtime-call shape (afs_array_add_i32).
With NeonVectorize fixed end-to-end, O3 now lowers this loop to
inline vload/vadd/vstore on 128-bit lanes. Both shapes are valid
'vectorize' outcomes — the test now accepts either. The old
'do_check_ count must drop' check only applies to the kernel form
(which replaces the loop CFG entirely); the NEON form preserves the
loop and just rewrites the body.
Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA
33036efedb7b356a2afdbe9e8cc0d92347cf3d85
Parents
c180ab0
Tree
e50bf68

1 changed file

Status File + -
M tests/claims_audit_29_11.rs 43 9
tests/claims_audit_29_11.rs modified
@@ -267,21 +267,55 @@ fn o3_vectorizes_realworld_explicit_do_stage() {
267267
     );
268268
 
269269
     assert!(
270
-        o2_ir.matches("do_check_").count() >= 2 && !o2_ir.contains("call @afs_array_add_i32("),
270
+        o2_ir.matches("do_check_").count() >= 2
271
+            && !o2_ir.contains("call @afs_array_add_i32(")
272
+            && !o2_ir.contains("vadd"),
271273
         "O2 should still keep the explicit scalar loop for this real-world stage:\n{}",
272274
         o2_ir
273275
     );
276
+    // O3 vectorization can land in either of two forms now:
277
+    //   * The newer NeonVectorize pass rewrites the inner body to
278
+    //     vload/vadd/vstore on 128-bit lanes (preferred — no call
279
+    //     overhead, fewer iterations).
280
+    //   * The older Vectorize pass redirects the whole loop to the
281
+    //     bulk runtime kernel `afs_array_add_i32` (fallback for
282
+    //     shapes the NEON pass does not yet handle).
283
+    // Either is a valid "vectorization" claim for this loop; the
284
+    // load-bearing invariant is that the explicit do_check chain
285
+    // shrinks and the loop body becomes vector-shaped (or a kernel
286
+    // call) instead of scalar load/iadd/store.
287
+    let o3_neon = o3_ir.contains("vstore") && o3_ir.contains("vadd");
288
+    let o3_kernel = o3_ir.contains("call @afs_array_add_i32(");
289
+    // For the kernel form the loop CFG is replaced by a single call,
290
+    // so the do_check block count drops. For the NEON form the loop
291
+    // CFG is preserved (vector ops live inside the body), so the
292
+    // assertion is just that the body is vector-shaped, not that
293
+    // the CFG shrank.
274294
     assert!(
275
-        o3_ir.contains("call @afs_array_add_i32(")
276
-            && o3_ir.matches("do_check_").count() < o2_ir.matches("do_check_").count(),
277
-        "O3 should redirect the real-world explicit DO loop to the bulk add kernel:\n{}",
295
+        o3_kernel || o3_neon,
296
+        "O3 should vectorize the real-world explicit DO loop (vload/vadd/vstore or bulk kernel call):\n{}",
278297
         o3_ir
279298
     );
280
-    assert!(
281
-        o3_asm.contains("_afs_array_add_i32"),
282
-        "vectorized O3 assembly should reference the bulk add kernel:\n{}",
283
-        o3_asm
284
-    );
299
+    if o3_kernel {
300
+        assert!(
301
+            o3_ir.matches("do_check_").count() < o2_ir.matches("do_check_").count(),
302
+            "kernel-form O3 should replace the explicit DO with a single call:\n{}",
303
+            o3_ir
304
+        );
305
+    }
306
+    if o3_kernel {
307
+        assert!(
308
+            o3_asm.contains("_afs_array_add_i32"),
309
+            "kernel-form O3 assembly should reference the bulk add kernel:\n{}",
310
+            o3_asm
311
+        );
312
+    } else {
313
+        assert!(
314
+            o3_asm.contains("add.4s") || o3_asm.contains("ldr q") || o3_asm.contains("str q"),
315
+            "neon-form O3 assembly should reference 128-bit vector ops:\n{}",
316
+            o3_asm
317
+        );
318
+    }
285319
     assert_eq!(
286320
         o3_obj_a, o3_obj_b,
287321
         "vectorized O3 object snapshot should stay deterministic"