fortrangoingonforty/armfortas / db1605b

Browse files

Accept either NEON or kernel-call form of O3 in vectorize_do_loop

Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA
db1605bd7db6d92c40b1c97667ab8453130cb589
Parents
33036ef
Tree
cd993ee

1 changed file

Status | File | + | -
M tests/vectorize_do_loop.rs 27 7
tests/vectorize_do_loop.rs (modified)
@@ -82,16 +82,36 @@ fn o3_vectorizes_full_extent_do_loop_and_keeps_objects_deterministic() {
82 82
         "O2 should keep the scalar loop for this ordinary DO map:\n{}",
83 83
         o2_ir
84 84
     );
85
+    // Two valid vectorization shapes at O3:
86
+    //   * NeonVectorize rewrites the inner body to vload/vadd/vstore
87
+    //     on 128-bit lanes (preferred — no call overhead).
88
+    //   * The older Vectorize pass replaces the loop with a single
89
+    //     afs_array_add_i32 kernel call (fallback).
90
+    let o3_neon = o3_ir.contains("vstore") && o3_ir.contains("vadd");
91
+    let o3_kernel = o3_ir.contains("call @afs_array_add_i32(");
85 92
     assert!(
86
-        o3_ir.contains("call @afs_array_add_i32(") && !o3_ir.contains("do_check_"),
87
-        "O3 should replace the scalar loop with a bulk kernel call:\n{}",
93
+        o3_neon || o3_kernel,
94
+        "O3 should vectorize the scalar loop (vload/vadd/vstore or bulk kernel call):\n{}",
88 95
         o3_ir
89 96
     );
90
-    assert!(
91
-        o3_asm.contains("_afs_array_add_i32"),
92
-        "O3 assembly should reference the bulk add kernel:\n{}",
93
-        o3_asm
94
-    );
97
+    if o3_kernel {
98
+        assert!(
99
+            !o3_ir.contains("do_check_"),
100
+            "kernel-form O3 should remove the loop CFG entirely:\n{}",
101
+            o3_ir
102
+        );
103
+        assert!(
104
+            o3_asm.contains("_afs_array_add_i32"),
105
+            "kernel-form O3 assembly should reference the bulk add kernel:\n{}",
106
+            o3_asm
107
+        );
108
+    } else {
109
+        assert!(
110
+            o3_asm.contains("ldr q") || o3_asm.contains("add.4s") || o3_asm.contains("str q"),
111
+            "neon-form O3 assembly should reference 128-bit vector ops:\n{}",
112
+            o3_asm
113
+        );
114
+    }
95 115
     assert_eq!(
96 116
         o3_obj_a, o3_obj_b,
97 117
         "O3 vectorized object snapshot should stay deterministic"