@@ -82,16 +82,36 @@ fn o3_vectorizes_full_extent_do_loop_and_keeps_objects_deterministic() { |
| 82 | 82 | "O2 should keep the scalar loop for this ordinary DO map:\n{}", |
| 83 | 83 | o2_ir |
| 84 | 84 | ); |
| 85 | + // Two valid vectorization shapes at O3: |
| 86 | + // * NeonVectorize rewrites the inner body to vload/vadd/vstore |
| 87 | + // on 128-bit lanes (preferred — no call overhead). |
| 88 | + // * The older Vectorize pass replaces the loop with a single |
| 89 | + // afs_array_add_i32 kernel call (fallback). |
| 90 | + let o3_neon = o3_ir.contains("vstore") && o3_ir.contains("vadd"); |
| 91 | + let o3_kernel = o3_ir.contains("call @afs_array_add_i32("); |
| 85 | 92 | assert!( |
| 86 | | - o3_ir.contains("call @afs_array_add_i32(") && !o3_ir.contains("do_check_"), |
| 87 | | - "O3 should replace the scalar loop with a bulk kernel call:\n{}", |
| 93 | + o3_neon || o3_kernel, |
| 94 | + "O3 should vectorize the scalar loop (vload/vadd/vstore or bulk kernel call):\n{}", |
| 88 | 95 | o3_ir |
| 89 | 96 | ); |
| 90 | | - assert!( |
| 91 | | - o3_asm.contains("_afs_array_add_i32"), |
| 92 | | - "O3 assembly should reference the bulk add kernel:\n{}", |
| 93 | | - o3_asm |
| 94 | | - ); |
| 97 | + if o3_kernel { |
| 98 | + assert!( |
| 99 | + !o3_ir.contains("do_check_"), |
| 100 | + "kernel-form O3 should remove the loop CFG entirely:\n{}", |
| 101 | + o3_ir |
| 102 | + ); |
| 103 | + assert!( |
| 104 | + o3_asm.contains("_afs_array_add_i32"), |
| 105 | + "kernel-form O3 assembly should reference the bulk add kernel:\n{}", |
| 106 | + o3_asm |
| 107 | + ); |
| 108 | + } else { |
| 109 | + assert!( |
| 110 | + o3_asm.contains("ldr q") || o3_asm.contains("add.4s") || o3_asm.contains("str q"), |
| 111 | + "neon-form O3 assembly should reference 128-bit vector ops:\n{}", |
| 112 | + o3_asm |
| 113 | + ); |
| 114 | + } |
| 95 | 115 | assert_eq!( |
| 96 | 116 | o3_obj_a, o3_obj_b, |
| 97 | 117 | "O3 vectorized object snapshot should stay deterministic" |