@@ -267,21 +267,55 @@ fn o3_vectorizes_realworld_explicit_do_stage() { |
| 267 | 267 | ); |
| 268 | 268 | |
| 269 | 269 | assert!( |
| 270 | | - o2_ir.matches("do_check_").count() >= 2 && !o2_ir.contains("call @afs_array_add_i32("), |
| 270 | + o2_ir.matches("do_check_").count() >= 2 |
| 271 | + && !o2_ir.contains("call @afs_array_add_i32(") |
| 272 | + && !o2_ir.contains("vadd"), |
| 271 | 273 | "O2 should still keep the explicit scalar loop for this real-world stage:\n{}", |
| 272 | 274 | o2_ir |
| 273 | 275 | ); |
| 276 | + // O3 vectorization can land in either of two forms now: |
| 277 | + // * The newer NeonVectorize pass rewrites the inner body to |
| 278 | + // vload/vadd/vstore on 128-bit lanes (preferred — no call |
| 279 | + // overhead, fewer iterations). |
| 280 | + // * The older Vectorize pass redirects the whole loop to the |
| 281 | + // bulk runtime kernel `afs_array_add_i32` (fallback for |
| 282 | + // shapes the NEON pass does not yet handle). |
| 283 | + // Either is a valid "vectorization" claim for this loop; the |
| 284 | + // load-bearing invariant is that the loop body stops being scalar |
| 285 | + // load/iadd/store: it becomes vector-shaped (NEON form) or a kernel |
| 286 | + // call, which additionally shrinks the do_check chain (kernel form). |
| 287 | + let o3_neon = o3_ir.contains("vstore") && o3_ir.contains("vadd"); |
| 288 | + let o3_kernel = o3_ir.contains("call @afs_array_add_i32("); |
| 289 | + // For the kernel form the loop CFG is replaced by a single call, |
| 290 | + // so the do_check block count drops. For the NEON form the loop |
| 291 | + // CFG is preserved (vector ops live inside the body), so the |
| 292 | + // assertion is just that the body is vector-shaped, not that |
| 293 | + // the CFG shrank. |
| 274 | 294 | assert!( |
| 275 | | - o3_ir.contains("call @afs_array_add_i32(") |
| 276 | | - && o3_ir.matches("do_check_").count() < o2_ir.matches("do_check_").count(), |
| 277 | | - "O3 should redirect the real-world explicit DO loop to the bulk add kernel:\n{}", |
| 295 | + o3_kernel || o3_neon, |
| 296 | + "O3 should vectorize the real-world explicit DO loop (vload/vadd/vstore or bulk kernel call):\n{}", |
| 278 | 297 | o3_ir |
| 279 | 298 | ); |
| 280 | | - assert!( |
| 281 | | - o3_asm.contains("_afs_array_add_i32"), |
| 282 | | - "vectorized O3 assembly should reference the bulk add kernel:\n{}", |
| 283 | | - o3_asm |
| 284 | | - ); |
| 299 | + if o3_kernel { |
| 300 | + assert!( |
| 301 | + o3_ir.matches("do_check_").count() < o2_ir.matches("do_check_").count(), |
| 302 | + "kernel-form O3 should replace the explicit DO with a single call:\n{}", |
| 303 | + o3_ir |
| 304 | + ); |
| 305 | + } |
| 306 | + if o3_kernel { |
| 307 | + assert!( |
| 308 | + o3_asm.contains("_afs_array_add_i32"), |
| 309 | + "kernel-form O3 assembly should reference the bulk add kernel:\n{}", |
| 310 | + o3_asm |
| 311 | + ); |
| 312 | + } else { |
| 313 | + assert!( |
| 314 | + o3_asm.contains("add.4s") || o3_asm.contains("ldr q") || o3_asm.contains("str q"), |
| 315 | + "neon-form O3 assembly should reference 128-bit vector ops:\n{}", |
| 316 | + o3_asm |
| 317 | + ); |
| 318 | + } |
| 285 | 319 | assert_eq!( |
| 286 | 320 | o3_obj_a, o3_obj_b, |
| 287 | 321 | "vectorized O3 object snapshot should stay deterministic" |