| 1 | use std::collections::BTreeSet; |
| 2 | use std::path::PathBuf; |
| 3 | |
| 4 | use armfortas::driver::OptLevel; |
| 5 | use armfortas::testing::{capture_from_path, CaptureRequest, CapturedStage, Stage}; |
| 6 | |
/// Resolve a named test fixture beneath `test_programs/`.
///
/// Panics with the full path when the fixture file is absent, so a missing
/// checkout shows up as a clear message instead of a downstream capture error.
fn fixture(name: &str) -> PathBuf {
    let candidate = std::path::Path::new("test_programs").join(name);
    assert!(
        candidate.exists(),
        "missing test fixture {}",
        candidate.display()
    );
    candidate
}
| 12 | |
| 13 | fn capture_text(request: CaptureRequest, stage: Stage) -> String { |
| 14 | let result = capture_from_path(&request).expect("capture should succeed"); |
| 15 | match result.get(stage) { |
| 16 | Some(CapturedStage::Text(text)) => text.clone(), |
| 17 | Some(CapturedStage::Run(_)) => panic!("expected text stage for {}", stage.as_str()), |
| 18 | None => panic!("missing requested stage {}", stage.as_str()), |
| 19 | } |
| 20 | } |
| 21 | |
/// Slice the ` func @name …\n }` section for one function out of rendered IR.
///
/// Panics if the header is missing or the section never closes with `\n }\n`.
fn function_section<'a>(ir: &'a str, name: &str) -> &'a str {
    let header = format!(" func @{}", name);
    let start = match ir.find(&header) {
        Some(idx) => idx,
        None => panic!("missing function section for {}", name),
    };
    let tail = &ir[start..];
    let closer = "\n }";
    match tail.find("\n }\n") {
        Some(end) => &tail[..end + closer.len()],
        None => panic!("unterminated function section for {}", name),
    }
}
| 33 | |
/// Collect every ` func @…` section that appears in rendered IR, in order.
///
/// Panics if any section fails to close with `\n }\n`.
fn function_sections(ir: &str) -> Vec<&str> {
    let mut sections = Vec::new();
    for (start, _) in ir.match_indices(" func @") {
        let tail = &ir[start..];
        let end = tail
            .find("\n }\n")
            .unwrap_or_else(|| panic!("unterminated function section in:\n{}", tail));
        sections.push(&tail[..end + "\n }".len()]);
    }
    sections
}
| 45 | |
/// Extract the function name from a section's `func @name(...)` header line.
///
/// The name ends at the first space or opening parenthesis; a header with
/// neither yields the whole remainder. Panics when the section is empty or
/// the header lacks the `func @` prefix.
///
/// The explicit `<'a>` lifetime was removed: with a single `&str` input the
/// elided form is equivalent and is what clippy's `needless_lifetimes`
/// expects.
fn function_name(func_section: &str) -> &str {
    let header = func_section.lines().next().expect("function header").trim();
    let rest = header
        .strip_prefix("func @")
        .expect("function header prefix");
    let end = rest
        .find(|ch: char| ch == ' ' || ch == '(')
        .unwrap_or(rest.len());
    &rest[..end]
}
| 56 | |
/// Count the comma-separated parameters declared in a section's header line.
///
/// Parses the text between `(` and `) ->`; an empty parameter list counts as
/// zero. Panics when the header does not contain that delimiter pair.
fn param_count(func_section: &str) -> usize {
    let header = func_section.lines().next().expect("function header");
    let params = header
        .split_once('(')
        .and_then(|(_, after_paren)| after_paren.split_once(") ->"))
        .expect("function header params")
        .0
        .trim();
    match params {
        "" => 0,
        list => list.split(", ").count(),
    }
}
| 70 | |
#[test]
fn o0_realworld_elemental_stage_proves_elemental_and_concurrent_lowering() {
    let source = fixture("realworld_elemental_stage.f90");

    // O0 keeps the lowered shapes intact, so the raw IR is all we need here.
    let request = CaptureRequest {
        opt_level: OptLevel::O0,
        requested: BTreeSet::from([Stage::Ir]),
        input: source,
    };
    let raw_ir = capture_text(request, Stage::Ir);

    let raw_sections = function_sections(&raw_ir);
    assert_eq!(
        raw_sections.len(),
        2,
        "raw IR should include the program body plus one scalar ELEMENTAL helper:\n{}",
        raw_ir
    );
    // The helper section follows the program body; grab its name to find the
    // per-element call site below.
    let scalar_body_name = function_name(raw_sections[1]);
    let scalar_call = format!("call @{}(", scalar_body_name);

    assert!(
        raw_ir.contains("doconc_check_"),
        "whole-array ELEMENTAL lowering should still synthesize a DO CONCURRENT loop:\n{}",
        raw_ir
    );
    assert!(
        raw_ir.contains(&scalar_call),
        "raw IR should still call the scalar ELEMENTAL body per element:\n{}",
        raw_ir
    );
    assert!(
        raw_ir.contains("call @afs_array_add_i32("),
        "the clean DO CONCURRENT combine should redirect through the bulk runtime kernel:\n{}",
        raw_ir
    );
}
| 108 | |
#[test]
fn o2_realworld_ipo_chain_trims_dead_arg_and_removes_trivial_wrapper() {
    let source = fixture("realworld_ipo_chain.f90");

    // Unoptimized IR: the full helper chain must still be materialized.
    let raw_ir = capture_text(
        CaptureRequest {
            input: source.clone(),
            requested: BTreeSet::from([Stage::Ir]),
            opt_level: OptLevel::O0,
        },
        Stage::Ir,
    );
    // O2 IR is requested alongside the object so the multi-stage request path
    // is exercised too.
    let opt_ir = capture_text(
        CaptureRequest {
            input: source.clone(),
            requested: BTreeSet::from([Stage::OptIr, Stage::Obj]),
            opt_level: OptLevel::O2,
        },
        Stage::OptIr,
    );
    // Two independent O2 object captures back the determinism check below.
    let obj_a = capture_text(
        CaptureRequest {
            input: source.clone(),
            requested: BTreeSet::from([Stage::Obj]),
            opt_level: OptLevel::O2,
        },
        Stage::Obj,
    );
    let obj_b = capture_text(
        CaptureRequest {
            input: source,
            requested: BTreeSet::from([Stage::Obj]),
            opt_level: OptLevel::O2,
        },
        Stage::Obj,
    );

    let raw_sections = function_sections(&raw_ir);
    assert_eq!(
        raw_sections.len(),
        5,
        "raw IR should still include accumulate, emit_value, passthrough, and mix_step helpers:\n{}",
        raw_ir
    );
    let raw_wrapper = raw_sections[3];
    let raw_wrapper_name = function_name(raw_wrapper);
    let raw_mix = raw_sections[4];
    let raw_mix_name = function_name(raw_mix);
    assert_eq!(
        param_count(raw_mix),
        3,
        "raw helper should keep the live arg, constant arg, and dead arg before IPO:\n{}",
        raw_mix
    );
    // assert_eq! for consistency with the other arity checks in this file
    // (and for a better diff on failure than a bare boolean assert).
    assert_eq!(
        param_count(raw_wrapper),
        1,
        "raw IR should still materialize the trivial wrapper helper:\n{}",
        raw_ir
    );

    // The dead-arg trim is only observable when the helper survives inlining;
    // if it was inlined away entirely, that is also an acceptable O2 outcome.
    if opt_ir.contains(&format!("func @{}", raw_mix_name)) {
        let opt_mix = function_section(&opt_ir, raw_mix_name);
        assert_eq!(
            param_count(opt_mix),
            2,
            "optimized helper should at least trim the dead dummy from the real-world helper chain:\n{}",
            opt_mix
        );
    }
    assert!(
        !opt_ir.contains(&format!("func @{}", raw_wrapper_name)),
        "optimized IR should remove the trivial wrapper helper:\n{}",
        opt_ir
    );
    assert_eq!(
        obj_a, obj_b,
        "IPO-audited O2 object snapshot should stay deterministic"
    );
}
| 188 | |
#[test]
fn o2_unrolls_realworld_small_do_concurrent_kernel() {
    let source = fixture("realworld_doconc_square.f90");

    // Baseline O0 capture: the lowered DO CONCURRENT block labels must exist.
    let raw_request = CaptureRequest {
        opt_level: OptLevel::O0,
        requested: BTreeSet::from([Stage::Ir]),
        input: source.clone(),
    };
    let raw_ir = capture_text(raw_request, Stage::Ir);

    // O2 capture of the optimized IR for the same fixture.
    let opt_request = CaptureRequest {
        opt_level: OptLevel::O2,
        requested: BTreeSet::from([Stage::OptIr]),
        input: source,
    };
    let opt_ir = capture_text(opt_request, Stage::OptIr);

    let raw_keeps_loop = raw_ir.contains("doconc_check_")
        && raw_ir.contains("doconc_body_")
        && raw_ir.contains("doconc_incr_");
    assert!(
        raw_keeps_loop,
        "raw IR should preserve the real-world DO CONCURRENT loop identity:\n{}",
        raw_ir
    );
    let opt_erases_loop = !opt_ir.contains("doconc_check_") && !opt_ir.contains("doconc_body_");
    assert!(
        opt_erases_loop,
        "O2 should exploit the small real-world DO CONCURRENT loop enough to erase the loop shape:\n{}",
        opt_ir
    );
}
| 223 | |
#[test]
fn o3_vectorizes_realworld_explicit_do_stage() {
    let source = fixture("realworld_vector_stage.f90");

    let o2_ir = capture_text(
        CaptureRequest {
            opt_level: OptLevel::O2,
            requested: BTreeSet::from([Stage::OptIr]),
            input: source.clone(),
        },
        Stage::OptIr,
    );
    // O3 IR is requested together with Asm/Obj so the multi-stage request
    // path is exercised as well.
    let o3_ir = capture_text(
        CaptureRequest {
            opt_level: OptLevel::O3,
            requested: BTreeSet::from([Stage::OptIr, Stage::Asm, Stage::Obj]),
            input: source.clone(),
        },
        Stage::OptIr,
    );
    let o3_asm = capture_text(
        CaptureRequest {
            opt_level: OptLevel::O3,
            requested: BTreeSet::from([Stage::Asm]),
            input: source.clone(),
        },
        Stage::Asm,
    );
    // Two independent O3 object captures back the determinism check below.
    let o3_obj_a = capture_text(
        CaptureRequest {
            opt_level: OptLevel::O3,
            requested: BTreeSet::from([Stage::Obj]),
            input: source.clone(),
        },
        Stage::Obj,
    );
    let o3_obj_b = capture_text(
        CaptureRequest {
            opt_level: OptLevel::O3,
            requested: BTreeSet::from([Stage::Obj]),
            input: source,
        },
        Stage::Obj,
    );

    let o2_still_scalar = o2_ir.matches("do_check_").count() >= 2
        && !o2_ir.contains("call @afs_array_add_i32(")
        && !o2_ir.contains("vadd");
    assert!(
        o2_still_scalar,
        "O2 should still keep the explicit scalar loop for this real-world stage:\n{}",
        o2_ir
    );
    // O3 vectorization may land in either of two valid forms:
    // * NeonVectorize rewrites the inner body to vload/vadd/vstore on
    //   128-bit lanes (preferred: no call overhead, fewer iterations, and
    //   the loop CFG is kept with vector ops inside the body).
    // * The older Vectorize pass redirects the whole loop to the bulk
    //   runtime kernel `afs_array_add_i32` (fallback for shapes the NEON
    //   pass does not yet handle), replacing the loop CFG with one call.
    // The load-bearing invariant is that the body stops being scalar
    // load/iadd/store, not which of the two rewrites fired.
    let o3_neon = o3_ir.contains("vstore") && o3_ir.contains("vadd");
    let o3_kernel = o3_ir.contains("call @afs_array_add_i32(");
    assert!(
        o3_kernel || o3_neon,
        "O3 should vectorize the real-world explicit DO loop (vload/vadd/vstore or bulk kernel call):\n{}",
        o3_ir
    );
    if o3_kernel {
        // Kernel form: the do_check chain collapses into a single call, so
        // the block count must drop relative to O2, and the assembly must
        // reference the runtime kernel symbol.
        assert!(
            o3_ir.matches("do_check_").count() < o2_ir.matches("do_check_").count(),
            "kernel-form O3 should replace the explicit DO with a single call:\n{}",
            o3_ir
        );
        assert!(
            o3_asm.contains("_afs_array_add_i32"),
            "kernel-form O3 assembly should reference the bulk add kernel:\n{}",
            o3_asm
        );
    } else {
        // NEON form: the loop CFG survives, so assert only that the emitted
        // assembly is vector-shaped (128-bit lane ops).
        assert!(
            o3_asm.contains("add.4s") || o3_asm.contains("ldr q") || o3_asm.contains("str q"),
            "neon-form O3 assembly should reference 128-bit vector ops:\n{}",
            o3_asm
        );
    }
    assert_eq!(
        o3_obj_a, o3_obj_b,
        "vectorized O3 object snapshot should stay deterministic"
    );
}
| 324 |