| 1 | use std::collections::BTreeSet; |
| 2 | use std::path::PathBuf; |
| 3 | |
| 4 | use armfortas::driver::OptLevel; |
| 5 | use armfortas::testing::{capture_from_path, CaptureRequest, CapturedStage, Stage}; |
| 6 | |
/// Resolve a named test fixture beneath `test_programs/`.
///
/// Panics with the full path when the fixture file is absent, so a missing
/// checkout shows up as a clear message instead of a downstream capture error.
fn fixture(name: &str) -> PathBuf {
    let candidate = std::path::Path::new("test_programs").join(name);
    assert!(
        candidate.exists(),
        "missing test fixture {}",
        candidate.display()
    );
    candidate
}
| 12 | |
| 13 | fn capture_text(request: CaptureRequest, stage: Stage) -> String { |
| 14 | let result = capture_from_path(&request).expect("capture should succeed"); |
| 15 | match result.get(stage) { |
| 16 | Some(CapturedStage::Text(text)) => text.clone(), |
| 17 | Some(CapturedStage::Run(_)) => panic!("expected text stage for {}", stage.as_str()), |
| 18 | None => panic!("missing requested stage {}", stage.as_str()), |
| 19 | } |
| 20 | } |
| 21 | |
/// Slice the ` func @name …\n }` section for one function out of rendered IR.
///
/// Panics if the header is missing or the section never closes with `\n }\n`.
fn function_section<'a>(ir: &'a str, name: &str) -> &'a str {
    let header = format!(" func @{}", name);
    let start = match ir.find(&header) {
        Some(idx) => idx,
        None => panic!("missing function section for {}", name),
    };
    let tail = &ir[start..];
    let closer = "\n }";
    match tail.find("\n }\n") {
        Some(end) => &tail[..end + closer.len()],
        None => panic!("unterminated function section for {}", name),
    }
}
| 33 | |
/// Collect every ` func @…` section that appears in rendered IR, in order.
///
/// Panics if any section fails to close with `\n }\n`.
fn function_sections(ir: &str) -> Vec<&str> {
    let mut sections = Vec::new();
    for (start, _) in ir.match_indices(" func @") {
        let tail = &ir[start..];
        let end = tail
            .find("\n }\n")
            .unwrap_or_else(|| panic!("unterminated function section in:\n{}", tail));
        sections.push(&tail[..end + "\n }".len()]);
    }
    sections
}
| 45 | |
/// Extract the function name from a section's `func @name(...)` header line.
///
/// The name ends at the first space or opening parenthesis; a header with
/// neither yields the whole remainder. Panics when the section is empty or
/// the header lacks the `func @` prefix.
///
/// The explicit `<'a>` lifetime was removed: with a single `&str` input the
/// elided form is equivalent and is what clippy's `needless_lifetimes`
/// expects.
fn function_name(func_section: &str) -> &str {
    let header = func_section.lines().next().expect("function header").trim();
    let rest = header
        .strip_prefix("func @")
        .expect("function header prefix");
    let end = rest
        .find(|ch: char| ch == ' ' || ch == '(')
        .unwrap_or(rest.len());
    &rest[..end]
}
| 56 | |
/// Count the comma-separated parameters declared in a section's header line.
///
/// Parses the text between `(` and `) ->`; an empty parameter list counts as
/// zero. Panics when the header does not contain that delimiter pair.
fn param_count(func_section: &str) -> usize {
    let header = func_section.lines().next().expect("function header");
    let params = header
        .split_once('(')
        .and_then(|(_, after_paren)| after_paren.split_once(") ->"))
        .expect("function header params")
        .0
        .trim();
    match params {
        "" => 0,
        list => list.split(", ").count(),
    }
}
| 70 | |
#[test]
fn o0_realworld_elemental_stage_proves_elemental_and_concurrent_lowering() {
    let source = fixture("realworld_elemental_stage.f90");

    // O0 keeps the lowered shapes intact, so the raw IR is all we need here.
    let request = CaptureRequest {
        opt_level: OptLevel::O0,
        requested: BTreeSet::from([Stage::Ir]),
        input: source,
    };
    let raw_ir = capture_text(request, Stage::Ir);

    let raw_sections = function_sections(&raw_ir);
    assert_eq!(
        raw_sections.len(),
        2,
        "raw IR should include the program body plus one scalar ELEMENTAL helper:\n{}",
        raw_ir
    );
    // The helper section follows the program body; grab its name to find the
    // per-element call site below.
    let scalar_body_name = function_name(raw_sections[1]);
    let scalar_call = format!("call @{}(", scalar_body_name);

    assert!(
        raw_ir.contains("doconc_check_"),
        "whole-array ELEMENTAL lowering should still synthesize a DO CONCURRENT loop:\n{}",
        raw_ir
    );
    assert!(
        raw_ir.contains(&scalar_call),
        "raw IR should still call the scalar ELEMENTAL body per element:\n{}",
        raw_ir
    );
    assert!(
        raw_ir.contains("call @afs_array_add_i32("),
        "the clean DO CONCURRENT combine should redirect through the bulk runtime kernel:\n{}",
        raw_ir
    );
}
| 108 | |
#[test]
fn o2_realworld_ipo_chain_trims_dead_arg_and_removes_trivial_wrapper() {
    let source = fixture("realworld_ipo_chain.f90");

    // Unoptimized IR: the full helper chain must still be materialized.
    let raw_ir = capture_text(
        CaptureRequest {
            input: source.clone(),
            requested: BTreeSet::from([Stage::Ir]),
            opt_level: OptLevel::O0,
        },
        Stage::Ir,
    );
    // O2 IR is requested alongside the object so the multi-stage request path
    // is exercised too.
    let opt_ir = capture_text(
        CaptureRequest {
            input: source.clone(),
            requested: BTreeSet::from([Stage::OptIr, Stage::Obj]),
            opt_level: OptLevel::O2,
        },
        Stage::OptIr,
    );
    // Two independent O2 object captures back the determinism check below.
    let obj_a = capture_text(
        CaptureRequest {
            input: source.clone(),
            requested: BTreeSet::from([Stage::Obj]),
            opt_level: OptLevel::O2,
        },
        Stage::Obj,
    );
    let obj_b = capture_text(
        CaptureRequest {
            input: source,
            requested: BTreeSet::from([Stage::Obj]),
            opt_level: OptLevel::O2,
        },
        Stage::Obj,
    );

    let raw_sections = function_sections(&raw_ir);
    assert_eq!(
        raw_sections.len(),
        5,
        "raw IR should still include accumulate, emit_value, passthrough, and mix_step helpers:\n{}",
        raw_ir
    );
    let raw_wrapper = raw_sections[3];
    let raw_wrapper_name = function_name(raw_wrapper);
    let raw_mix = raw_sections[4];
    let raw_mix_name = function_name(raw_mix);
    assert_eq!(
        param_count(raw_mix),
        3,
        "raw helper should keep the live arg, constant arg, and dead arg before IPO:\n{}",
        raw_mix
    );
    // assert_eq! for consistency with the other arity checks in this file
    // (and for a better diff on failure than a bare boolean assert).
    assert_eq!(
        param_count(raw_wrapper),
        1,
        "raw IR should still materialize the trivial wrapper helper:\n{}",
        raw_ir
    );

    // The dead-arg trim is only observable when the helper survives inlining;
    // if it was inlined away entirely, that is also an acceptable O2 outcome.
    if opt_ir.contains(&format!("func @{}", raw_mix_name)) {
        let opt_mix = function_section(&opt_ir, raw_mix_name);
        assert_eq!(
            param_count(opt_mix),
            2,
            "optimized helper should at least trim the dead dummy from the real-world helper chain:\n{}",
            opt_mix
        );
    }
    assert!(
        !opt_ir.contains(&format!("func @{}", raw_wrapper_name)),
        "optimized IR should remove the trivial wrapper helper:\n{}",
        opt_ir
    );
    assert_eq!(
        obj_a, obj_b,
        "IPO-audited O2 object snapshot should stay deterministic"
    );
}
| 188 | |
#[test]
fn o2_unrolls_realworld_small_do_concurrent_kernel() {
    let source = fixture("realworld_doconc_square.f90");

    // Baseline O0 capture: the lowered DO CONCURRENT block labels must exist.
    let raw_request = CaptureRequest {
        opt_level: OptLevel::O0,
        requested: BTreeSet::from([Stage::Ir]),
        input: source.clone(),
    };
    let raw_ir = capture_text(raw_request, Stage::Ir);

    // O2 capture of the optimized IR for the same fixture.
    let opt_request = CaptureRequest {
        opt_level: OptLevel::O2,
        requested: BTreeSet::from([Stage::OptIr]),
        input: source,
    };
    let opt_ir = capture_text(opt_request, Stage::OptIr);

    let raw_keeps_loop = raw_ir.contains("doconc_check_")
        && raw_ir.contains("doconc_body_")
        && raw_ir.contains("doconc_incr_");
    assert!(
        raw_keeps_loop,
        "raw IR should preserve the real-world DO CONCURRENT loop identity:\n{}",
        raw_ir
    );
    let opt_erases_loop = !opt_ir.contains("doconc_check_") && !opt_ir.contains("doconc_body_");
    assert!(
        opt_erases_loop,
        "O2 should exploit the small real-world DO CONCURRENT loop enough to erase the loop shape:\n{}",
        opt_ir
    );
}
| 223 | |
#[test]
fn o3_vectorizes_realworld_explicit_do_stage() {
    let source = fixture("realworld_vector_stage.f90");

    let o2_ir = capture_text(
        CaptureRequest {
            opt_level: OptLevel::O2,
            requested: BTreeSet::from([Stage::OptIr]),
            input: source.clone(),
        },
        Stage::OptIr,
    );
    // O3 IR is requested together with Asm/Obj so the multi-stage request
    // path is exercised as well.
    let o3_ir = capture_text(
        CaptureRequest {
            opt_level: OptLevel::O3,
            requested: BTreeSet::from([Stage::OptIr, Stage::Asm, Stage::Obj]),
            input: source.clone(),
        },
        Stage::OptIr,
    );
    let o3_asm = capture_text(
        CaptureRequest {
            opt_level: OptLevel::O3,
            requested: BTreeSet::from([Stage::Asm]),
            input: source.clone(),
        },
        Stage::Asm,
    );
    // Two independent O3 object captures back the determinism check below.
    let o3_obj_a = capture_text(
        CaptureRequest {
            opt_level: OptLevel::O3,
            requested: BTreeSet::from([Stage::Obj]),
            input: source.clone(),
        },
        Stage::Obj,
    );
    let o3_obj_b = capture_text(
        CaptureRequest {
            opt_level: OptLevel::O3,
            requested: BTreeSet::from([Stage::Obj]),
            input: source,
        },
        Stage::Obj,
    );

    let o2_still_scalar = o2_ir.matches("do_check_").count() >= 2
        && !o2_ir.contains("call @afs_array_add_i32(")
        && !o2_ir.contains("vadd");
    assert!(
        o2_still_scalar,
        "O2 should still keep the explicit scalar loop for this real-world stage:\n{}",
        o2_ir
    );
    // O3 vectorization may land in either of two valid forms:
    // * NeonVectorize rewrites the inner body to vload/vadd/vstore on
    //   128-bit lanes (preferred: no call overhead, fewer iterations, and
    //   the loop CFG is kept with vector ops inside the body).
    // * The older Vectorize pass redirects the whole loop to the bulk
    //   runtime kernel `afs_array_add_i32` (fallback for shapes the NEON
    //   pass does not yet handle), replacing the loop CFG with one call.
    // The load-bearing invariant is that the body stops being scalar
    // load/iadd/store, not which of the two rewrites fired.
    let o3_neon = o3_ir.contains("vstore") && o3_ir.contains("vadd");
    let o3_kernel = o3_ir.contains("call @afs_array_add_i32(");
    assert!(
        o3_kernel || o3_neon,
        "O3 should vectorize the real-world explicit DO loop (vload/vadd/vstore or bulk kernel call):\n{}",
        o3_ir
    );
    if o3_kernel {
        // Kernel form: the do_check chain collapses into a single call, so
        // the block count must drop relative to O2, and the assembly must
        // reference the runtime kernel symbol.
        assert!(
            o3_ir.matches("do_check_").count() < o2_ir.matches("do_check_").count(),
            "kernel-form O3 should replace the explicit DO with a single call:\n{}",
            o3_ir
        );
        assert!(
            o3_asm.contains("_afs_array_add_i32"),
            "kernel-form O3 assembly should reference the bulk add kernel:\n{}",
            o3_asm
        );
    } else {
        // NEON form: the loop CFG survives, so assert only that the emitted
        // assembly is vector-shaped (128-bit lane ops).
        assert!(
            o3_asm.contains("add.4s") || o3_asm.contains("ldr q") || o3_asm.contains("str q"),
            "neon-form O3 assembly should reference 128-bit vector ops:\n{}",
            o3_asm
        );
    }
    assert_eq!(
        o3_obj_a, o3_obj_b,
        "vectorized O3 object snapshot should stay deterministic"
    );
}
| 324 |