| 1 | //! Loop interchange pass. |
| 2 | //! |
| 3 | //! Swaps the iteration order of perfectly-nested loop pairs to improve |
| 4 | //! memory access patterns. Critical for Fortran because arrays are |
| 5 | //! column-major: `a(i, j)` is stored with `i` varying fastest. If the |
| 6 | //! inner loop iterates over `j` while `i` is outer, array accesses |
| 7 | //! stride by the column extent on each iteration — cache-hostile. |
| 8 | //! Interchanging makes `i` the inner loop, giving stride-1 access. |
| 9 | //! |
| 10 | //! ## Algorithm |
| 11 | //! |
| 12 | //! 1. Build loop tree, find perfectly-nested pairs. |
| 13 | //! 2. For each pair, detect counted-loop structure (header, IV, bounds). |
| 14 | //! 3. Analyze the body's GEP instructions to determine which IV is used |
| 15 | //! as the "fast" (first/leftmost) subscript. |
| 16 | //! 4. If the OUTER IV appears as the fast subscript, interchange is |
| 17 | //! profitable (and almost always legal for simple array assignments). |
| 18 | //! 5. Transform by swapping the branch arguments that carry init values |
| 19 | //! to each header, and swapping the bounds used in each comparison. |
| 20 | //! |
| 21 | //! ## Legality |
| 22 | //! |
| 23 | //! Conservative: only interchange when all array accesses in the body |
| 24 | //! use both IVs as simple direct subscripts with no cross-iteration |
| 25 | //! read-before-write. This avoids needing full dependence analysis. |
| 26 | |
| 27 | use super::loop_tree::build_loop_tree; |
| 28 | use super::pass::Pass; |
| 29 | use crate::ir::inst::*; |
| 30 | use crate::ir::walk::predecessors; |
| 31 | |
/// The loop-interchange pass object. Stateless: each `run` rebuilds the
/// loop tree and analyses from scratch (see module docs for the algorithm
/// and the conservative legality model).
pub struct LoopInterchange;
| 33 | |
| 34 | impl Pass for LoopInterchange { |
| 35 | fn name(&self) -> &'static str { |
| 36 | "loop-interchange" |
| 37 | } |
| 38 | |
| 39 | fn run(&self, module: &mut Module) -> bool { |
| 40 | let mut changed = false; |
| 41 | for func in &mut module.functions { |
| 42 | if interchange_in_function(func) { |
| 43 | changed = true; |
| 44 | } |
| 45 | } |
| 46 | changed |
| 47 | } |
| 48 | } |
| 49 | |
| 50 | fn interchange_in_function(func: &mut Function) -> bool { |
| 51 | let tree = build_loop_tree(func); |
| 52 | let preds = predecessors(func); |
| 53 | let pairs = tree.perfectly_nested_pairs(func); |
| 54 | |
| 55 | for (outer_id, inner_id) in &pairs { |
| 56 | let outer = tree.node(*outer_id); |
| 57 | let inner = tree.node(*inner_id); |
| 58 | |
| 59 | // Both loops must have a recognized counted-loop structure: |
| 60 | // header(%iv) → cmp_block(icmp, condBr) → body → latch(iadd, br header) |
| 61 | let Some(outer_shape) = detect_loop_shape(func, outer, &preds) else { |
| 62 | continue; |
| 63 | }; |
| 64 | let Some(inner_shape) = detect_loop_shape(func, inner, &preds) else { |
| 65 | continue; |
| 66 | }; |
| 67 | |
| 68 | // Check profitability: is the outer IV used as the first (fast) |
| 69 | // subscript of a multi-dimensional array GEP? |
| 70 | if !should_interchange(func, inner, outer_shape.iv, inner_shape.iv) { |
| 71 | continue; |
| 72 | } |
| 73 | |
| 74 | // Check legality: conservative — only if the body has no |
| 75 | // loop-carried dependencies that would change semantics. |
| 76 | if !is_interchange_legal(func, inner, outer_shape.iv, inner_shape.iv) { |
| 77 | continue; |
| 78 | } |
| 79 | |
| 80 | // Perform the interchange by swapping the loop bounds and inits. |
| 81 | do_interchange(func, &outer_shape, &inner_shape); |
| 82 | return true; // one at a time |
| 83 | } |
| 84 | false |
| 85 | } |
| 86 | |
/// Minimal loop shape for interchange.
///
/// Captures just enough of a counted loop's structure for
/// `do_interchange` to swap bounds and init values between two loops.
struct LoopShape {
    header: BlockId,      // relay block carrying the IV as its single param
    cmp_block: BlockId,   // block holding the icmp against `bound`
    iv: ValueId, // block param on header
    bound: ValueId, // the upper-bound value in the comparison
    latch: BlockId,       // sole back-edge source block
    /// The value passed to the header from the preheader (initial IV).
    init_arg_idx: usize, // index in preheader's branch args
}
| 97 | |
| 98 | fn detect_loop_shape( |
| 99 | func: &Function, |
| 100 | node: &super::loop_tree::LoopTreeNode, |
| 101 | _preds: &std::collections::HashMap<BlockId, Vec<BlockId>>, |
| 102 | ) -> Option<LoopShape> { |
| 103 | let header = node.header; |
| 104 | let hdr = func.block(header); |
| 105 | |
| 106 | // Header must have exactly 1 block param (the IV). |
| 107 | if hdr.params.len() != 1 { |
| 108 | return None; |
| 109 | } |
| 110 | let iv = hdr.params[0].id; |
| 111 | |
| 112 | // Header must be a relay (0 instructions, branch to cmp_block). |
| 113 | if !hdr.insts.is_empty() { |
| 114 | return None; |
| 115 | } |
| 116 | let cmp_block = match &hdr.terminator { |
| 117 | Some(Terminator::Branch(t, args)) if args.is_empty() => *t, |
| 118 | _ => return None, |
| 119 | }; |
| 120 | if !node.body.contains(&cmp_block) { |
| 121 | return None; |
| 122 | } |
| 123 | |
| 124 | // Cmp block must have icmp + condBr. |
| 125 | let cmp_blk = func.block(cmp_block); |
| 126 | let bound = { |
| 127 | let mut found_bound = None; |
| 128 | for inst in &cmp_blk.insts { |
| 129 | if let InstKind::ICmp(_, a, b) = &inst.kind { |
| 130 | // One operand should be the IV, the other is the bound. |
| 131 | if *a == iv { |
| 132 | found_bound = Some(*b); |
| 133 | } else if *b == iv { |
| 134 | found_bound = Some(*a); |
| 135 | } |
| 136 | } |
| 137 | } |
| 138 | found_bound? |
| 139 | }; |
| 140 | |
| 141 | // Find the single latch. |
| 142 | if node.latches.len() != 1 { |
| 143 | return None; |
| 144 | } |
| 145 | let latch = node.latches[0]; |
| 146 | |
| 147 | Some(LoopShape { |
| 148 | header, |
| 149 | cmp_block, |
| 150 | iv, |
| 151 | bound, |
| 152 | latch, |
| 153 | init_arg_idx: 0, // always first param |
| 154 | }) |
| 155 | } |
| 156 | |
| 157 | /// Check if interchanging would improve memory access patterns. |
| 158 | /// |
| 159 | /// Returns true if the OUTER IV appears as the "fast-varying" (first) |
| 160 | /// subscript in a multi-dimensional array GEP. In column-major Fortran, |
| 161 | /// the first subscript should be the inner loop's IV for stride-1 access. |
| 162 | fn should_interchange( |
| 163 | func: &Function, |
| 164 | inner_loop: &super::loop_tree::LoopTreeNode, |
| 165 | outer_iv: ValueId, |
| 166 | inner_iv: ValueId, |
| 167 | ) -> bool { |
| 168 | // Scan the inner loop body for GEP instructions that use both IVs. |
| 169 | for &bid in &inner_loop.body { |
| 170 | let block = func.block(bid); |
| 171 | for inst in &block.insts { |
| 172 | if let InstKind::GetElementPtr(_, indices) = &inst.kind { |
| 173 | // We're looking for a flat-offset GEP where the offset |
| 174 | // is computed as: (outer_iv - lo) + (inner_iv - lo) * stride |
| 175 | // or equivalently: fast_part + slow_part * stride |
| 176 | // |
| 177 | // In column-major, the first addend (non-multiplied part) |
| 178 | // is the "fast" subscript. If the outer IV contributes to |
| 179 | // the non-multiplied addend, interchange is profitable. |
| 180 | if let Some(offset_val) = indices.first() { |
| 181 | if uses_iv_in_fast_position(func, *offset_val, outer_iv, inner_iv) { |
| 182 | return true; |
| 183 | } |
| 184 | } |
| 185 | } |
| 186 | } |
| 187 | } |
| 188 | false |
| 189 | } |
| 190 | |
| 191 | /// Check if the flat offset computation has the outer IV in the |
| 192 | /// non-multiplied (fast) position. The lowered pattern is: |
| 193 | /// %fast = isub %outer_iv, %lo |
| 194 | /// %slow_raw = isub %inner_iv, %lo |
| 195 | /// %slow = imul %slow_raw, %stride |
| 196 | /// %offset = iadd %fast, %slow |
| 197 | /// |
| 198 | /// We trace back from the GEP index to find if outer_iv feeds the |
| 199 | /// non-multiplied side of the final iadd. |
| 200 | fn uses_iv_in_fast_position( |
| 201 | func: &Function, |
| 202 | offset: ValueId, |
| 203 | outer_iv: ValueId, |
| 204 | _inner_iv: ValueId, |
| 205 | ) -> bool { |
| 206 | // Find the instruction that produces `offset`. |
| 207 | let Some(inst) = find_inst(func, offset) else { |
| 208 | return false; |
| 209 | }; |
| 210 | |
| 211 | // The offset should be an iadd of two parts. |
| 212 | let (a, b) = match &inst.kind { |
| 213 | InstKind::IAdd(a, b) => (*a, *b), |
| 214 | _ => return false, |
| 215 | }; |
| 216 | |
| 217 | // One side should be the fast part (derived from outer_iv without |
| 218 | // multiplication), the other should be the slow part (involves imul). |
| 219 | let a_uses_mul = trace_involves_mul(func, a); |
| 220 | let b_uses_mul = trace_involves_mul(func, b); |
| 221 | |
| 222 | // The fast part is the one WITHOUT multiplication. |
| 223 | let fast_part = if !a_uses_mul && b_uses_mul { |
| 224 | a |
| 225 | } else if a_uses_mul && !b_uses_mul { |
| 226 | b |
| 227 | } else { |
| 228 | return false; |
| 229 | }; |
| 230 | |
| 231 | // Does the fast part trace back to the outer IV? |
| 232 | traces_to_iv(func, fast_part, outer_iv) |
| 233 | } |
| 234 | |
| 235 | /// Check if a value's computation involves an IMul somewhere. |
| 236 | fn trace_involves_mul(func: &Function, val: ValueId) -> bool { |
| 237 | let Some(inst) = find_inst(func, val) else { |
| 238 | return false; |
| 239 | }; |
| 240 | match &inst.kind { |
| 241 | InstKind::IMul(..) => true, |
| 242 | InstKind::IAdd(a, b) | InstKind::ISub(a, b) => { |
| 243 | trace_involves_mul(func, *a) || trace_involves_mul(func, *b) |
| 244 | } |
| 245 | InstKind::IntExtend(a, _, _) => trace_involves_mul(func, *a), |
| 246 | _ => false, |
| 247 | } |
| 248 | } |
| 249 | |
| 250 | /// Check if a value traces back to a specific IV (through isub, int_extend). |
| 251 | fn traces_to_iv(func: &Function, val: ValueId, iv: ValueId) -> bool { |
| 252 | if val == iv { |
| 253 | return true; |
| 254 | } |
| 255 | let Some(inst) = find_inst(func, val) else { |
| 256 | return false; |
| 257 | }; |
| 258 | match &inst.kind { |
| 259 | InstKind::ISub(a, _) => traces_to_iv(func, *a, iv), |
| 260 | InstKind::IntExtend(a, _, _) => traces_to_iv(func, *a, iv), |
| 261 | _ => false, |
| 262 | } |
| 263 | } |
| 264 | |
| 265 | /// Find the instruction that defines a value. |
| 266 | fn find_inst(func: &Function, vid: ValueId) -> Option<&Inst> { |
| 267 | for block in &func.blocks { |
| 268 | for inst in &block.insts { |
| 269 | if inst.id == vid { |
| 270 | return Some(inst); |
| 271 | } |
| 272 | } |
| 273 | } |
| 274 | None |
| 275 | } |
| 276 | |
/// Legality check using dependence analysis. Verifies that swapping
/// the loop order does not reverse any dependence direction.
///
/// Thin delegation to the shared dependence-analysis module; the body
/// blocks of the inner loop are the ones inspected (with perfect nesting,
/// all array accesses live there).
fn is_interchange_legal(
    func: &Function,
    inner_loop: &super::loop_tree::LoopTreeNode,
    outer_iv: ValueId,
    inner_iv: ValueId,
) -> bool {
    super::dep_analysis::interchange_legal(func, &inner_loop.body, outer_iv, inner_iv)
}
| 287 | |
| 288 | /// Trace through a GEP chain to find the base array pointer. |
| 289 | fn trace_gep_base(func: &Function, ptr: ValueId) -> Option<ValueId> { |
| 290 | let Some(inst) = find_inst(func, ptr) else { |
| 291 | return Some(ptr); |
| 292 | }; |
| 293 | match &inst.kind { |
| 294 | InstKind::GetElementPtr(base, _) => Some(*base), |
| 295 | _ => Some(ptr), |
| 296 | } |
| 297 | } |
| 298 | |
| 299 | /// Perform the actual interchange transformation. |
| 300 | /// |
| 301 | /// Strategy: swap the init and bound values between the two loops. |
| 302 | /// After swapping, what was the outer loop iterates over the inner |
| 303 | /// range and vice versa. |
| 304 | fn do_interchange(func: &mut Function, outer: &LoopShape, inner: &LoopShape) { |
| 305 | // Find the preheader of the outer loop (it branches to outer.header |
| 306 | // with the outer IV init value). |
| 307 | let outer_preheader = { |
| 308 | let mut ph = None; |
| 309 | for block in &func.blocks { |
| 310 | if let Some(Terminator::Branch(dest, _)) = &block.terminator { |
| 311 | if *dest == outer.header && block.id != outer.latch { |
| 312 | ph = Some(block.id); |
| 313 | break; |
| 314 | } |
| 315 | } |
| 316 | if let Some(Terminator::CondBranch { |
| 317 | true_dest, |
| 318 | false_dest, |
| 319 | .. |
| 320 | }) = &block.terminator |
| 321 | { |
| 322 | if *true_dest == outer.header || *false_dest == outer.header { |
| 323 | // Could be a condBr preheader from preheader insertion |
| 324 | // but we need the unconditional one. |
| 325 | } |
| 326 | } |
| 327 | } |
| 328 | ph |
| 329 | }; |
| 330 | let Some(outer_ph) = outer_preheader else { |
| 331 | return; |
| 332 | }; |
| 333 | |
| 334 | // Find the block that branches to the inner header with the inner |
| 335 | // IV init value (this is the outer loop's "body entry" block). |
| 336 | let inner_entry = { |
| 337 | let mut ie = None; |
| 338 | for block in &func.blocks { |
| 339 | if let Some(Terminator::Branch(dest, args)) = &block.terminator { |
| 340 | if *dest == inner.header && !args.is_empty() && block.id != inner.latch { |
| 341 | ie = Some(block.id); |
| 342 | break; |
| 343 | } |
| 344 | } |
| 345 | if let Some(Terminator::CondBranch { |
| 346 | true_dest, |
| 347 | true_args, |
| 348 | .. |
| 349 | }) = &block.terminator |
| 350 | { |
| 351 | if *true_dest == inner.header && !true_args.is_empty() { |
| 352 | ie = Some(block.id); |
| 353 | break; |
| 354 | } |
| 355 | } |
| 356 | } |
| 357 | ie |
| 358 | }; |
| 359 | let Some(inner_entry_block) = inner_entry else { |
| 360 | return; |
| 361 | }; |
| 362 | |
| 363 | // Get current init values. |
| 364 | let outer_init = get_branch_arg_to(func, outer_ph, outer.header, 0); |
| 365 | let inner_init = get_branch_arg_to(func, inner_entry_block, inner.header, 0); |
| 366 | let Some(outer_init_val) = outer_init else { |
| 367 | return; |
| 368 | }; |
| 369 | let Some(inner_init_val) = inner_init else { |
| 370 | return; |
| 371 | }; |
| 372 | |
| 373 | // Swap init values: outer preheader now passes inner's init to |
| 374 | // outer's header, and inner entry now passes outer's init to |
| 375 | // inner's header. |
| 376 | set_branch_arg_to(func, outer_ph, outer.header, 0, inner_init_val); |
| 377 | set_branch_arg_to(func, inner_entry_block, inner.header, 0, outer_init_val); |
| 378 | |
| 379 | // Swap the bounds in the comparison blocks. |
| 380 | swap_bound(func, outer.cmp_block, outer.iv, outer.bound, inner.bound); |
| 381 | swap_bound(func, inner.cmp_block, inner.iv, inner.bound, outer.bound); |
| 382 | } |
| 383 | |
| 384 | /// Get the Nth branch argument passed to a target block. |
| 385 | fn get_branch_arg_to(func: &Function, from: BlockId, to: BlockId, idx: usize) -> Option<ValueId> { |
| 386 | let block = func.block(from); |
| 387 | match &block.terminator { |
| 388 | Some(Terminator::Branch(dest, args)) if *dest == to => args.get(idx).copied(), |
| 389 | Some(Terminator::CondBranch { |
| 390 | true_dest, |
| 391 | true_args, |
| 392 | false_dest, |
| 393 | false_args, |
| 394 | .. |
| 395 | }) => { |
| 396 | if *true_dest == to { |
| 397 | true_args.get(idx).copied() |
| 398 | } else if *false_dest == to { |
| 399 | false_args.get(idx).copied() |
| 400 | } else { |
| 401 | None |
| 402 | } |
| 403 | } |
| 404 | _ => None, |
| 405 | } |
| 406 | } |
| 407 | |
| 408 | /// Set the Nth branch argument passed to a target block. |
| 409 | fn set_branch_arg_to(func: &mut Function, from: BlockId, to: BlockId, idx: usize, val: ValueId) { |
| 410 | let block = func.block_mut(from); |
| 411 | match &mut block.terminator { |
| 412 | Some(Terminator::Branch(dest, args)) if *dest == to && idx < args.len() => { |
| 413 | args[idx] = val; |
| 414 | } |
| 415 | Some(Terminator::CondBranch { |
| 416 | true_dest, |
| 417 | true_args, |
| 418 | false_dest, |
| 419 | false_args, |
| 420 | .. |
| 421 | }) => { |
| 422 | if *true_dest == to && idx < true_args.len() { |
| 423 | true_args[idx] = val; |
| 424 | } else if *false_dest == to && idx < false_args.len() { |
| 425 | false_args[idx] = val; |
| 426 | } |
| 427 | } |
| 428 | _ => {} |
| 429 | } |
| 430 | } |
| 431 | |
| 432 | /// Swap the bound value in a comparison block's ICmp instruction. |
| 433 | fn swap_bound( |
| 434 | func: &mut Function, |
| 435 | cmp_block: BlockId, |
| 436 | iv: ValueId, |
| 437 | old_bound: ValueId, |
| 438 | new_bound: ValueId, |
| 439 | ) { |
| 440 | let block = func.block_mut(cmp_block); |
| 441 | for inst in &mut block.insts { |
| 442 | if let InstKind::ICmp(_op, a, b) = &mut inst.kind { |
| 443 | if *a == iv && *b == old_bound { |
| 444 | *b = new_bound; |
| 445 | return; |
| 446 | } else if *b == iv && *a == old_bound { |
| 447 | *a = new_bound; |
| 448 | return; |
| 449 | } |
| 450 | } |
| 451 | } |
| 452 | } |
| 453 | |
| 454 | // --------------------------------------------------------------------------- |
| 455 | // Tests |
| 456 | // --------------------------------------------------------------------------- |
| 457 | |
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ir::types::IrType;
    use crate::opt::pass::Pass;

    // The previous `span()` helper and its `crate::lexer::{Position, Span}`
    // import were unused, producing dead-code warnings under `cargo test`;
    // both have been removed.

    #[test]
    fn interchange_pass_builds() {
        // Smoke test: the pass can be constructed and run on a module
        // containing one trivial (loop-free) function.
        let mut m = Module::new("test".into());
        let mut f = Function::new("test".into(), vec![], IrType::Void);
        f.block_mut(f.entry).terminator = Some(Terminator::Return(None));
        m.add_function(f);
        let pass = LoopInterchange;
        let changed = pass.run(&mut m);
        assert!(!changed, "no loops → no interchange");
    }
}
| 486 |