`b9cfaba`

Use canonical column-major strides in allocatable assignment dest

Authored by

espadonne 3 days ago

Committed by mfwolffe

SHA: b9cfaba3d23e4b9661920b385bce4efe3be7bd7a
Parents: 5b3eccb
Tree: 889072e

1 changed file

Status	File	+	-
M	`runtime/src/array.rs`	135	19

runtime/src/array.rs (modified)

              dest.flags &= !DESC_ALLOCATED;
+         }
 -        // Allocate with source's shape.
 +        // Allocate with source's shape, but compute canonical
 +        // column-major strides (1, ext_0, ext_0*ext_1, ...) — the
 +        // dest is freshly contiguous, so per-dim memory step must
 +        // match Fortran's column-major convention used by
 +        // afs_create_section / load_rank1_array_desc_elem. Setting
 +        // stride=1 across the board collapsed dim_1+ accesses onto
 +        // the dim_0 axis (e.g. allocatable A = transpose(reshape(...))
 +        // produced descriptor with stride=(1,1) and any subsequent
 +        // assumed-shape pass read overlapping bytes per "column").
          dest.rank = source.rank;
          dest.elem_size = source.elem_size;
 +        let mut running_stride: i64 = 1;
          for i in 0..source.rank as usize {
              dest.dims[i] = source.dims[i];
 -            dest.dims[i].stride = 1; // dest is always contiguous
 +            dest.dims[i].stride = running_stride;
 +            running_stride = running_stride.saturating_mul(source.dims[i].extent().max(1));
+         }
          let bytes = dest.total_bytes();
          dest.flags = DESC_ALLOCATED | DESC_CONTIGUOUS;
+     }
 -    // Copy data. Use ptr::copy (not copy_nonoverlapping) to handle self-assignment.
 +    // Copy data. Source may be non-contiguous (e.g. result of
 +    // transpose() returns a descriptor with reversed dim strides
 +    // pointing at the original buffer). A flat ptr::copy of
 +    // total_bytes from source.base_addr would drag adjacent bytes
 +    // forward without honoring per-dim strides — the same class of
 +    // bug as the original afs_copy_array_data flat copy. Detect
 +    // non-contiguous and walk every multi-index column-major.
 +    //
 +    // We only treat the source as non-contiguous when at least one
 +    // dim's stride is *strictly greater* than its canonical
 +    // column-major step. Strides smaller than canonical (e.g.
 +    // afs_matmul's 2x2 result emitted with stride=(1,1) instead of
 +    // (1,2)) describe an internally inconsistent descriptor whose
 +    // base_addr still points at a flat contiguous buffer; walking
 +    // those would re-read the same byte offset twice and drop the
 +    // last element. The conservative choice is the flat copy that
 +    // mirrors total_bytes — which the previous unconditional ptr::copy
 +    // did silently for both kinds of source.
      let bytes = source.total_bytes();
      if bytes > 0 && !source.base_addr.is_null() && !dest.base_addr.is_null() {
 -        unsafe {
 -            ptr::copy(source.base_addr, dest.base_addr, bytes as usize);
 +        let elem_size = source.elem_size;
 +        let mut canonical: i64 = 1;
 +        let mut strided = false;
 +        for i in 0..source.rank as usize {
 +            if source.dims[i].stride > canonical {
 +                strided = true;
 +                break;
 +            }
 +            canonical = canonical.saturating_mul(source.dims[i].extent().max(1));
 +        }
 +        if !strided {
 +            unsafe {
 +                ptr::copy(source.base_addr, dest.base_addr, bytes as usize);
 +            }
 +        } else {
 +            let rank = source.rank as usize;
 +            let extents: Vec<i64> = (0..rank).map(|i| source.dims[i].extent()).collect();
 +            let strides: Vec<i64> = (0..rank).map(|i| source.dims[i].stride).collect();
 +            let mut idx = vec![0i64; rank];
 +            let total = source.total_elements();
 +            for k in 0..total {
 +                let mut src_off: i64 = 0;
 +                for d in 0..rank {
 +                    src_off += idx[d] * strides[d];
 +                }
 +                src_off *= elem_size;
 +                let dst_off = k * elem_size;
 +                unsafe {
 +                    ptr::copy_nonoverlapping(
 +                        source.base_addr.offset(src_off as isize),
 +                        dest.base_addr.offset(dst_off as isize),
 +                        elem_size as usize,
 +                    );
 +                }
 +                for d in 0..rank {
 +                    idx[d] += 1;
 +                    if idx[d] < extents[d] {
 +                        break;
 +                    }
 +                    idx[d] = 0;
 +                }
 +            }
+         }
+     }
      dest.set_scalar_type_tag(source.scalar_type_tag());
+         }
          dest_ref.rank = source_ref.rank;
          dest_ref.elem_size = dest_elem_size;
 +        // Canonical column-major strides — see matching note in
 +        // afs_assign_allocatable. dest is freshly contiguous; the
 +        // per-dim memory step must be (1, ext_0, ext_0*ext_1, ...).
 +        let mut running_stride: i64 = 1;
          for i in 0..source_ref.rank as usize {
              dest_ref.dims[i] = source_ref.dims[i];
 -            dest_ref.dims[i].stride = 1;
 +            dest_ref.dims[i].stride = running_stride;
 +            running_stride =
 +                running_stride.saturating_mul(source_ref.dims[i].extent().max(1));
+         }
          let bytes = dest_ref.total_bytes();
          if bytes > 0 {
      let src_p = source_ref.base_addr;
      let dst_p = dest_ref.base_addr;
 -    for i in 0..n {
 +    let src_elem_size: i64 = match src_kind_tag {
 +        0 => 1,
 +        1 => 2,
 +        2 | 4 => 4,
 +        3 | 5 => 8,
 +        _ => return,
 +    };
 +    // Source may be non-contiguous (e.g. transpose result, section).
 +    // Walk each multi-index column-major and apply per-dim strides.
 +    // Mirror the same-class detection used in afs_assign_allocatable
 +    // and afs_copy_array_data: only apply per-dim strides when at
 +    // least one stride is *strictly greater* than its canonical
 +    // column-major step. A stride below canonical describes a
 +    // malformed descriptor (e.g. a 2x2 matmul result with stride=(1,1)
 +    // instead of (1,2)) whose underlying buffer is still flat
 +    // contiguous; walking those re-reads the same offset twice.
 +    let rank = source_ref.rank as usize;
 +    let extents: Vec<i64> = (0..rank).map(|i| source_ref.dims[i].extent()).collect();
 +    let raw_strides: Vec<i64> = (0..rank).map(|i| source_ref.dims[i].stride).collect();
 +    let mut canonical_step: i64 = 1;
 +    let mut canonical: Vec<i64> = Vec::with_capacity(rank);
 +    let mut strided = false;
 +    for d in 0..rank {
 +        canonical.push(canonical_step);
 +        if raw_strides[d] > canonical_step {
 +            strided = true;
 +        }
 +        canonical_step = canonical_step.saturating_mul(extents[d].max(1));
 +    }
 +    let strides: &[i64] = if strided { &raw_strides } else { &canonical };
 +    let mut idx = vec![0i64; rank];
 +    for k in 0..n {
 +        let mut src_off_elems: i64 = 0;
 +        for d in 0..rank {
 +            src_off_elems += idx[d] * strides[d];
 +        }
 +        let src_byte_off = src_off_elems * src_elem_size;
          let src_val_f64: f64 = unsafe {
              match src_kind_tag {
 -                0 => *(src_p.add(i) as *const i8) as f64,
 -                1 => *(src_p.add(2 * i) as *const i16) as f64,
 -                2 => *(src_p.add(4 * i) as *const i32) as f64,
 -                3 => *(src_p.add(8 * i) as *const i64) as f64,
 -                4 => *(src_p.add(4 * i) as *const f32) as f64,
 -                5 => *(src_p.add(8 * i) as *const f64),
 +                0 => *(src_p.offset(src_byte_off as isize) as *const i8) as f64,
 +                1 => *(src_p.offset(src_byte_off as isize) as *const i16) as f64,
 +                2 => *(src_p.offset(src_byte_off as isize) as *const i32) as f64,
 +                3 => *(src_p.offset(src_byte_off as isize) as *const i64) as f64,
 +                4 => *(src_p.offset(src_byte_off as isize) as *const f32) as f64,
 +                5 => *(src_p.offset(src_byte_off as isize) as *const f64),
                  _ => return,
+             }
          };
          unsafe {
              match dest_kind_tag {
 -                0 => *(dst_p.add(i) as *mut i8) = src_val_f64 as i8,
 -                1 => *(dst_p.add(2 * i) as *mut i16) = src_val_f64 as i16,
 -                2 => *(dst_p.add(4 * i) as *mut i32) = src_val_f64 as i32,
 -                3 => *(dst_p.add(8 * i) as *mut i64) = src_val_f64 as i64,
 -                4 => *(dst_p.add(4 * i) as *mut f32) = src_val_f64 as f32,
 -                5 => *(dst_p.add(8 * i) as *mut f64) = src_val_f64,
 +                0 => *(dst_p.add(k) as *mut i8) = src_val_f64 as i8,
 +                1 => *(dst_p.add(2 * k) as *mut i16) = src_val_f64 as i16,
 +                2 => *(dst_p.add(4 * k) as *mut i32) = src_val_f64 as i32,
 +                3 => *(dst_p.add(8 * k) as *mut i64) = src_val_f64 as i64,
 +                4 => *(dst_p.add(4 * k) as *mut f32) = src_val_f64 as f32,
 +                5 => *(dst_p.add(8 * k) as *mut f64) = src_val_f64,
                  _ => return,
+             }
+         }
 +        for d in 0..rank {
 +            idx[d] += 1;
 +            if idx[d] < extents[d] {
 +                break;
 +            }
 +            idx[d] = 0;
 +        }
+     }
+ }