@@ -1434,12 +1434,22 @@ pub extern "C" fn afs_assign_allocatable( |
| 1434 | 1434 | dest.flags &= !DESC_ALLOCATED; |
| 1435 | 1435 | } |
| 1436 | 1436 | |
| 1437 | | - // Allocate with source's shape. |
| 1437 | + // Allocate with source's shape, but compute canonical |
| 1438 | + // column-major strides (1, ext_0, ext_0*ext_1, ...) — the |
| 1439 | + // dest is freshly contiguous, so per-dim memory step must |
| 1440 | + // match Fortran's column-major convention used by |
| 1441 | + // afs_create_section / load_rank1_array_desc_elem. Setting |
| 1442 | + // stride=1 across the board collapsed dim_1+ accesses onto |
| 1443 | + // the dim_0 axis (e.g. allocatable A = transpose(reshape(...)) |
| 1444 | + // produced a descriptor with stride=(1,1), so any subsequent |
| 1445 | + // assumed-shape pass read overlapping bytes per "column"). |
| 1438 | 1446 | dest.rank = source.rank; |
| 1439 | 1447 | dest.elem_size = source.elem_size; |
| 1448 | + let mut running_stride: i64 = 1; |
| 1440 | 1449 | for i in 0..source.rank as usize { |
| 1441 | 1450 | dest.dims[i] = source.dims[i]; |
| 1442 | | - dest.dims[i].stride = 1; // dest is always contiguous |
| 1451 | + dest.dims[i].stride = running_stride; |
| 1452 | + running_stride = running_stride.saturating_mul(source.dims[i].extent().max(1)); |
| 1443 | 1453 | } |
| 1444 | 1454 | |
| 1445 | 1455 | let bytes = dest.total_bytes(); |
@@ -1454,11 +1464,68 @@ pub extern "C" fn afs_assign_allocatable( |
| 1454 | 1464 | dest.flags = DESC_ALLOCATED | DESC_CONTIGUOUS; |
| 1455 | 1465 | } |
| 1456 | 1466 | |
| 1457 | | - // Copy data. Use ptr::copy (not copy_nonoverlapping) to handle self-assignment. |
| 1467 | + // Copy data. Source may be non-contiguous (e.g. result of |
| 1468 | + // transpose() returns a descriptor with reversed dim strides |
| 1469 | + // pointing at the original buffer). A flat ptr::copy of |
| 1470 | + // total_bytes from source.base_addr would drag adjacent bytes |
| 1471 | + // forward without honoring per-dim strides — the same class of |
| 1472 | + // bug as the original afs_copy_array_data flat copy. Detect a |
| 1473 | + // non-contiguous source and walk every multi-index in column-major order. |
| 1474 | + // |
| 1475 | + // We only treat the source as non-contiguous when at least one |
| 1476 | + // dim's stride is *strictly greater* than its canonical |
| 1477 | + // column-major step. Strides smaller than canonical (e.g. |
| 1478 | + // afs_matmul's 2x2 result emitted with stride=(1,1) instead of |
| 1479 | + // (1,2)) describe an internally inconsistent descriptor whose |
| 1480 | + // base_addr still points at a flat contiguous buffer; walking |
| 1481 | + // those would re-read the same byte offset twice and drop the |
| 1482 | + // last element. The conservative choice is a flat copy of |
| 1483 | + // total_bytes — which is what the previous unconditional |
| 1484 | + // ptr::copy did silently for both kinds of source. |
| 1458 | 1485 | let bytes = source.total_bytes(); |
| 1459 | 1486 | if bytes > 0 && !source.base_addr.is_null() && !dest.base_addr.is_null() { |
| 1460 | | - unsafe { |
| 1461 | | - ptr::copy(source.base_addr, dest.base_addr, bytes as usize); |
| 1487 | + let elem_size = source.elem_size; |
| 1488 | + let mut canonical: i64 = 1; |
| 1489 | + let mut strided = false; |
| 1490 | + for i in 0..source.rank as usize { |
| 1491 | + if source.dims[i].stride > canonical { |
| 1492 | + strided = true; |
| 1493 | + break; |
| 1494 | + } |
| 1495 | + canonical = canonical.saturating_mul(source.dims[i].extent().max(1)); |
| 1496 | + } |
| 1497 | + if !strided { |
| 1498 | + unsafe { |
| 1499 | + ptr::copy(source.base_addr, dest.base_addr, bytes as usize); |
| 1500 | + } |
| 1501 | + } else { |
| 1502 | + let rank = source.rank as usize; |
| 1503 | + let extents: Vec<i64> = (0..rank).map(|i| source.dims[i].extent()).collect(); |
| 1504 | + let strides: Vec<i64> = (0..rank).map(|i| source.dims[i].stride).collect(); |
| 1505 | + let mut idx = vec![0i64; rank]; |
| 1506 | + let total = source.total_elements(); |
| 1507 | + for k in 0..total { |
| 1508 | + let mut src_off: i64 = 0; |
| 1509 | + for d in 0..rank { |
| 1510 | + src_off += idx[d] * strides[d]; |
| 1511 | + } |
| 1512 | + src_off *= elem_size; |
| 1513 | + let dst_off = k * elem_size; |
| 1514 | + unsafe { |
| 1515 | + ptr::copy_nonoverlapping( |
| 1516 | + source.base_addr.offset(src_off as isize), |
| 1517 | + dest.base_addr.offset(dst_off as isize), |
| 1518 | + elem_size as usize, |
| 1519 | + ); |
| 1520 | + } |
| 1521 | + for d in 0..rank { |
| 1522 | + idx[d] += 1; |
| 1523 | + if idx[d] < extents[d] { |
| 1524 | + break; |
| 1525 | + } |
| 1526 | + idx[d] = 0; |
| 1527 | + } |
| 1528 | + } |
| 1462 | 1529 | } |
| 1463 | 1530 | } |
| 1464 | 1531 | dest.set_scalar_type_tag(source.scalar_type_tag()); |
@@ -1523,9 +1590,15 @@ pub extern "C" fn afs_assign_allocatable_convert( |
| 1523 | 1590 | } |
| 1524 | 1591 | dest_ref.rank = source_ref.rank; |
| 1525 | 1592 | dest_ref.elem_size = dest_elem_size; |
| 1593 | + // Canonical column-major strides — see matching note in |
| 1594 | + // afs_assign_allocatable. dest is freshly contiguous; the |
| 1595 | + // per-dim memory step must be (1, ext_0, ext_0*ext_1, ...). |
| 1596 | + let mut running_stride: i64 = 1; |
| 1526 | 1597 | for i in 0..source_ref.rank as usize { |
| 1527 | 1598 | dest_ref.dims[i] = source_ref.dims[i]; |
| 1528 | | - dest_ref.dims[i].stride = 1; |
| 1599 | + dest_ref.dims[i].stride = running_stride; |
| 1600 | + running_stride = |
| 1601 | + running_stride.saturating_mul(source_ref.dims[i].extent().max(1)); |
| 1529 | 1602 | } |
| 1530 | 1603 | let bytes = dest_ref.total_bytes(); |
| 1531 | 1604 | if bytes > 0 { |
@@ -1548,29 +1621,72 @@ pub extern "C" fn afs_assign_allocatable_convert( |
| 1548 | 1621 | |
| 1549 | 1622 | let src_p = source_ref.base_addr; |
| 1550 | 1623 | let dst_p = dest_ref.base_addr; |
| 1551 | | - for i in 0..n { |
| 1624 | + let src_elem_size: i64 = match src_kind_tag { |
| 1625 | + 0 => 1, |
| 1626 | + 1 => 2, |
| 1627 | + 2 | 4 => 4, |
| 1628 | + 3 | 5 => 8, |
| 1629 | + _ => return, |
| 1630 | + }; |
| 1631 | + // Source may be non-contiguous (e.g. transpose result, section). |
| 1632 | + // Walk each multi-index column-major and apply per-dim strides. |
| 1633 | + // Mirror the same-class detection used in afs_assign_allocatable |
| 1634 | + // and afs_copy_array_data: only apply per-dim strides when at |
| 1635 | + // least one stride is *strictly greater* than its canonical |
| 1636 | + // column-major step. A stride below canonical describes a |
| 1637 | + // malformed descriptor (e.g. a 2x2 matmul result with stride=(1,1) |
| 1638 | + // instead of (1,2)) whose underlying buffer is still flat |
| 1639 | + // contiguous; walking those re-reads the same offset twice. |
| 1640 | + let rank = source_ref.rank as usize; |
| 1641 | + let extents: Vec<i64> = (0..rank).map(|i| source_ref.dims[i].extent()).collect(); |
| 1642 | + let raw_strides: Vec<i64> = (0..rank).map(|i| source_ref.dims[i].stride).collect(); |
| 1643 | + let mut canonical_step: i64 = 1; |
| 1644 | + let mut canonical: Vec<i64> = Vec::with_capacity(rank); |
| 1645 | + let mut strided = false; |
| 1646 | + for d in 0..rank { |
| 1647 | + canonical.push(canonical_step); |
| 1648 | + if raw_strides[d] > canonical_step { |
| 1649 | + strided = true; |
| 1650 | + } |
| 1651 | + canonical_step = canonical_step.saturating_mul(extents[d].max(1)); |
| 1652 | + } |
| 1653 | + let strides: &[i64] = if strided { &raw_strides } else { &canonical }; |
| 1654 | + let mut idx = vec![0i64; rank]; |
| 1655 | + for k in 0..n { |
| 1656 | + let mut src_off_elems: i64 = 0; |
| 1657 | + for d in 0..rank { |
| 1658 | + src_off_elems += idx[d] * strides[d]; |
| 1659 | + } |
| 1660 | + let src_byte_off = src_off_elems * src_elem_size; |
| 1552 | 1661 | let src_val_f64: f64 = unsafe { |
| 1553 | 1662 | match src_kind_tag { |
| 1554 | | - 0 => *(src_p.add(i) as *const i8) as f64, |
| 1555 | | - 1 => *(src_p.add(2 * i) as *const i16) as f64, |
| 1556 | | - 2 => *(src_p.add(4 * i) as *const i32) as f64, |
| 1557 | | - 3 => *(src_p.add(8 * i) as *const i64) as f64, |
| 1558 | | - 4 => *(src_p.add(4 * i) as *const f32) as f64, |
| 1559 | | - 5 => *(src_p.add(8 * i) as *const f64), |
| 1663 | + 0 => *(src_p.offset(src_byte_off as isize) as *const i8) as f64, |
| 1664 | + 1 => *(src_p.offset(src_byte_off as isize) as *const i16) as f64, |
| 1665 | + 2 => *(src_p.offset(src_byte_off as isize) as *const i32) as f64, |
| 1666 | + 3 => *(src_p.offset(src_byte_off as isize) as *const i64) as f64, |
| 1667 | + 4 => *(src_p.offset(src_byte_off as isize) as *const f32) as f64, |
| 1668 | + 5 => *(src_p.offset(src_byte_off as isize) as *const f64), |
| 1560 | 1669 | _ => return, |
| 1561 | 1670 | } |
| 1562 | 1671 | }; |
| 1563 | 1672 | unsafe { |
| 1564 | 1673 | match dest_kind_tag { |
| 1565 | | - 0 => *(dst_p.add(i) as *mut i8) = src_val_f64 as i8, |
| 1566 | | - 1 => *(dst_p.add(2 * i) as *mut i16) = src_val_f64 as i16, |
| 1567 | | - 2 => *(dst_p.add(4 * i) as *mut i32) = src_val_f64 as i32, |
| 1568 | | - 3 => *(dst_p.add(8 * i) as *mut i64) = src_val_f64 as i64, |
| 1569 | | - 4 => *(dst_p.add(4 * i) as *mut f32) = src_val_f64 as f32, |
| 1570 | | - 5 => *(dst_p.add(8 * i) as *mut f64) = src_val_f64, |
| 1674 | + 0 => *(dst_p.add(k) as *mut i8) = src_val_f64 as i8, |
| 1675 | + 1 => *(dst_p.add(2 * k) as *mut i16) = src_val_f64 as i16, |
| 1676 | + 2 => *(dst_p.add(4 * k) as *mut i32) = src_val_f64 as i32, |
| 1677 | + 3 => *(dst_p.add(8 * k) as *mut i64) = src_val_f64 as i64, |
| 1678 | + 4 => *(dst_p.add(4 * k) as *mut f32) = src_val_f64 as f32, |
| 1679 | + 5 => *(dst_p.add(8 * k) as *mut f64) = src_val_f64, |
| 1571 | 1680 | _ => return, |
| 1572 | 1681 | } |
| 1573 | 1682 | } |
| 1683 | + for d in 0..rank { |
| 1684 | + idx[d] += 1; |
| 1685 | + if idx[d] < extents[d] { |
| 1686 | + break; |
| 1687 | + } |
| 1688 | + idx[d] = 0; |
| 1689 | + } |
| 1574 | 1690 | } |
| 1575 | 1691 | } |
| 1576 | 1692 | |