//! Array memory management: ALLOCATE, DEALLOCATE, allocatable assignment,
//! plus the bulk fill/elementwise helpers and array-section views used by
//! generated code.
//!
//! These functions operate on ArrayDescriptor pointers passed from generated
//! code. They handle allocation, deallocation, reallocation, and descriptor
//! population.
6
7 use crate::descriptor::*;
8 use std::ptr;
9
10 // ---- BOUNDS CHECKS ----
11
12 /// Abort if an array subscript is outside the legal closed interval.
13 #[no_mangle]
14 pub extern "C" fn afs_check_bounds(index: i64, lower: i64, upper: i64) {
15 if index < lower || index > upper {
16 eprintln!(
17 "Bounds check failed: index {} outside [{}, {}]",
18 index, lower, upper
19 );
20 std::process::exit(1);
21 }
22 }
23
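// ---- BULK FILL AND ELEMENTWISE OPS ----

/// Clamp an element count coming from generated code to a usable loop length:
/// negative counts (and values that cannot be represented as usize) become 0.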
24 fn bulk_len(n: i64) -> usize {
25 if n <= 0 {
26 0
27 } else {
28 usize::try_from(n).unwrap_or(0)
29 }
30 }
31
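// Each *_impl helper below comes in two flavors selected at compile time: an
// aarch64 version that processes 4 lanes per iteration (2 for f64) with NEON
// intrinsics and finishes the remainder in a scalar tail loop, and a plain
// scalar loop for every other target. Callers are expected to pass pointers
// valid for `len` elements.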
32 #[cfg(target_arch = "aarch64")]
33 unsafe fn fill_i32_impl(dest: *mut i32, len: usize, value: i32) {
34 use core::arch::aarch64::{vdupq_n_s32, vst1q_s32};
35
36 let mut i = 0usize;
37 let splat = vdupq_n_s32(value);
38 while i + 4 <= len {
39 vst1q_s32(dest.add(i), splat);
40 i += 4;
41 }
42 while i < len {
43 *dest.add(i) = value;
44 i += 1;
45 }
46 }
47
48 #[cfg(not(target_arch = "aarch64"))]
49 unsafe fn fill_i32_impl(dest: *mut i32, len: usize, value: i32) {
50 for i in 0..len {
51 *dest.add(i) = value;
52 }
53 }
54
55 #[cfg(target_arch = "aarch64")]
56 unsafe fn fill_f32_impl(dest: *mut f32, len: usize, value: f32) {
57 use core::arch::aarch64::{vdupq_n_f32, vst1q_f32};
58
59 let mut i = 0usize;
60 let splat = vdupq_n_f32(value);
61 while i + 4 <= len {
62 vst1q_f32(dest.add(i), splat);
63 i += 4;
64 }
65 while i < len {
66 *dest.add(i) = value;
67 i += 1;
68 }
69 }
70
71 #[cfg(not(target_arch = "aarch64"))]
72 unsafe fn fill_f32_impl(dest: *mut f32, len: usize, value: f32) {
73 for i in 0..len {
74 *dest.add(i) = value;
75 }
76 }
77
78 #[cfg(target_arch = "aarch64")]
79 unsafe fn fill_f64_impl(dest: *mut f64, len: usize, value: f64) {
80 use core::arch::aarch64::{vdupq_n_f64, vst1q_f64};
81
82 let mut i = 0usize;
83 let splat = vdupq_n_f64(value);
84 while i + 2 <= len {
85 vst1q_f64(dest.add(i), splat);
86 i += 2;
87 }
88 while i < len {
89 *dest.add(i) = value;
90 i += 1;
91 }
92 }
93
94 #[cfg(not(target_arch = "aarch64"))]
95 unsafe fn fill_f64_impl(dest: *mut f64, len: usize, value: f64) {
96 for i in 0..len {
97 *dest.add(i) = value;
98 }
99 }
100
101 #[cfg(target_arch = "aarch64")]
102 unsafe fn add_i32_impl(dest: *mut i32, lhs: *const i32, rhs: *const i32, len: usize) {
103 use core::arch::aarch64::{vaddq_s32, vld1q_s32, vst1q_s32};
104
105 let mut i = 0usize;
106 while i + 4 <= len {
107 let a = vld1q_s32(lhs.add(i));
108 let b = vld1q_s32(rhs.add(i));
109 vst1q_s32(dest.add(i), vaddq_s32(a, b));
110 i += 4;
111 }
112 while i < len {
113 *dest.add(i) = *lhs.add(i) + *rhs.add(i);
114 i += 1;
115 }
116 }
117
118 #[cfg(not(target_arch = "aarch64"))]
119 unsafe fn add_i32_impl(dest: *mut i32, lhs: *const i32, rhs: *const i32, len: usize) {
120 for i in 0..len {
121 *dest.add(i) = *lhs.add(i) + *rhs.add(i);
122 }
123 }
124
125 #[cfg(target_arch = "aarch64")]
126 unsafe fn add_f32_impl(dest: *mut f32, lhs: *const f32, rhs: *const f32, len: usize) {
127 use core::arch::aarch64::{vaddq_f32, vld1q_f32, vst1q_f32};
128
129 let mut i = 0usize;
130 while i + 4 <= len {
131 let a = vld1q_f32(lhs.add(i));
132 let b = vld1q_f32(rhs.add(i));
133 vst1q_f32(dest.add(i), vaddq_f32(a, b));
134 i += 4;
135 }
136 while i < len {
137 *dest.add(i) = *lhs.add(i) + *rhs.add(i);
138 i += 1;
139 }
140 }
141
142 #[cfg(not(target_arch = "aarch64"))]
143 unsafe fn add_f32_impl(dest: *mut f32, lhs: *const f32, rhs: *const f32, len: usize) {
144 for i in 0..len {
145 *dest.add(i) = *lhs.add(i) + *rhs.add(i);
146 }
147 }
148
149 #[cfg(target_arch = "aarch64")]
150 unsafe fn add_f64_impl(dest: *mut f64, lhs: *const f64, rhs: *const f64, len: usize) {
151 use core::arch::aarch64::{vaddq_f64, vld1q_f64, vst1q_f64};
152
153 let mut i = 0usize;
154 while i + 2 <= len {
155 let a = vld1q_f64(lhs.add(i));
156 let b = vld1q_f64(rhs.add(i));
157 vst1q_f64(dest.add(i), vaddq_f64(a, b));
158 i += 2;
159 }
160 while i < len {
161 *dest.add(i) = *lhs.add(i) + *rhs.add(i);
162 i += 1;
163 }
164 }
165
166 #[cfg(not(target_arch = "aarch64"))]
167 unsafe fn add_f64_impl(dest: *mut f64, lhs: *const f64, rhs: *const f64, len: usize) {
168 for i in 0..len {
169 *dest.add(i) = *lhs.add(i) + *rhs.add(i);
170 }
171 }
172
173 #[cfg(target_arch = "aarch64")]
174 unsafe fn sub_i32_impl(dest: *mut i32, lhs: *const i32, rhs: *const i32, len: usize) {
175 use core::arch::aarch64::{vld1q_s32, vst1q_s32, vsubq_s32};
176
177 let mut i = 0usize;
178 while i + 4 <= len {
179 let a = vld1q_s32(lhs.add(i));
180 let b = vld1q_s32(rhs.add(i));
181 vst1q_s32(dest.add(i), vsubq_s32(a, b));
182 i += 4;
183 }
184 while i < len {
185 *dest.add(i) = *lhs.add(i) - *rhs.add(i);
186 i += 1;
187 }
188 }
189
190 #[cfg(not(target_arch = "aarch64"))]
191 unsafe fn sub_i32_impl(dest: *mut i32, lhs: *const i32, rhs: *const i32, len: usize) {
192 for i in 0..len {
193 *dest.add(i) = *lhs.add(i) - *rhs.add(i);
194 }
195 }
196
197 #[cfg(target_arch = "aarch64")]
198 unsafe fn sub_f32_impl(dest: *mut f32, lhs: *const f32, rhs: *const f32, len: usize) {
199 use core::arch::aarch64::{vld1q_f32, vst1q_f32, vsubq_f32};
200
201 let mut i = 0usize;
202 while i + 4 <= len {
203 let a = vld1q_f32(lhs.add(i));
204 let b = vld1q_f32(rhs.add(i));
205 vst1q_f32(dest.add(i), vsubq_f32(a, b));
206 i += 4;
207 }
208 while i < len {
209 *dest.add(i) = *lhs.add(i) - *rhs.add(i);
210 i += 1;
211 }
212 }
213
214 #[cfg(not(target_arch = "aarch64"))]
215 unsafe fn sub_f32_impl(dest: *mut f32, lhs: *const f32, rhs: *const f32, len: usize) {
216 for i in 0..len {
217 *dest.add(i) = *lhs.add(i) - *rhs.add(i);
218 }
219 }
220
221 #[cfg(target_arch = "aarch64")]
222 unsafe fn sub_f64_impl(dest: *mut f64, lhs: *const f64, rhs: *const f64, len: usize) {
223 use core::arch::aarch64::{vld1q_f64, vst1q_f64, vsubq_f64};
224
225 let mut i = 0usize;
226 while i + 2 <= len {
227 let a = vld1q_f64(lhs.add(i));
228 let b = vld1q_f64(rhs.add(i));
229 vst1q_f64(dest.add(i), vsubq_f64(a, b));
230 i += 2;
231 }
232 while i < len {
233 *dest.add(i) = *lhs.add(i) - *rhs.add(i);
234 i += 1;
235 }
236 }
237
238 #[cfg(not(target_arch = "aarch64"))]
239 unsafe fn sub_f64_impl(dest: *mut f64, lhs: *const f64, rhs: *const f64, len: usize) {
240 for i in 0..len {
241 *dest.add(i) = *lhs.add(i) - *rhs.add(i);
242 }
243 }
244
245 #[cfg(target_arch = "aarch64")]
246 unsafe fn mul_i32_impl(dest: *mut i32, lhs: *const i32, rhs: *const i32, len: usize) {
247 use core::arch::aarch64::{vld1q_s32, vmulq_s32, vst1q_s32};
248
249 let mut i = 0usize;
250 while i + 4 <= len {
251 let a = vld1q_s32(lhs.add(i));
252 let b = vld1q_s32(rhs.add(i));
253 vst1q_s32(dest.add(i), vmulq_s32(a, b));
254 i += 4;
255 }
256 while i < len {
257 *dest.add(i) = *lhs.add(i) * *rhs.add(i);
258 i += 1;
259 }
260 }
261
262 #[cfg(not(target_arch = "aarch64"))]
263 unsafe fn mul_i32_impl(dest: *mut i32, lhs: *const i32, rhs: *const i32, len: usize) {
264 for i in 0..len {
265 *dest.add(i) = *lhs.add(i) * *rhs.add(i);
266 }
267 }
268
269 #[cfg(target_arch = "aarch64")]
270 unsafe fn mul_f32_impl(dest: *mut f32, lhs: *const f32, rhs: *const f32, len: usize) {
271 use core::arch::aarch64::{vld1q_f32, vmulq_f32, vst1q_f32};
272
273 let mut i = 0usize;
274 while i + 4 <= len {
275 let a = vld1q_f32(lhs.add(i));
276 let b = vld1q_f32(rhs.add(i));
277 vst1q_f32(dest.add(i), vmulq_f32(a, b));
278 i += 4;
279 }
280 while i < len {
281 *dest.add(i) = *lhs.add(i) * *rhs.add(i);
282 i += 1;
283 }
284 }
285
286 #[cfg(not(target_arch = "aarch64"))]
287 unsafe fn mul_f32_impl(dest: *mut f32, lhs: *const f32, rhs: *const f32, len: usize) {
288 for i in 0..len {
289 *dest.add(i) = *lhs.add(i) * *rhs.add(i);
290 }
291 }
292
293 #[cfg(target_arch = "aarch64")]
294 unsafe fn mul_f64_impl(dest: *mut f64, lhs: *const f64, rhs: *const f64, len: usize) {
295 use core::arch::aarch64::{vld1q_f64, vmulq_f64, vst1q_f64};
296
297 let mut i = 0usize;
298 while i + 2 <= len {
299 let a = vld1q_f64(lhs.add(i));
300 let b = vld1q_f64(rhs.add(i));
301 vst1q_f64(dest.add(i), vmulq_f64(a, b));
302 i += 2;
303 }
304 while i < len {
305 *dest.add(i) = *lhs.add(i) * *rhs.add(i);
306 i += 1;
307 }
308 }
309
310 #[cfg(not(target_arch = "aarch64"))]
311 unsafe fn mul_f64_impl(dest: *mut f64, lhs: *const f64, rhs: *const f64, len: usize) {
312 for i in 0..len {
313 *dest.add(i) = *lhs.add(i) * *rhs.add(i);
314 }
315 }
316
317 #[cfg(target_arch = "aarch64")]
318 unsafe fn add_scalar_i32_impl(dest: *mut i32, src: *const i32, scalar: i32, len: usize) {
319 use core::arch::aarch64::{vaddq_s32, vdupq_n_s32, vld1q_s32, vst1q_s32};
320
321 let mut i = 0usize;
322 let splat = vdupq_n_s32(scalar);
323 while i + 4 <= len {
324 let a = vld1q_s32(src.add(i));
325 vst1q_s32(dest.add(i), vaddq_s32(a, splat));
326 i += 4;
327 }
328 while i < len {
329 *dest.add(i) = *src.add(i) + scalar;
330 i += 1;
331 }
332 }
333
334 #[cfg(not(target_arch = "aarch64"))]
335 unsafe fn add_scalar_i32_impl(dest: *mut i32, src: *const i32, scalar: i32, len: usize) {
336 for i in 0..len {
337 *dest.add(i) = *src.add(i) + scalar;
338 }
339 }
340
341 #[cfg(target_arch = "aarch64")]
342 unsafe fn add_scalar_f32_impl(dest: *mut f32, src: *const f32, scalar: f32, len: usize) {
343 use core::arch::aarch64::{vaddq_f32, vdupq_n_f32, vld1q_f32, vst1q_f32};
344
345 let mut i = 0usize;
346 let splat = vdupq_n_f32(scalar);
347 while i + 4 <= len {
348 let a = vld1q_f32(src.add(i));
349 vst1q_f32(dest.add(i), vaddq_f32(a, splat));
350 i += 4;
351 }
352 while i < len {
353 *dest.add(i) = *src.add(i) + scalar;
354 i += 1;
355 }
356 }
357
358 #[cfg(not(target_arch = "aarch64"))]
359 unsafe fn add_scalar_f32_impl(dest: *mut f32, src: *const f32, scalar: f32, len: usize) {
360 for i in 0..len {
361 *dest.add(i) = *src.add(i) + scalar;
362 }
363 }
364
365 #[cfg(target_arch = "aarch64")]
366 unsafe fn add_scalar_f64_impl(dest: *mut f64, src: *const f64, scalar: f64, len: usize) {
367 use core::arch::aarch64::{vaddq_f64, vdupq_n_f64, vld1q_f64, vst1q_f64};
368
369 let mut i = 0usize;
370 let splat = vdupq_n_f64(scalar);
371 while i + 2 <= len {
372 let a = vld1q_f64(src.add(i));
373 vst1q_f64(dest.add(i), vaddq_f64(a, splat));
374 i += 2;
375 }
376 while i < len {
377 *dest.add(i) = *src.add(i) + scalar;
378 i += 1;
379 }
380 }
381
382 #[cfg(not(target_arch = "aarch64"))]
383 unsafe fn add_scalar_f64_impl(dest: *mut f64, src: *const f64, scalar: f64, len: usize) {
384 for i in 0..len {
385 *dest.add(i) = *src.add(i) + scalar;
386 }
387 }
388
389 #[cfg(target_arch = "aarch64")]
390 unsafe fn sub_scalar_i32_impl(dest: *mut i32, src: *const i32, scalar: i32, len: usize) {
391 use core::arch::aarch64::{vdupq_n_s32, vld1q_s32, vst1q_s32, vsubq_s32};
392
393 let mut i = 0usize;
394 let splat = vdupq_n_s32(scalar);
395 while i + 4 <= len {
396 let a = vld1q_s32(src.add(i));
397 vst1q_s32(dest.add(i), vsubq_s32(a, splat));
398 i += 4;
399 }
400 while i < len {
401 *dest.add(i) = *src.add(i) - scalar;
402 i += 1;
403 }
404 }
405
406 #[cfg(not(target_arch = "aarch64"))]
407 unsafe fn sub_scalar_i32_impl(dest: *mut i32, src: *const i32, scalar: i32, len: usize) {
408 for i in 0..len {
409 *dest.add(i) = *src.add(i) - scalar;
410 }
411 }
412
413 #[cfg(target_arch = "aarch64")]
414 unsafe fn sub_scalar_f32_impl(dest: *mut f32, src: *const f32, scalar: f32, len: usize) {
415 use core::arch::aarch64::{vdupq_n_f32, vld1q_f32, vst1q_f32, vsubq_f32};
416
417 let mut i = 0usize;
418 let splat = vdupq_n_f32(scalar);
419 while i + 4 <= len {
420 let a = vld1q_f32(src.add(i));
421 vst1q_f32(dest.add(i), vsubq_f32(a, splat));
422 i += 4;
423 }
424 while i < len {
425 *dest.add(i) = *src.add(i) - scalar;
426 i += 1;
427 }
428 }
429
430 #[cfg(not(target_arch = "aarch64"))]
431 unsafe fn sub_scalar_f32_impl(dest: *mut f32, src: *const f32, scalar: f32, len: usize) {
432 for i in 0..len {
433 *dest.add(i) = *src.add(i) - scalar;
434 }
435 }
436
437 #[cfg(target_arch = "aarch64")]
438 unsafe fn sub_scalar_f64_impl(dest: *mut f64, src: *const f64, scalar: f64, len: usize) {
439 use core::arch::aarch64::{vdupq_n_f64, vld1q_f64, vst1q_f64, vsubq_f64};
440
441 let mut i = 0usize;
442 let splat = vdupq_n_f64(scalar);
443 while i + 2 <= len {
444 let a = vld1q_f64(src.add(i));
445 vst1q_f64(dest.add(i), vsubq_f64(a, splat));
446 i += 2;
447 }
448 while i < len {
449 *dest.add(i) = *src.add(i) - scalar;
450 i += 1;
451 }
452 }
453
454 #[cfg(not(target_arch = "aarch64"))]
455 unsafe fn sub_scalar_f64_impl(dest: *mut f64, src: *const f64, scalar: f64, len: usize) {
456 for i in 0..len {
457 *dest.add(i) = *src.add(i) - scalar;
458 }
459 }
460
461 #[cfg(target_arch = "aarch64")]
462 unsafe fn scalar_sub_i32_impl(dest: *mut i32, scalar: i32, src: *const i32, len: usize) {
463 use core::arch::aarch64::{vdupq_n_s32, vld1q_s32, vst1q_s32, vsubq_s32};
464
465 let mut i = 0usize;
466 let splat = vdupq_n_s32(scalar);
467 while i + 4 <= len {
468 let a = vld1q_s32(src.add(i));
469 vst1q_s32(dest.add(i), vsubq_s32(splat, a));
470 i += 4;
471 }
472 while i < len {
473 *dest.add(i) = scalar - *src.add(i);
474 i += 1;
475 }
476 }
477
478 #[cfg(not(target_arch = "aarch64"))]
479 unsafe fn scalar_sub_i32_impl(dest: *mut i32, scalar: i32, src: *const i32, len: usize) {
480 for i in 0..len {
481 *dest.add(i) = scalar - *src.add(i);
482 }
483 }
484
485 #[cfg(target_arch = "aarch64")]
486 unsafe fn scalar_sub_f32_impl(dest: *mut f32, scalar: f32, src: *const f32, len: usize) {
487 use core::arch::aarch64::{vdupq_n_f32, vld1q_f32, vst1q_f32, vsubq_f32};
488
489 let mut i = 0usize;
490 let splat = vdupq_n_f32(scalar);
491 while i + 4 <= len {
492 let a = vld1q_f32(src.add(i));
493 vst1q_f32(dest.add(i), vsubq_f32(splat, a));
494 i += 4;
495 }
496 while i < len {
497 *dest.add(i) = scalar - *src.add(i);
498 i += 1;
499 }
500 }
501
502 #[cfg(not(target_arch = "aarch64"))]
503 unsafe fn scalar_sub_f32_impl(dest: *mut f32, scalar: f32, src: *const f32, len: usize) {
504 for i in 0..len {
505 *dest.add(i) = scalar - *src.add(i);
506 }
507 }
508
509 #[cfg(target_arch = "aarch64")]
510 unsafe fn scalar_sub_f64_impl(dest: *mut f64, scalar: f64, src: *const f64, len: usize) {
511 use core::arch::aarch64::{vdupq_n_f64, vld1q_f64, vst1q_f64, vsubq_f64};
512
513 let mut i = 0usize;
514 let splat = vdupq_n_f64(scalar);
515 while i + 2 <= len {
516 let a = vld1q_f64(src.add(i));
517 vst1q_f64(dest.add(i), vsubq_f64(splat, a));
518 i += 2;
519 }
520 while i < len {
521 *dest.add(i) = scalar - *src.add(i);
522 i += 1;
523 }
524 }
525
526 #[cfg(not(target_arch = "aarch64"))]
527 unsafe fn scalar_sub_f64_impl(dest: *mut f64, scalar: f64, src: *const f64, len: usize) {
528 for i in 0..len {
529 *dest.add(i) = scalar - *src.add(i);
530 }
531 }
532
533 #[cfg(target_arch = "aarch64")]
534 unsafe fn mul_scalar_i32_impl(dest: *mut i32, src: *const i32, scalar: i32, len: usize) {
535 use core::arch::aarch64::{vdupq_n_s32, vld1q_s32, vmulq_s32, vst1q_s32};
536
537 let mut i = 0usize;
538 let splat = vdupq_n_s32(scalar);
539 while i + 4 <= len {
540 let a = vld1q_s32(src.add(i));
541 vst1q_s32(dest.add(i), vmulq_s32(a, splat));
542 i += 4;
543 }
544 while i < len {
545 *dest.add(i) = *src.add(i) * scalar;
546 i += 1;
547 }
548 }
549
550 #[cfg(not(target_arch = "aarch64"))]
551 unsafe fn mul_scalar_i32_impl(dest: *mut i32, src: *const i32, scalar: i32, len: usize) {
552 for i in 0..len {
553 *dest.add(i) = *src.add(i) * scalar;
554 }
555 }
556
557 #[cfg(target_arch = "aarch64")]
558 unsafe fn mul_scalar_f32_impl(dest: *mut f32, src: *const f32, scalar: f32, len: usize) {
559 use core::arch::aarch64::{vdupq_n_f32, vld1q_f32, vmulq_f32, vst1q_f32};
560
561 let mut i = 0usize;
562 let splat = vdupq_n_f32(scalar);
563 while i + 4 <= len {
564 let a = vld1q_f32(src.add(i));
565 vst1q_f32(dest.add(i), vmulq_f32(a, splat));
566 i += 4;
567 }
568 while i < len {
569 *dest.add(i) = *src.add(i) * scalar;
570 i += 1;
571 }
572 }
573
574 #[cfg(not(target_arch = "aarch64"))]
575 unsafe fn mul_scalar_f32_impl(dest: *mut f32, src: *const f32, scalar: f32, len: usize) {
576 for i in 0..len {
577 *dest.add(i) = *src.add(i) * scalar;
578 }
579 }
580
581 #[cfg(target_arch = "aarch64")]
582 unsafe fn mul_scalar_f64_impl(dest: *mut f64, src: *const f64, scalar: f64, len: usize) {
583 use core::arch::aarch64::{vdupq_n_f64, vld1q_f64, vmulq_f64, vst1q_f64};
584
585 let mut i = 0usize;
586 let splat = vdupq_n_f64(scalar);
587 while i + 2 <= len {
588 let a = vld1q_f64(src.add(i));
589 vst1q_f64(dest.add(i), vmulq_f64(a, splat));
590 i += 2;
591 }
592 while i < len {
593 *dest.add(i) = *src.add(i) * scalar;
594 i += 1;
595 }
596 }
597
598 #[cfg(not(target_arch = "aarch64"))]
599 unsafe fn mul_scalar_f64_impl(dest: *mut f64, src: *const f64, scalar: f64, len: usize) {
600 for i in 0..len {
601 *dest.add(i) = *src.add(i) * scalar;
602 }
603 }
604
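// The extern "C" entry points below null-check their pointer arguments and
// clamp the element count through bulk_len() before dispatching to the
// *_impl helpers above, so generated code can call them unconditionally.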
605 #[no_mangle]
606 pub extern "C" fn afs_fill_i32(dest: *mut i32, n: i64, value: i32) {
607 if dest.is_null() {
608 return;
609 }
610 let len = bulk_len(n);
611 if len == 0 {
612 return;
613 }
614 unsafe {
615 fill_i32_impl(dest, len, value);
616 }
617 }
618
619 #[no_mangle]
620 pub extern "C" fn afs_fill_f32(dest: *mut f32, n: i64, value: f32) {
621 if dest.is_null() {
622 return;
623 }
624 let len = bulk_len(n);
625 if len == 0 {
626 return;
627 }
628 unsafe {
629 fill_f32_impl(dest, len, value);
630 }
631 }
632
633 #[no_mangle]
634 pub extern "C" fn afs_fill_f64(dest: *mut f64, n: i64, value: f64) {
635 if dest.is_null() {
636 return;
637 }
638 let len = bulk_len(n);
639 if len == 0 {
640 return;
641 }
642 unsafe {
643 fill_f64_impl(dest, len, value);
644 }
645 }
646
647 #[no_mangle]
648 pub extern "C" fn afs_array_add_i32(dest: *mut i32, lhs: *const i32, rhs: *const i32, n: i64) {
649 if dest.is_null() || lhs.is_null() || rhs.is_null() {
650 return;
651 }
652 let len = bulk_len(n);
653 if len == 0 {
654 return;
655 }
656 unsafe {
657 add_i32_impl(dest, lhs, rhs, len);
658 }
659 }
660
661 #[no_mangle]
662 pub extern "C" fn afs_array_add_f32(dest: *mut f32, lhs: *const f32, rhs: *const f32, n: i64) {
663 if dest.is_null() || lhs.is_null() || rhs.is_null() {
664 return;
665 }
666 let len = bulk_len(n);
667 if len == 0 {
668 return;
669 }
670 unsafe {
671 add_f32_impl(dest, lhs, rhs, len);
672 }
673 }
674
675 #[no_mangle]
676 pub extern "C" fn afs_array_add_f64(dest: *mut f64, lhs: *const f64, rhs: *const f64, n: i64) {
677 if dest.is_null() || lhs.is_null() || rhs.is_null() {
678 return;
679 }
680 let len = bulk_len(n);
681 if len == 0 {
682 return;
683 }
684 unsafe {
685 add_f64_impl(dest, lhs, rhs, len);
686 }
687 }
688
689 #[no_mangle]
690 pub extern "C" fn afs_array_sub_i32(dest: *mut i32, lhs: *const i32, rhs: *const i32, n: i64) {
691 if dest.is_null() || lhs.is_null() || rhs.is_null() {
692 return;
693 }
694 let len = bulk_len(n);
695 if len == 0 {
696 return;
697 }
698 unsafe {
699 sub_i32_impl(dest, lhs, rhs, len);
700 }
701 }
702
703 #[no_mangle]
704 pub extern "C" fn afs_array_sub_f32(dest: *mut f32, lhs: *const f32, rhs: *const f32, n: i64) {
705 if dest.is_null() || lhs.is_null() || rhs.is_null() {
706 return;
707 }
708 let len = bulk_len(n);
709 if len == 0 {
710 return;
711 }
712 unsafe {
713 sub_f32_impl(dest, lhs, rhs, len);
714 }
715 }
716
717 #[no_mangle]
718 pub extern "C" fn afs_array_sub_f64(dest: *mut f64, lhs: *const f64, rhs: *const f64, n: i64) {
719 if dest.is_null() || lhs.is_null() || rhs.is_null() {
720 return;
721 }
722 let len = bulk_len(n);
723 if len == 0 {
724 return;
725 }
726 unsafe {
727 sub_f64_impl(dest, lhs, rhs, len);
728 }
729 }
730
731 #[no_mangle]
732 pub extern "C" fn afs_array_mul_i32(dest: *mut i32, lhs: *const i32, rhs: *const i32, n: i64) {
733 if dest.is_null() || lhs.is_null() || rhs.is_null() {
734 return;
735 }
736 let len = bulk_len(n);
737 if len == 0 {
738 return;
739 }
740 unsafe {
741 mul_i32_impl(dest, lhs, rhs, len);
742 }
743 }
744
745 #[no_mangle]
746 pub extern "C" fn afs_array_mul_f32(dest: *mut f32, lhs: *const f32, rhs: *const f32, n: i64) {
747 if dest.is_null() || lhs.is_null() || rhs.is_null() {
748 return;
749 }
750 let len = bulk_len(n);
751 if len == 0 {
752 return;
753 }
754 unsafe {
755 mul_f32_impl(dest, lhs, rhs, len);
756 }
757 }
758
759 #[no_mangle]
760 pub extern "C" fn afs_array_mul_f64(dest: *mut f64, lhs: *const f64, rhs: *const f64, n: i64) {
761 if dest.is_null() || lhs.is_null() || rhs.is_null() {
762 return;
763 }
764 let len = bulk_len(n);
765 if len == 0 {
766 return;
767 }
768 unsafe {
769 mul_f64_impl(dest, lhs, rhs, len);
770 }
771 }
772
773 #[no_mangle]
774 pub extern "C" fn afs_array_add_scalar_i32(dest: *mut i32, src: *const i32, scalar: i32, n: i64) {
775 if dest.is_null() || src.is_null() {
776 return;
777 }
778 let len = bulk_len(n);
779 if len == 0 {
780 return;
781 }
782 unsafe {
783 add_scalar_i32_impl(dest, src, scalar, len);
784 }
785 }
786
787 #[no_mangle]
788 pub extern "C" fn afs_array_add_scalar_f32(dest: *mut f32, src: *const f32, scalar: f32, n: i64) {
789 if dest.is_null() || src.is_null() {
790 return;
791 }
792 let len = bulk_len(n);
793 if len == 0 {
794 return;
795 }
796 unsafe {
797 add_scalar_f32_impl(dest, src, scalar, len);
798 }
799 }
800
801 #[no_mangle]
802 pub extern "C" fn afs_array_add_scalar_f64(dest: *mut f64, src: *const f64, scalar: f64, n: i64) {
803 if dest.is_null() || src.is_null() {
804 return;
805 }
806 let len = bulk_len(n);
807 if len == 0 {
808 return;
809 }
810 unsafe {
811 add_scalar_f64_impl(dest, src, scalar, len);
812 }
813 }
814
815 #[no_mangle]
816 pub extern "C" fn afs_array_sub_scalar_i32(dest: *mut i32, src: *const i32, scalar: i32, n: i64) {
817 if dest.is_null() || src.is_null() {
818 return;
819 }
820 let len = bulk_len(n);
821 if len == 0 {
822 return;
823 }
824 unsafe {
825 sub_scalar_i32_impl(dest, src, scalar, len);
826 }
827 }
828
829 #[no_mangle]
830 pub extern "C" fn afs_array_sub_scalar_f32(dest: *mut f32, src: *const f32, scalar: f32, n: i64) {
831 if dest.is_null() || src.is_null() {
832 return;
833 }
834 let len = bulk_len(n);
835 if len == 0 {
836 return;
837 }
838 unsafe {
839 sub_scalar_f32_impl(dest, src, scalar, len);
840 }
841 }
842
843 #[no_mangle]
844 pub extern "C" fn afs_array_sub_scalar_f64(dest: *mut f64, src: *const f64, scalar: f64, n: i64) {
845 if dest.is_null() || src.is_null() {
846 return;
847 }
848 let len = bulk_len(n);
849 if len == 0 {
850 return;
851 }
852 unsafe {
853 sub_scalar_f64_impl(dest, src, scalar, len);
854 }
855 }
856
857 #[no_mangle]
858 pub extern "C" fn afs_scalar_sub_array_i32(dest: *mut i32, scalar: i32, src: *const i32, n: i64) {
859 if dest.is_null() || src.is_null() {
860 return;
861 }
862 let len = bulk_len(n);
863 if len == 0 {
864 return;
865 }
866 unsafe {
867 scalar_sub_i32_impl(dest, scalar, src, len);
868 }
869 }
870
871 #[no_mangle]
872 pub extern "C" fn afs_scalar_sub_array_f32(dest: *mut f32, scalar: f32, src: *const f32, n: i64) {
873 if dest.is_null() || src.is_null() {
874 return;
875 }
876 let len = bulk_len(n);
877 if len == 0 {
878 return;
879 }
880 unsafe {
881 scalar_sub_f32_impl(dest, scalar, src, len);
882 }
883 }
884
885 #[no_mangle]
886 pub extern "C" fn afs_scalar_sub_array_f64(dest: *mut f64, scalar: f64, src: *const f64, n: i64) {
887 if dest.is_null() || src.is_null() {
888 return;
889 }
890 let len = bulk_len(n);
891 if len == 0 {
892 return;
893 }
894 unsafe {
895 scalar_sub_f64_impl(dest, scalar, src, len);
896 }
897 }
898
899 #[no_mangle]
900 pub extern "C" fn afs_array_mul_scalar_i32(dest: *mut i32, src: *const i32, scalar: i32, n: i64) {
901 if dest.is_null() || src.is_null() {
902 return;
903 }
904 let len = bulk_len(n);
905 if len == 0 {
906 return;
907 }
908 unsafe {
909 mul_scalar_i32_impl(dest, src, scalar, len);
910 }
911 }
912
913 #[no_mangle]
914 pub extern "C" fn afs_array_mul_scalar_f32(dest: *mut f32, src: *const f32, scalar: f32, n: i64) {
915 if dest.is_null() || src.is_null() {
916 return;
917 }
918 let len = bulk_len(n);
919 if len == 0 {
920 return;
921 }
922 unsafe {
923 mul_scalar_f32_impl(dest, src, scalar, len);
924 }
925 }
926
927 #[no_mangle]
928 pub extern "C" fn afs_array_mul_scalar_f64(dest: *mut f64, src: *const f64, scalar: f64, n: i64) {
929 if dest.is_null() || src.is_null() {
930 return;
931 }
932 let len = bulk_len(n);
933 if len == 0 {
934 return;
935 }
936 unsafe {
937 mul_scalar_f64_impl(dest, src, scalar, len);
938 }
939 }
940
941 // ---- ALLOCATE ----
942
/// Allocate an array described by the given dimensions.
/// Populates the descriptor with base_addr, elem_size, rank, dims, and flags.
///
/// `dims_ptr` points to `rank` DimDescriptor values (lower and upper bounds);
/// incoming stride values are ignored and replaced with canonical column-major
/// strides.
/// `stat` is an optional pointer to a STAT variable (0 = success, nonzero = error).
///
/// If `stat` is null and allocation fails, the program aborts.
951 #[no_mangle]
952 pub extern "C" fn afs_allocate_array(
953 desc: *mut ArrayDescriptor,
954 elem_size: i64,
955 rank: i32,
956 dims_ptr: *const DimDescriptor,
957 stat: *mut i32,
958 ) {
959 if desc.is_null() {
960 if !stat.is_null() {
961 unsafe {
962 *stat = 1;
963 }
964 }
965 return;
966 }
967
968 let desc = unsafe { &mut *desc };
969
970 // Check if already allocated.
971 if desc.is_allocated() {
972 if !stat.is_null() {
973 unsafe {
974 *stat = 2;
975 } // already allocated
976 return;
977 }
978 eprintln!("ALLOCATE: array is already allocated");
979 std::process::exit(1);
980 }
981
982 // Copy dimensions.
983 desc.rank = rank;
984 desc.elem_size = elem_size;
985 desc.clear_scalar_type_tag();
986 if !dims_ptr.is_null() && rank > 0 {
987 let dims_slice = unsafe { std::slice::from_raw_parts(dims_ptr, rank as usize) };
988 for (i, dim) in dims_slice.iter().enumerate() {
989 desc.dims[i] = *dim;
990 }
991 }
992
// ALLOCATE always produces a single contiguous block, so the
// descriptor's per-dim memory stride must be the column-major
// canonical step (1 for dim 0, then the product of preceding extents).
// Compiler-generated dim_buf entries pass stride=1 across the board
// (correct only for rank 1), so fix them up here. Without this fix,
// allocatable rank-N section assignments fed afs_create_section a
// stride=1 source; the resulting section descriptor then stepped
// through memory contiguously, collapsing column-major rows into a
// single linear walk and corrupting both the LHS write and any
// subsequent RHS read.
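// Worked example (illustrative): for `allocate(a(3,4,2))` the extents are
// (3, 4, 2), so the loop below produces strides (1, 3, 12); element
// (i, j, k) then lives at linear element offset (i-1) + (j-1)*3 + (k-1)*12.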
1003 let mut running_stride: i64 = 1;
1004 for i in 0..rank as usize {
1005 desc.dims[i].stride = running_stride;
1006 running_stride = running_stride.saturating_mul(desc.dims[i].extent().max(1));
1007 }
1008
1009 // Compute total bytes.
1010 let total = desc.total_elements();
1011 let bytes = total * elem_size;
1012
1013 if bytes <= 0 {
1014 // Zero-size allocation: valid but produces a null/empty array.
1015 desc.base_addr = ptr::null_mut();
1016 desc.flags = DESC_ALLOCATED | DESC_CONTIGUOUS;
1017 if !stat.is_null() {
1018 unsafe {
1019 *stat = 0;
1020 }
1021 }
1022 return;
1023 }
1024
1025 // Allocate.
1026 let ptr = unsafe { libc_malloc(bytes as usize) };
1027 if ptr.is_null() {
1028 if !stat.is_null() {
1029 unsafe {
1030 *stat = 3;
1031 } // allocation failed
1032 return;
1033 }
1034 eprintln!("ALLOCATE: out of memory ({} bytes)", bytes);
1035 std::process::exit(1);
1036 }
1037
1038 // Zero-initialize (Fortran doesn't require this, but it's safer).
1039 unsafe {
1040 ptr::write_bytes(ptr, 0, bytes as usize);
1041 }
1042
1043 desc.base_addr = ptr;
1044 desc.flags = DESC_ALLOCATED | DESC_CONTIGUOUS;
1045
1046 if !stat.is_null() {
1047 unsafe {
1048 *stat = 0;
1049 }
1050 }
1051 }
1052
1053 /// Simplified allocate for a 1D array with given element count.
1054 /// Used by generated code for simple `allocate(a(n))` patterns.
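///
/// Illustrative call shape (assumed lowering, not verified against the code
/// generator) for `real(4), allocatable :: a(:)` followed by `allocate(a(n))`:
///
/// ```ignore
/// let mut a_desc = ArrayDescriptor::zeroed();
/// afs_allocate_1d(&mut a_desc, /* elem_size = */ 4, /* n = */ n);
/// ```
///
/// Anything that needs STAT handling goes through afs_allocate_array instead.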
1055 #[no_mangle]
1056 pub extern "C" fn afs_allocate_1d(desc: *mut ArrayDescriptor, elem_size: i64, n: i64) {
1057 let dim = DimDescriptor {
1058 lower_bound: 1,
1059 upper_bound: n,
1060 stride: 1,
1061 };
1062 afs_allocate_array(
1063 desc,
1064 elem_size,
1065 1,
1066 &dim as *const DimDescriptor,
1067 ptr::null_mut(),
1068 );
1069 }
1070
1071 /// Allocate `dest` with the same SHAPE (rank + extents) as `source`,
1072 /// but a caller-provided `elem_size` and a 1-based bound view per
1073 /// F2018 §10.1.5 / §6.5.3.5(2): elemental and relational array
1074 /// expressions yield a result whose lower bound is 1 in every
1075 /// dimension regardless of the operand's bounds. Used by the
1076 /// rank-N relational path so callees receiving the mask through
1077 /// e.g. `mask(:,:)` see a coherent rank-N descriptor instead of
1078 /// the rank-1 placeholder the old path emitted.
1079 #[no_mangle]
1080 pub extern "C" fn afs_allocate_like_with_elem_size(
1081 dest: *mut ArrayDescriptor,
1082 source: *const ArrayDescriptor,
1083 elem_size: i64,
1084 stat: *mut i32,
1085 ) {
1086 if dest.is_null() || source.is_null() {
1087 if !stat.is_null() {
1088 unsafe {
1089 *stat = 1;
1090 }
1091 }
1092 return;
1093 }
1094
1095 let source = unsafe { &*source };
1096 let mut dims = [DimDescriptor::default(); MAX_RANK];
1097 // Column-major running strides: dim[0]=1, dim[k]=Π_{j<k} extent_j.
1098 // The previous flat-1 stride caused per-dim consumers
1099 // (`for_each_reduce_along_dim`, `for_each_reduce_along_dim_with_mask`,
1100 // `lower_multi_d_section_assign`) to compute colliding byte offsets,
1101 // so for `m = (y > 3.)` over `real :: y(2,3)` only 4 of the 6 mask
1102 // bytes were read by the masked sum-along-dim helper — the
1103 // remaining two ran over the same byte twice and dropped the
1104 // column-2 hit entirely.
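// Worked example (from the case above): for `real :: y(2,3)` and
// `m = (y > 3.)`, the mask descriptor built here gets rank 2, extents (2, 3),
// lower bounds (1, 1), and strides (1, 2), so each dim-1 step advances past a
// whole two-element column.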
1105 let mut running: i64 = 1;
1106 for (i, dim) in dims.iter_mut().enumerate().take(source.rank as usize) {
1107 let extent = source.dims[i].extent();
1108 *dim = DimDescriptor {
1109 lower_bound: 1,
1110 upper_bound: extent,
1111 stride: running,
1112 };
1113 running = running.saturating_mul(extent.max(1));
1114 }
1115
1116 let dims_ptr = if source.rank > 0 {
1117 dims.as_ptr()
1118 } else {
1119 ptr::null()
1120 };
1121 afs_allocate_array(dest, elem_size, source.rank, dims_ptr, stat);
1122 }
1123
1124 /// Allocate `dest` with the same shape and element size as `source`.
1125 ///
1126 /// The resulting destination is always contiguous, even when `source`
1127 /// is a section descriptor with non-unit strides.
1128 #[no_mangle]
1129 pub extern "C" fn afs_allocate_like(
1130 dest: *mut ArrayDescriptor,
1131 source: *const ArrayDescriptor,
1132 stat: *mut i32,
1133 ) {
1134 if dest.is_null() || source.is_null() {
1135 if !stat.is_null() {
1136 unsafe {
1137 *stat = 1;
1138 }
1139 }
1140 return;
1141 }
1142
1143 let source = unsafe { &*source };
1144 let mut dims = [DimDescriptor::default(); MAX_RANK];
1145 for (i, dim) in dims.iter_mut().enumerate().take(source.rank as usize) {
1146 *dim = DimDescriptor {
1147 lower_bound: source.dims[i].lower_bound,
1148 upper_bound: source.dims[i].upper_bound,
1149 stride: 1,
1150 };
1151 }
1152
1153 let dims_ptr = if source.rank > 0 {
1154 dims.as_ptr()
1155 } else {
1156 ptr::null()
1157 };
1158 afs_allocate_array(dest, source.elem_size, source.rank, dims_ptr, stat);
1159 let dest = unsafe { &mut *dest };
1160 dest.set_scalar_type_tag(source.scalar_type_tag());
1161 dest.set_scalar_tbp_lookup_ptr(source.scalar_tbp_lookup_ptr());
1162 }
1163
1164 /// Copy array payload from `source` into an already-allocated `dest` without
1165 /// reshaping or reallocating `dest`.
1166 ///
1167 /// Used by `ALLOCATE(..., SOURCE=...)` after the destination shape has already
1168 /// been fixed by explicit bounds. On mismatch, the fresh destination allocation
1169 /// is rolled back so the overall statement still fails loudly instead of
1170 /// silently changing shape.
1171 #[no_mangle]
1172 pub extern "C" fn afs_copy_array_data(
1173 dest: *mut ArrayDescriptor,
1174 source: *const ArrayDescriptor,
1175 stat: *mut i32,
1176 ) {
1177 afs_prepare_array_copy(dest, source, stat);
1178 if !stat.is_null() {
1179 let status = unsafe { *stat };
1180 if status != 0 {
1181 return;
1182 }
1183 }
1184
1185 let dest = unsafe { &mut *dest };
1186 let source = unsafe { &*source };
1187 let bytes = source.total_bytes();
1188 if bytes > 0 && !source.base_addr.is_null() && !dest.base_addr.is_null() {
1189 // F2018 §9.7.1.2: SOURCE-expr may be a non-contiguous section
1190 // (e.g. `allocate(col, source = idx(2, 1:n))` reads only every
1191 // other element of `idx`). A flat ptr::copy treats the source
1192 // base..base+total_bytes as contiguous, picking up adjacent
1193 // dim-0 entries instead of stepping by the per-dim memory
1194 // stride. Detect non-contiguous (any source stride != the
1195 // canonical column-major step) and walk element-by-element.
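// Worked example (illustrative): with `integer :: idx(2,5)` and
// `allocate(col, source = idx(2, 1:5))`, the source section has rank 1,
// extent 5, and stride 2, so the element-wise walk below reads element
// offsets 0, 2, 4, 6, 8 from its base address instead of the first five
// contiguous slots.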
1196 let elem_size = source.elem_size;
1197 let mut canonical: i64 = 1;
1198 let mut contiguous = true;
1199 for i in 0..source.rank as usize {
1200 if source.dims[i].stride != canonical {
1201 contiguous = false;
1202 break;
1203 }
1204 canonical = canonical.saturating_mul(source.dims[i].extent().max(1));
1205 }
1206 if contiguous {
1207 unsafe {
1208 ptr::copy(source.base_addr, dest.base_addr, bytes as usize);
1209 }
1210 } else {
1211 // Walk every multi-index of the source in column-major
1212 // order and copy `elem_size` bytes per slot. Dest is
1213 // contiguous (just allocated) so its destination index
1214 // is the linear count.
1215 let rank = source.rank as usize;
1216 let extents: Vec<i64> = (0..rank).map(|i| source.dims[i].extent()).collect();
1217 let strides: Vec<i64> = (0..rank).map(|i| source.dims[i].stride).collect();
1218 let mut idx = vec![0i64; rank];
1219 let total = source.total_elements();
1220 for k in 0..total {
1221 let mut src_off: i64 = 0;
1222 for d in 0..rank {
1223 src_off += idx[d] * strides[d];
1224 }
1225 src_off *= elem_size;
1226 let dst_off = k * elem_size;
1227 unsafe {
1228 ptr::copy_nonoverlapping(
1229 source.base_addr.offset(src_off as isize),
1230 dest.base_addr.offset(dst_off as isize),
1231 elem_size as usize,
1232 );
1233 }
1234 // increment column-major: dim 0 fastest
1235 for d in 0..rank {
1236 idx[d] += 1;
1237 if idx[d] < extents[d] {
1238 break;
1239 }
1240 idx[d] = 0;
1241 }
1242 }
1243 }
1244 }
1245 dest.set_scalar_type_tag(source.scalar_type_tag());
1246 dest.set_scalar_tbp_lookup_ptr(source.scalar_tbp_lookup_ptr());
1247
1248 if !stat.is_null() {
1249 unsafe {
1250 *stat = 0;
1251 }
1252 }
1253 }
1254
1255 /// Validate `ALLOCATE(..., SOURCE=...)` array conformance after the destination
1256 /// has already been allocated with its final shape.
1257 ///
1258 /// On mismatch, the fresh destination allocation is rolled back so the overall
1259 /// statement still fails loudly instead of silently changing shape.
1260 #[no_mangle]
1261 pub extern "C" fn afs_prepare_array_copy(
1262 dest: *mut ArrayDescriptor,
1263 source: *const ArrayDescriptor,
1264 stat: *mut i32,
1265 ) {
1266 if dest.is_null() || source.is_null() {
1267 if !stat.is_null() {
1268 unsafe {
1269 *stat = 1;
1270 }
1271 return;
1272 }
1273 eprintln!("ALLOCATE SOURCE=: null descriptor");
1274 std::process::exit(1);
1275 }
1276
1277 let dest = unsafe { &mut *dest };
1278 let source = unsafe { &*source };
1279
1280 // F2018 §9.7.1.2: SOURCE-expr need only have a defined value of the
1281 // right type/kind/shape — it doesn't have to be an ALLOCATABLE.
1282 // Common case: `allocate(amat(...), source=a)` where `a` is an
1283 // assumed-shape dummy `a(:,:)`. Such dummies have flags=CONTIGUOUS
1284 // (no DESC_ALLOCATED) since they're bound to the caller's data, not
1285 // owned. Treat the source as valid as long as it has a non-null
1286 // base_addr; require DESC_ALLOCATED only on the freshly-allocated
1287 // destination.
1288 let ok = dest.is_allocated()
1289 && !source.base_addr.is_null()
1290 && dest.elem_size == source.elem_size
1291 && dest.rank == source.rank
1292 && (0..dest.rank as usize).all(|i| dest.dims[i].extent() == source.dims[i].extent());
1293
1294 if !ok {
1295 if dest.is_allocated() && !dest.base_addr.is_null() {
1296 unsafe {
1297 libc_free(dest.base_addr);
1298 }
1299 }
1300 dest.base_addr = ptr::null_mut();
1301 dest.flags &= !DESC_ALLOCATED;
1302 if !stat.is_null() {
1303 unsafe {
1304 *stat = 4;
1305 }
1306 return;
1307 }
1308 eprintln!("ALLOCATE SOURCE=: destination shape does not conform to source");
1309 std::process::exit(1);
1310 }
1311
1312 if !stat.is_null() {
1313 unsafe {
1314 *stat = 0;
1315 }
1316 }
1317 }
1318
1319 // ---- DEALLOCATE ----
1320
1321 /// Deallocate an array, freeing its memory and clearing the descriptor.
1322 ///
/// With STAT supplied, calling this on an already-deallocated descriptor is a
/// no-op (stat = 0); without STAT it is a fatal error.
1324 #[no_mangle]
1325 pub extern "C" fn afs_deallocate_array(desc: *mut ArrayDescriptor, stat: *mut i32) {
1326 if desc.is_null() {
1327 if !stat.is_null() {
1328 unsafe {
1329 *stat = 1;
1330 }
1331 }
1332 return;
1333 }
1334
1335 let desc = unsafe { &mut *desc };
1336
1337 if !desc.is_allocated() {
// Not allocated: with STAT present this is treated as a no-op (stat = 0).
1339 if !stat.is_null() {
1340 unsafe {
1341 *stat = 0;
1342 }
1343 return;
1344 }
1345 // Without STAT, deallocating an unallocated array is an error.
1346 eprintln!("DEALLOCATE: array is not allocated");
1347 std::process::exit(1);
1348 }
1349
1350 // Free the data.
1351 if !desc.base_addr.is_null() {
1352 unsafe {
1353 libc_free(desc.base_addr);
1354 }
1355 }
1356
1357 // Clear the descriptor.
1358 desc.base_addr = ptr::null_mut();
1359 desc.flags &= !DESC_ALLOCATED;
1360 desc.clear_scalar_type_tag();
1361 // Leave rank, elem_size, dims intact (they describe the shape for future allocate).
1362
1363 if !stat.is_null() {
1364 unsafe {
1365 *stat = 0;
1366 }
1367 }
1368 }
1369
1370 // ---- ALLOCATABLE ASSIGNMENT ----
1371
1372 fn descriptor_looks_sane(desc: &ArrayDescriptor) -> bool {
1373 let known_flags = DESC_ALLOCATED | DESC_CONTIGUOUS | DESC_POINTER;
1374 if desc.flags & !known_flags != 0 {
1375 return false;
1376 }
1377 if desc.rank < 0 || desc.rank as usize > MAX_RANK {
1378 return false;
1379 }
1380 if desc.elem_size < 0 {
1381 return false;
1382 }
1383 if desc.is_allocated() && desc.base_addr.is_null() {
1384 return false;
1385 }
1386 if !desc.is_allocated() && !desc.base_addr.is_null() {
1387 return false;
1388 }
1389 true
1390 }
1391
1392 /// Assign one array to another with automatic reallocation (F2003).
1393 ///
1394 /// If dest's shape doesn't match source's shape, deallocate dest and
1395 /// reallocate with source's shape. Then copy data.
1396 #[no_mangle]
1397 pub extern "C" fn afs_assign_allocatable(
1398 dest: *mut ArrayDescriptor,
1399 source: *const ArrayDescriptor,
1400 ) {
1401 if dest.is_null() || source.is_null() {
1402 return;
1403 }
1404
1405 let dest = unsafe { &mut *dest };
1406 let source = unsafe { &*source };
1407
1408 if !descriptor_looks_sane(dest) {
1409 *dest = ArrayDescriptor::zeroed();
1410 }
1411
1412 if !source.is_allocated() && source.base_addr.is_null() {
1413 if dest.is_allocated() && !dest.base_addr.is_null() {
1414 unsafe {
1415 libc_free(dest.base_addr);
1416 }
1417 }
1418 *dest = ArrayDescriptor::zeroed();
1419 return;
1420 }
1421
1422 // Check if shapes match.
1423 let shapes_match = dest.rank == source.rank && {
1424 (0..dest.rank as usize).all(|i| dest.dims[i].extent() == source.dims[i].extent())
1425 };
1426
1427 if !shapes_match || !dest.is_allocated() {
1428 // Deallocate dest if allocated.
1429 if dest.is_allocated() && !dest.base_addr.is_null() {
1430 unsafe {
1431 libc_free(dest.base_addr);
1432 }
1433 dest.base_addr = ptr::null_mut();
1434 dest.flags &= !DESC_ALLOCATED;
1435 }
1436
1437 // Allocate with source's shape, but compute canonical
1438 // column-major strides (1, ext_0, ext_0*ext_1, ...) — the
1439 // dest is freshly contiguous, so per-dim memory step must
1440 // match Fortran's column-major convention used by
1441 // afs_create_section / load_rank1_array_desc_elem. Setting
1442 // stride=1 across the board collapsed dim_1+ accesses onto
1443 // the dim_0 axis (e.g. allocatable A = transpose(reshape(...))
1444 // produced descriptor with stride=(1,1) and any subsequent
1445 // assumed-shape pass read overlapping bytes per "column").
1446 dest.rank = source.rank;
1447 dest.elem_size = source.elem_size;
1448 let mut running_stride: i64 = 1;
1449 for i in 0..source.rank as usize {
1450 dest.dims[i] = source.dims[i];
1451 dest.dims[i].stride = running_stride;
1452 running_stride = running_stride.saturating_mul(source.dims[i].extent().max(1));
1453 }
1454
1455 let bytes = dest.total_bytes();
1456 if bytes > 0 {
1457 let ptr = unsafe { libc_malloc(bytes as usize) };
1458 if ptr.is_null() {
1459 eprintln!("ALLOCATE (assignment): out of memory ({} bytes)", bytes);
1460 std::process::exit(1);
1461 }
1462 dest.base_addr = ptr;
1463 }
1464 dest.flags = DESC_ALLOCATED | DESC_CONTIGUOUS;
1465 }
1466
1467 // Copy data. Source may be non-contiguous (e.g. result of
1468 // transpose() returns a descriptor with reversed dim strides
1469 // pointing at the original buffer). A flat ptr::copy of
1470 // total_bytes from source.base_addr would drag adjacent bytes
1471 // forward without honoring per-dim strides — the same class of
1472 // bug as the original afs_copy_array_data flat copy. Detect
1473 // non-contiguous and walk every multi-index column-major.
1474 //
1475 // We only treat the source as non-contiguous when at least one
1476 // dim's stride is *strictly greater* than its canonical
1477 // column-major step. Strides smaller than canonical (e.g.
1478 // afs_matmul's 2x2 result emitted with stride=(1,1) instead of
1479 // (1,2)) describe an internally inconsistent descriptor whose
1480 // base_addr still points at a flat contiguous buffer; walking
1481 // those would re-read the same byte offset twice and drop the
1482 // last element. The conservative choice is the flat copy that
1483 // mirrors total_bytes — which the previous unconditional ptr::copy
1484 // did silently for both kinds of source.
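// Worked example (illustrative): `a = transpose(b)` for `real :: b(2,3)`
// hands us a 3x2 source view with strides (2, 1) over b's buffer; the
// canonical steps for a 3x2 array are (1, 3), so dim 0's stride of 2 trips
// the strided path and the copy walks element offsets 0, 2, 4, 1, 3, 5 in
// column-major order.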
1485 let bytes = source.total_bytes();
1486 if bytes > 0 && !source.base_addr.is_null() && !dest.base_addr.is_null() {
1487 let elem_size = source.elem_size;
1488 let mut canonical: i64 = 1;
1489 let mut strided = false;
1490 for i in 0..source.rank as usize {
1491 if source.dims[i].stride > canonical {
1492 strided = true;
1493 break;
1494 }
1495 canonical = canonical.saturating_mul(source.dims[i].extent().max(1));
1496 }
1497 if !strided {
1498 unsafe {
1499 ptr::copy(source.base_addr, dest.base_addr, bytes as usize);
1500 }
1501 } else {
1502 let rank = source.rank as usize;
1503 let extents: Vec<i64> = (0..rank).map(|i| source.dims[i].extent()).collect();
1504 let strides: Vec<i64> = (0..rank).map(|i| source.dims[i].stride).collect();
1505 let mut idx = vec![0i64; rank];
1506 let total = source.total_elements();
1507 for k in 0..total {
1508 let mut src_off: i64 = 0;
1509 for d in 0..rank {
1510 src_off += idx[d] * strides[d];
1511 }
1512 src_off *= elem_size;
1513 let dst_off = k * elem_size;
1514 unsafe {
1515 ptr::copy_nonoverlapping(
1516 source.base_addr.offset(src_off as isize),
1517 dest.base_addr.offset(dst_off as isize),
1518 elem_size as usize,
1519 );
1520 }
1521 for d in 0..rank {
1522 idx[d] += 1;
1523 if idx[d] < extents[d] {
1524 break;
1525 }
1526 idx[d] = 0;
1527 }
1528 }
1529 }
1530 }
1531 dest.set_scalar_type_tag(source.scalar_type_tag());
1532 dest.set_scalar_tbp_lookup_ptr(source.scalar_tbp_lookup_ptr());
1533 }
1534
1535 /// Element-converting allocatable assignment.
1536 ///
1537 /// F2018 §10.2.1.3: when the LHS and RHS of an array assignment have
1538 /// different numeric element types, each element is converted to the
1539 /// LHS type. This entry point performs that conversion when the source
1540 /// descriptor's element kind differs from the destination's.
1541 ///
1542 /// kind_tag: 0=i8, 1=i16, 2=i32, 3=i64, 4=f32, 5=f64
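///
/// Illustrative example: assigning an `integer(4)` array to a `real(8)`
/// allocatable arrives here with `src_kind_tag = 2` and `dest_kind_tag = 5`;
/// each source element is read as i32, widened through f64, and stored as an
/// 8-byte f64 in the (re)allocated destination.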
1543 #[no_mangle]
1544 pub extern "C" fn afs_assign_allocatable_convert(
1545 dest: *mut ArrayDescriptor,
1546 source: *const ArrayDescriptor,
1547 dest_kind_tag: i32,
1548 src_kind_tag: i32,
1549 ) {
1550 if dest.is_null() || source.is_null() {
1551 return;
1552 }
1553 let dest_ref = unsafe { &mut *dest };
1554 let source_ref = unsafe { &*source };
1555
1556 if !descriptor_looks_sane(dest_ref) {
1557 *dest_ref = ArrayDescriptor::zeroed();
1558 }
1559
1560 if !source_ref.is_allocated() && source_ref.base_addr.is_null() {
1561 if dest_ref.is_allocated() && !dest_ref.base_addr.is_null() {
1562 unsafe {
1563 libc_free(dest_ref.base_addr);
1564 }
1565 }
1566 *dest_ref = ArrayDescriptor::zeroed();
1567 return;
1568 }
1569
1570 let dest_elem_size: i64 = match dest_kind_tag {
1571 0 => 1,
1572 1 => 2,
1573 2 | 4 => 4,
1574 3 | 5 => 8,
1575 _ => return,
1576 };
1577
1578 let shapes_match = dest_ref.rank == source_ref.rank
1579 && dest_ref.elem_size == dest_elem_size
1580 && (0..dest_ref.rank as usize)
1581 .all(|i| dest_ref.dims[i].extent() == source_ref.dims[i].extent());
1582
1583 if !shapes_match || !dest_ref.is_allocated() {
1584 if dest_ref.is_allocated() && !dest_ref.base_addr.is_null() {
1585 unsafe {
1586 libc_free(dest_ref.base_addr);
1587 }
1588 dest_ref.base_addr = ptr::null_mut();
1589 dest_ref.flags &= !DESC_ALLOCATED;
1590 }
1591 dest_ref.rank = source_ref.rank;
1592 dest_ref.elem_size = dest_elem_size;
1593 // Canonical column-major strides — see matching note in
1594 // afs_assign_allocatable. dest is freshly contiguous; the
1595 // per-dim memory step must be (1, ext_0, ext_0*ext_1, ...).
1596 let mut running_stride: i64 = 1;
1597 for i in 0..source_ref.rank as usize {
1598 dest_ref.dims[i] = source_ref.dims[i];
1599 dest_ref.dims[i].stride = running_stride;
1600 running_stride =
1601 running_stride.saturating_mul(source_ref.dims[i].extent().max(1));
1602 }
1603 let bytes = dest_ref.total_bytes();
1604 if bytes > 0 {
1605 let ptr = unsafe { libc_malloc(bytes as usize) };
1606 if ptr.is_null() {
1607 eprintln!("ALLOCATE (assignment): out of memory ({} bytes)", bytes);
1608 std::process::exit(1);
1609 }
1610 dest_ref.base_addr = ptr;
1611 }
1612 dest_ref.flags = DESC_ALLOCATED | DESC_CONTIGUOUS;
1613 }
1614
1615 let n: usize = (0..source_ref.rank as usize)
1616 .map(|i| source_ref.dims[i].extent() as usize)
1617 .product();
1618 if n == 0 || source_ref.base_addr.is_null() || dest_ref.base_addr.is_null() {
1619 return;
1620 }
1621
1622 let src_p = source_ref.base_addr;
1623 let dst_p = dest_ref.base_addr;
1624 let src_elem_size: i64 = match src_kind_tag {
1625 0 => 1,
1626 1 => 2,
1627 2 | 4 => 4,
1628 3 | 5 => 8,
1629 _ => return,
1630 };
1631 // Source may be non-contiguous (e.g. transpose result, section).
1632 // Walk each multi-index column-major and apply per-dim strides.
1633 // Mirror the same-class detection used in afs_assign_allocatable
1634 // and afs_copy_array_data: only apply per-dim strides when at
1635 // least one stride is *strictly greater* than its canonical
1636 // column-major step. A stride below canonical describes a
1637 // malformed descriptor (e.g. a 2x2 matmul result with stride=(1,1)
1638 // instead of (1,2)) whose underlying buffer is still flat
1639 // contiguous; walking those re-reads the same offset twice.
1640 let rank = source_ref.rank as usize;
1641 let extents: Vec<i64> = (0..rank).map(|i| source_ref.dims[i].extent()).collect();
1642 let raw_strides: Vec<i64> = (0..rank).map(|i| source_ref.dims[i].stride).collect();
1643 let mut canonical_step: i64 = 1;
1644 let mut canonical: Vec<i64> = Vec::with_capacity(rank);
1645 let mut strided = false;
1646 for d in 0..rank {
1647 canonical.push(canonical_step);
1648 if raw_strides[d] > canonical_step {
1649 strided = true;
1650 }
1651 canonical_step = canonical_step.saturating_mul(extents[d].max(1));
1652 }
1653 let strides: &[i64] = if strided { &raw_strides } else { &canonical };
1654 let mut idx = vec![0i64; rank];
1655 for k in 0..n {
1656 let mut src_off_elems: i64 = 0;
1657 for d in 0..rank {
1658 src_off_elems += idx[d] * strides[d];
1659 }
1660 let src_byte_off = src_off_elems * src_elem_size;
1661 let src_val_f64: f64 = unsafe {
1662 match src_kind_tag {
1663 0 => *(src_p.offset(src_byte_off as isize) as *const i8) as f64,
1664 1 => *(src_p.offset(src_byte_off as isize) as *const i16) as f64,
1665 2 => *(src_p.offset(src_byte_off as isize) as *const i32) as f64,
1666 3 => *(src_p.offset(src_byte_off as isize) as *const i64) as f64,
1667 4 => *(src_p.offset(src_byte_off as isize) as *const f32) as f64,
1668 5 => *(src_p.offset(src_byte_off as isize) as *const f64),
1669 _ => return,
1670 }
1671 };
1672 unsafe {
1673 match dest_kind_tag {
1674 0 => *(dst_p.add(k) as *mut i8) = src_val_f64 as i8,
1675 1 => *(dst_p.add(2 * k) as *mut i16) = src_val_f64 as i16,
1676 2 => *(dst_p.add(4 * k) as *mut i32) = src_val_f64 as i32,
1677 3 => *(dst_p.add(8 * k) as *mut i64) = src_val_f64 as i64,
1678 4 => *(dst_p.add(4 * k) as *mut f32) = src_val_f64 as f32,
1679 5 => *(dst_p.add(8 * k) as *mut f64) = src_val_f64,
1680 _ => return,
1681 }
1682 }
1683 for d in 0..rank {
1684 idx[d] += 1;
1685 if idx[d] < extents[d] {
1686 break;
1687 }
1688 idx[d] = 0;
1689 }
1690 }
1691 }
1692
1693 // ---- MOVE_ALLOC ----
1694
1695 /// Transfer allocation from `from` to `to` (F2003 MOVE_ALLOC).
1696 ///
1697 /// `to` is deallocated if allocated, then receives `from`'s descriptor.
1698 /// `from` is cleared (becomes unallocated).
1699 #[no_mangle]
1700 pub extern "C" fn afs_move_alloc(from: *mut ArrayDescriptor, to: *mut ArrayDescriptor) {
1701 if from.is_null() || to.is_null() {
1702 return;
1703 }
1704
1705 let from_desc = unsafe { &mut *from };
1706 let to_desc = unsafe { &mut *to };
1707
1708 // Deallocate `to` if allocated.
1709 if to_desc.is_allocated() && !to_desc.base_addr.is_null() {
1710 unsafe {
1711 libc_free(to_desc.base_addr);
1712 }
1713 }
1714
1715 // Copy descriptor from `from` to `to`.
1716 *to_desc = from_desc.clone();
1717
1718 // Clear `from`.
1719 from_desc.base_addr = ptr::null_mut();
1720 from_desc.flags &= !DESC_ALLOCATED;
1721 from_desc.clear_scalar_type_tag();
1722 }
1723
1724 // ---- ALLOCATED INTRINSIC ----
1725
1726 /// Check if an array is allocated. Returns 1 (true) or 0 (false).
1727 #[no_mangle]
1728 pub extern "C" fn afs_allocated(desc: *const ArrayDescriptor) -> i32 {
1729 if desc.is_null() {
1730 return 0;
1731 }
1732 unsafe { (*desc).is_allocated() as i32 }
1733 }
1734
1735 // ---- ARRAY SECTIONS ----
1736
/// One dimension of a section request passed to `afs_create_section`: the
/// start index, end index, and stride along that dimension. A stride of 0
/// marks a rank-reducing scalar subscript (see below).
#[repr(C)]
#[derive(Clone, Copy)]
pub struct SectionSpec {
pub start: i64,
pub end: i64,
pub stride: i64,
}

/// Create a section descriptor that views into an existing array.
///
/// `specs` is an array of SectionSpec values (one per dimension), specifying
/// the start, end, and stride of the section. The result descriptor points
/// into the source's data with adjusted base_addr and strides.
#[no_mangle]
1751 pub extern "C" fn afs_create_section(
1752 source: *const ArrayDescriptor,
1753 result: *mut ArrayDescriptor,
1754 specs: *const SectionSpec,
1755 n_dims: i32,
1756 ) {
1757 if source.is_null() || result.is_null() || specs.is_null() {
1758 return;
1759 }
1760
1761 let source = unsafe { &*source };
1762 let result = unsafe { &mut *result };
1763 let specs_slice = unsafe { std::slice::from_raw_parts(specs, n_dims as usize) };
1764
1765 result.elem_size = source.elem_size;
result.flags = DESC_CONTIGUOUS; // provisional; cleared below if the section is not contiguous
1767 // Don't set DESC_ALLOCATED — section doesn't own the data.
1768
1769 // Compute base address offset and new dims.
1770 //
1771 // The descriptor convention here is that `dim[k].stride` already encodes
1772 // the *memory step in elements* between adjacent positions along dim k —
1773 // see materialize_array_descriptor_for_info in src/ir/lower.rs which
1774 // builds dim[k].stride = product(extents[0..k]) for a contiguous array.
1775 // So byte_offset and surviving-dim memory strides are computed directly
1776 // from src_dim.stride; no extra column-major multiplier is needed.
1777 //
1778 // SectionSpec.stride == 0 is a sentinel for *rank-reducing* scalar
1779 // selection (e.g. the `1` in `y(1,:)`). Those dims contribute to the
1780 // base offset but do NOT appear in the result descriptor.
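// Worked example (illustrative): `y(1, :)` over `real :: y(2,3)` arrives as
//     let specs = [
//         SectionSpec { start: 1, end: 1, stride: 0 }, // scalar subscript, rank-reducing
//         SectionSpec { start: 1, end: 3, stride: 1 }, // full dim 1
//     ];
// Dim 0 contributes only to the base offset; the result is rank 1 with
// extent 3 and memory stride 2 (y's dim-1 stride), based at y(1,1).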
1781 let mut byte_offset: i64 = 0;
1782 let mut result_rank: i32 = 0;
1783
1784 for (i, spec) in specs_slice.iter().enumerate() {
1785 let src_dim = &source.dims[i];
1786
1787 // Offset from source lower bound to section start.
1788 let start_idx = spec.start - src_dim.lower_bound;
1789 byte_offset += start_idx * src_dim.stride * source.elem_size;
1790
1791 if spec.stride != 0 {
1792 // Slice: keep this dim. Extent = max(0, (end - start) / stride + 1).
1793 // For negative strides, start > end and (end-start)/stride is positive.
1794 // For a positive stride where start > end, result is empty (extent 0).
1795 let extent = if (spec.stride > 0 && spec.start > spec.end)
1796 || (spec.stride < 0 && spec.start < spec.end)
1797 {
1798 0 // empty section
1799 } else {
1800 (spec.end - spec.start) / spec.stride + 1
1801 };
1802 result.dims[result_rank as usize] = DimDescriptor {
1803 lower_bound: 1, // sections are always 1-based
1804 upper_bound: extent,
1805 stride: src_dim.stride * spec.stride,
1806 };
1807 result_rank += 1;
1808 }
1809 }
1810
1811 result.rank = result_rank;
1812
1813 // Result base_addr = source base_addr + offset.
1814 if !source.base_addr.is_null() {
1815 // byte_offset can be negative for negative-stride sections.
1816 result.base_addr = unsafe { source.base_addr.offset(byte_offset as isize) };
1817 } else {
1818 result.base_addr = ptr::null_mut();
1819 }
1820
1821 // Check contiguity: contiguous iff every surviving dim has stride 1.
1822 let is_contig = (0..result_rank as usize).all(|i| result.dims[i].stride == 1);
1823 if !is_contig {
1824 result.flags &= !DESC_CONTIGUOUS;
1825 }
1826 }
1827
1828 // ---- libc interop ----
1829
1830 extern "C" {
1831 fn malloc(size: usize) -> *mut u8;
1832 fn free(ptr: *mut u8);
1833 }
1834
1835 unsafe fn libc_malloc(size: usize) -> *mut u8 {
1836 malloc(size)
1837 }
1838
1839 unsafe fn libc_free(ptr: *mut u8) {
1840 free(ptr)
1841 }
1842
1843 #[cfg(test)]
1844 mod tests {
1845 use super::*;
1846
1847 #[test]
1848 fn allocate_1d() {
1849 let mut desc = ArrayDescriptor::zeroed();
1850 afs_allocate_1d(&mut desc, 4, 10);
1851 assert!(desc.is_allocated());
1852 assert_eq!(desc.rank, 1);
1853 assert_eq!(desc.elem_size, 4);
1854 assert_eq!(desc.total_elements(), 10);
1855 assert!(!desc.base_addr.is_null());
1856 afs_deallocate_array(&mut desc, ptr::null_mut());
1857 assert!(!desc.is_allocated());
1858 assert!(desc.base_addr.is_null());
1859 }
1860
1861 #[test]
1862 fn allocate_2d() {
1863 let mut desc = ArrayDescriptor::zeroed();
1864 let dims = [
1865 DimDescriptor {
1866 lower_bound: 1,
1867 upper_bound: 3,
1868 stride: 1,
1869 },
1870 DimDescriptor {
1871 lower_bound: 1,
1872 upper_bound: 4,
1873 stride: 1,
1874 },
1875 ];
1876 afs_allocate_array(&mut desc, 8, 2, dims.as_ptr(), ptr::null_mut());
1877 assert!(desc.is_allocated());
1878 assert_eq!(desc.total_elements(), 12);
1879 assert_eq!(desc.total_bytes(), 96); // 12 * 8
1880 afs_deallocate_array(&mut desc, ptr::null_mut());
1881 }
1882
1883 #[test]
1884 fn allocate_with_stat() {
1885 let mut desc = ArrayDescriptor::zeroed();
1886 let mut stat: i32 = -1;
1887 afs_allocate_1d(&mut desc, 4, 10);
1888 // Allocating again should fail with stat.
1889 let dim = DimDescriptor {
1890 lower_bound: 1,
1891 upper_bound: 10,
1892 stride: 1,
1893 };
1894 afs_allocate_array(&mut desc, 4, 1, &dim, &mut stat);
1895 assert_eq!(stat, 2); // already allocated
1896 afs_deallocate_array(&mut desc, ptr::null_mut());
1897 }
1898
1899 #[test]
1900 fn allocate_like_preserves_shape_and_forces_contiguous_stride() {
1901 let mut source = ArrayDescriptor::zeroed();
1902 source.elem_size = 8;
1903 source.rank = 2;
1904 source.flags = DESC_ALLOCATED;
1905 source.dims[0] = DimDescriptor {
1906 lower_bound: -2,
1907 upper_bound: 1,
1908 stride: 3,
1909 };
1910 source.dims[1] = DimDescriptor {
1911 lower_bound: 4,
1912 upper_bound: 6,
1913 stride: 5,
1914 };
1915
1916 let mut dest = ArrayDescriptor::zeroed();
1917 let mut stat = -1;
1918 afs_allocate_like(&mut dest, &source, &mut stat);
1919 assert_eq!(stat, 0);
1920 assert!(dest.is_allocated());
1921 assert_eq!(dest.elem_size, 8);
1922 assert_eq!(dest.rank, 2);
1923 // Bounds carry over from source. Strides are canonical
1924 // column-major (stride[0]=1, stride[k]=Π extent[0..k]) — see
1925 // matching note in afs_allocate_array. The previous flat-1
1926 // strides made downstream `afs_create_section` compute
1927 // colliding byte offsets for any rank-2 reshape.
1928 assert_eq!(dest.dims[0].lower_bound, -2);
1929 assert_eq!(dest.dims[0].upper_bound, 1);
1930 assert_eq!(dest.dims[0].stride, 1);
1931 assert_eq!(dest.dims[1].lower_bound, 4);
1932 assert_eq!(dest.dims[1].upper_bound, 6);
1933 // dim[1].stride = extent[0] = 1-(-2)+1 = 4
1934 assert_eq!(dest.dims[1].stride, 4);
1935
1936 afs_deallocate_array(&mut dest, ptr::null_mut());
1937 }
1938
1939 #[test]
1940 fn copy_array_data_preserves_explicit_destination_shape() {
1941 let mut source = ArrayDescriptor::zeroed();
1942 let mut dest = ArrayDescriptor::zeroed();
1943 let mut stat = -1;
1944
1945 afs_allocate_1d(&mut source, 4, 2);
1946 afs_allocate_1d(&mut dest, 4, 2);
1947 unsafe {
1948 let src = source.base_addr as *mut i32;
1949 *src.add(0) = 4;
1950 *src.add(1) = 5;
1951 }
1952
1953 afs_copy_array_data(&mut dest, &source, &mut stat);
1954 assert_eq!(stat, 0);
1955 assert_eq!(dest.total_elements(), 2);
1956 unsafe {
1957 let data = dest.base_addr as *const i32;
1958 assert_eq!(*data.add(0), 4);
1959 assert_eq!(*data.add(1), 5);
1960 }
1961
1962 afs_deallocate_array(&mut source, ptr::null_mut());
1963 afs_deallocate_array(&mut dest, ptr::null_mut());
1964 }
1965
1966 #[test]
1967 fn copy_array_data_rolls_back_on_shape_mismatch() {
1968 let mut source = ArrayDescriptor::zeroed();
1969 let mut dest = ArrayDescriptor::zeroed();
1970 let mut stat = -1;
1971
1972 afs_allocate_1d(&mut source, 4, 3);
1973 afs_allocate_1d(&mut dest, 4, 2);
1974 afs_copy_array_data(&mut dest, &source, &mut stat);
1975 assert_eq!(stat, 4);
1976 assert!(!dest.is_allocated());
1977 assert!(dest.base_addr.is_null());
1978
1979 afs_deallocate_array(&mut source, ptr::null_mut());
1980 }
1981
1982 #[test]
1983 fn move_alloc() {
1984 let mut from = ArrayDescriptor::zeroed();
1985 let mut to = ArrayDescriptor::zeroed();
1986 afs_allocate_1d(&mut from, 4, 100);
1987 assert!(from.is_allocated());
1988 assert!(!to.is_allocated());
1989
1990 afs_move_alloc(&mut from, &mut to);
1991 assert!(!from.is_allocated());
1992 assert!(to.is_allocated());
1993 assert_eq!(to.total_elements(), 100);
1994
1995 afs_deallocate_array(&mut to, ptr::null_mut());
1996 }
1997
1998 #[test]
1999 fn move_alloc_preserves_scalar_type_tag() {
2000 let mut from = ArrayDescriptor::zeroed();
2001 let mut to = ArrayDescriptor::zeroed();
2002 afs_allocate_array(&mut from, 8, 0, ptr::null(), ptr::null_mut());
2003 from.set_scalar_type_tag(77);
2004
2005 afs_move_alloc(&mut from, &mut to);
2006 assert_eq!(to.scalar_type_tag(), 77);
2007 assert_eq!(from.scalar_type_tag(), 0);
2008
2009 afs_deallocate_array(&mut to, ptr::null_mut());
2010 }
2011
2012 #[test]
2013 fn allocated_intrinsic() {
2014 let mut desc = ArrayDescriptor::zeroed();
2015 assert_eq!(afs_allocated(&desc), 0);
2016 afs_allocate_1d(&mut desc, 4, 10);
2017 assert_eq!(afs_allocated(&desc), 1);
2018 afs_deallocate_array(&mut desc, ptr::null_mut());
2019 assert_eq!(afs_allocated(&desc), 0);
2020 }
2021
2022 #[test]
2023 fn assign_allocatable() {
2024 let mut source = ArrayDescriptor::zeroed();
2025 let mut dest = ArrayDescriptor::zeroed();
2026
2027 afs_allocate_1d(&mut source, 4, 5);
2028 // Write some data into source.
2029 unsafe {
2030 let data = source.base_addr as *mut i32;
2031 for i in 0..5 {
2032 *data.add(i) = (i + 1) as i32;
2033 }
2034 }
2035
2036 afs_assign_allocatable(&mut dest, &source);
2037 assert!(dest.is_allocated());
2038 assert_eq!(dest.total_elements(), 5);
2039
2040 // Verify data was copied.
2041 unsafe {
2042 let data = dest.base_addr as *const i32;
2043 for i in 0..5 {
2044 assert_eq!(*data.add(i), (i + 1) as i32);
2045 }
2046 }
2047
2048 afs_deallocate_array(&mut source, ptr::null_mut());
2049 afs_deallocate_array(&mut dest, ptr::null_mut());
2050 }
2051
2052 #[test]
2053 fn assign_allocatable_preserves_scalar_type_tag() {
2054 let mut source = ArrayDescriptor::zeroed();
2055 let mut dest = ArrayDescriptor::zeroed();
2056
2057 afs_allocate_array(&mut source, 8, 0, ptr::null(), ptr::null_mut());
2058 source.set_scalar_type_tag(42);
2059
2060 afs_assign_allocatable(&mut dest, &source);
2061 assert_eq!(dest.scalar_type_tag(), 42);
2062
2063 afs_deallocate_array(&mut source, ptr::null_mut());
2064 afs_deallocate_array(&mut dest, ptr::null_mut());
2065 }
2066
2067 #[test]
2068 fn assign_allocatable_from_unallocated_source_clears_dest() {
2069 let source = ArrayDescriptor::zeroed();
2070 let mut dest = ArrayDescriptor::zeroed();
2071
2072 afs_allocate_1d(&mut dest, 4, 3);
2073 assert!(dest.is_allocated());
2074
2075 afs_assign_allocatable(&mut dest, &source);
2076 assert!(!dest.is_allocated());
2077 assert!(dest.base_addr.is_null());
2078 assert_eq!(dest.rank, 0);
2079 }
2080
2081 #[test]
2082 fn assign_allocatable_ignores_invalid_garbage_dest_for_unallocated_source() {
2083 let source = ArrayDescriptor::zeroed();
2084 let mut dest = ArrayDescriptor::zeroed();
2085
2086 dest.flags = DESC_ALLOCATED;
2087 dest.rank = 1;
2088 dest.elem_size = 4;
2089 dest.dims[0] = DimDescriptor {
2090 lower_bound: 1,
2091 upper_bound: 0,
2092 stride: 1,
2093 };
2094
2095 afs_assign_allocatable(&mut dest, &source);
2096 assert!(!dest.is_allocated());
2097 assert!(dest.base_addr.is_null());
2098 assert_eq!(dest.rank, 0);
2099 }
2100
2101 #[test]
2102 fn assign_allocatable_copies_from_nonowning_section_descriptor() {
2103 let mut backing = ArrayDescriptor::zeroed();
2104 let mut section = ArrayDescriptor::zeroed();
2105 let mut dest = ArrayDescriptor::zeroed();
2106
2107 afs_allocate_1d(&mut backing, 4, 4);
2108 unsafe {
2109 let data = backing.base_addr as *mut i32;
2110 *data.add(0) = 10;
2111 *data.add(1) = 20;
2112 *data.add(2) = 30;
2113 *data.add(3) = 40;
2114 }
2115
2116 let spec = SectionSpec {
2117 start: 2,
2118 end: 3,
2119 stride: 1,
2120 };
2121 afs_create_section(&backing, &mut section, &spec, 1);
2122 assert!(!section.is_allocated());
2123 assert!(!section.base_addr.is_null());
2124
2125 afs_assign_allocatable(&mut dest, &section);
2126 assert!(dest.is_allocated());
2127 assert_eq!(dest.total_elements(), 2);
2128 unsafe {
2129 let data = dest.base_addr as *const i32;
2130 assert_eq!(*data.add(0), 20);
2131 assert_eq!(*data.add(1), 30);
2132 }
2133
2134 afs_deallocate_array(&mut backing, ptr::null_mut());
2135 afs_deallocate_array(&mut dest, ptr::null_mut());
2136 }
2137
2138 #[test]
2139 fn zero_size_allocation() {
2140 let mut desc = ArrayDescriptor::zeroed();
2141 afs_allocate_1d(&mut desc, 4, 0);
2142 assert!(desc.is_allocated());
2143 assert_eq!(desc.total_elements(), 0);
2144 afs_deallocate_array(&mut desc, ptr::null_mut());
2145 }
2146
2147 #[test]
2148 fn fill_i32_bulk_kernel() {
2149 let mut data = [0_i32; 8];
2150 afs_fill_i32(data.as_mut_ptr(), data.len() as i64, 7);
2151 assert_eq!(data, [7, 7, 7, 7, 7, 7, 7, 7]);
2152 }
2153
2154 #[test]
2155 fn array_add_i32_bulk_kernel() {
2156 let lhs = [1_i32, 2, 3, 4, 5, 6, 7, 8];
2157 let rhs = [10_i32, 20, 30, 40, 50, 60, 70, 80];
2158 let mut out = [0_i32; 8];
2159 afs_array_add_i32(
2160 out.as_mut_ptr(),
2161 lhs.as_ptr(),
2162 rhs.as_ptr(),
2163 out.len() as i64,
2164 );
2165 assert_eq!(out, [11, 22, 33, 44, 55, 66, 77, 88]);
2166 }
2167
2168 #[test]
2169 fn array_add_f64_bulk_kernel() {
2170 let lhs = [1.5_f64, 2.5, 3.5, 4.5];
2171 let rhs = [10.0_f64, 20.0, 30.0, 40.0];
2172 let mut out = [0.0_f64; 4];
2173 afs_array_add_f64(
2174 out.as_mut_ptr(),
2175 lhs.as_ptr(),
2176 rhs.as_ptr(),
2177 out.len() as i64,
2178 );
2179 assert_eq!(out, [11.5, 22.5, 33.5, 44.5]);
2180 }
2181
2182 #[test]
2183 fn array_sub_i32_bulk_kernel() {
2184 let lhs = [11_i32, 22, 33, 44, 55, 66, 77, 88];
2185 let rhs = [1_i32, 2, 3, 4, 5, 6, 7, 8];
2186 let mut out = [0_i32; 8];
2187 afs_array_sub_i32(
2188 out.as_mut_ptr(),
2189 lhs.as_ptr(),
2190 rhs.as_ptr(),
2191 out.len() as i64,
2192 );
2193 assert_eq!(out, [10, 20, 30, 40, 50, 60, 70, 80]);
2194 }
2195
2196 #[test]
2197 fn array_mul_f32_bulk_kernel() {
2198 let lhs = [1.0_f32, 2.0, 3.0, 4.0];
2199 let rhs = [2.0_f32, 3.0, 4.0, 5.0];
2200 let mut out = [0.0_f32; 4];
2201 afs_array_mul_f32(
2202 out.as_mut_ptr(),
2203 lhs.as_ptr(),
2204 rhs.as_ptr(),
2205 out.len() as i64,
2206 );
2207 assert_eq!(out, [2.0, 6.0, 12.0, 20.0]);
2208 }
2209
2210 #[test]
2211 fn array_add_scalar_i32_bulk_kernel() {
2212 let src = [1_i32, 2, 3, 4, 5, 6, 7, 8];
2213 let mut out = [0_i32; 8];
2214 afs_array_add_scalar_i32(out.as_mut_ptr(), src.as_ptr(), 5, out.len() as i64);
2215 assert_eq!(out, [6, 7, 8, 9, 10, 11, 12, 13]);
2216 }
2217
2218 #[test]
2219 fn scalar_sub_array_i32_bulk_kernel() {
2220 let src = [1_i32, 2, 3, 4, 5, 6, 7, 8];
2221 let mut out = [0_i32; 8];
2222 afs_scalar_sub_array_i32(out.as_mut_ptr(), 20, src.as_ptr(), out.len() as i64);
2223 assert_eq!(out, [19, 18, 17, 16, 15, 14, 13, 12]);
2224 }
2225
2226 #[test]
2227 fn array_mul_scalar_f64_bulk_kernel() {
2228 let src = [1.5_f64, 2.5, 3.5, 4.5];
2229 let mut out = [0.0_f64; 4];
2230 afs_array_mul_scalar_f64(out.as_mut_ptr(), src.as_ptr(), 2.0, out.len() as i64);
2231 assert_eq!(out, [3.0, 5.0, 7.0, 9.0]);
2232 }
2233 }
2234
2235 // ---- Array query intrinsics ----
2236
2237 /// SIZE(array) — total number of elements.
2238 #[no_mangle]
2239 pub extern "C" fn afs_array_size(desc: *const ArrayDescriptor) -> i64 {
2240 if desc.is_null() {
2241 return 0;
2242 }
2243 unsafe { (*desc).total_elements() }
2244 }
2245
2246 /// SIZE(array, dim) — number of elements along dimension `dim` (1-based).
2247 #[no_mangle]
2248 pub extern "C" fn afs_array_size_dim(desc: *const ArrayDescriptor, dim: i32) -> i64 {
2249 if desc.is_null() || dim < 1 {
2250 return 0;
2251 }
2252 let d = unsafe { &*desc };
2253 let idx = (dim - 1) as usize;
2254 if idx < d.rank as usize {
2255 d.dims[idx].extent()
2256 } else {
2257 0
2258 }
2259 }
2260
2261 /// SHAPE(array) → fresh rank-1 default-integer (i32) array of length
2262 /// `rank`, holding each dimension's extent. Allocates the destination
2263 /// via `afs_allocate_array`. F2018 §16.9.207.
2264 #[no_mangle]
2265 pub extern "C" fn afs_array_shape_int4(dst: *mut ArrayDescriptor, src: *const ArrayDescriptor) {
2266 if dst.is_null() || src.is_null() {
2267 return;
2268 }
2269 let s = unsafe { &*src };
2270 let n = s.rank as i64;
2271 let dim = DimDescriptor {
2272 lower_bound: 1,
2273 upper_bound: n,
2274 stride: 1,
2275 };
2276 afs_allocate_array(dst, 4, 1, &dim as *const DimDescriptor, ptr::null_mut());
2277 let d = unsafe { &mut *dst };
2278 let base = d.base_addr as *mut i32;
2279 for i in 0..s.rank as usize {
2280 unsafe {
2281 base.add(i).write(s.dims[i].extent() as i32);
2282 }
2283 }
2284 }
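
// A minimal usage sketch of SHAPE kept as an inline test. The 3x4 allocation
// mirrors the allocate_2d test above; nothing beyond what that test already
// exercises is assumed.
#[cfg(test)]
mod shape_sketch {
    use super::*;

    #[test]
    fn shape_of_3x4_is_3_4() {
        let mut src = ArrayDescriptor::zeroed();
        let dims = [
            DimDescriptor { lower_bound: 1, upper_bound: 3, stride: 1 },
            DimDescriptor { lower_bound: 1, upper_bound: 4, stride: 1 },
        ];
        afs_allocate_array(&mut src, 8, 2, dims.as_ptr(), ptr::null_mut());

        let mut shape = ArrayDescriptor::zeroed();
        afs_array_shape_int4(&mut shape, &src);
        assert_eq!(shape.rank, 1);
        assert_eq!(shape.total_elements(), 2);
        unsafe {
            let p = shape.base_addr as *const i32;
            assert_eq!(*p.add(0), 3); // extent of dim 1
            assert_eq!(*p.add(1), 4); // extent of dim 2
        }

        afs_deallocate_array(&mut shape, ptr::null_mut());
        afs_deallocate_array(&mut src, ptr::null_mut());
    }
}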
2285
2286 /// SHAPE(array, kind=int64) → rank-1 i64 array of extents.
2287 #[no_mangle]
2288 pub extern "C" fn afs_array_shape_int8(dst: *mut ArrayDescriptor, src: *const ArrayDescriptor) {
2289 if dst.is_null() || src.is_null() {
2290 return;
2291 }
2292 let s = unsafe { &*src };
2293 let n = s.rank as i64;
2294 let dim = DimDescriptor {
2295 lower_bound: 1,
2296 upper_bound: n,
2297 stride: 1,
2298 };
2299 afs_allocate_array(dst, 8, 1, &dim as *const DimDescriptor, ptr::null_mut());
2300 let d = unsafe { &mut *dst };
2301 let base = d.base_addr as *mut i64;
2302 for i in 0..s.rank as usize {
2303 unsafe {
2304 base.add(i).write(s.dims[i].extent());
2305 }
2306 }
2307 }
2308
2309 /// LBOUND(array, dim) — lower bound along dimension `dim` (1-based).
2310 #[no_mangle]
2311 pub extern "C" fn afs_array_lbound(desc: *const ArrayDescriptor, dim: i32) -> i64 {
2312 if desc.is_null() || dim < 1 {
2313 return 1;
2314 }
2315 let d = unsafe { &*desc };
2316 let idx = (dim - 1) as usize;
2317 if idx < d.rank as usize {
2318 d.dims[idx].lower_bound
2319 } else {
2320 1
2321 }
2322 }
2323
2324 /// UBOUND(array, dim) — upper bound along dimension `dim` (1-based).
2325 #[no_mangle]
2326 pub extern "C" fn afs_array_ubound(desc: *const ArrayDescriptor, dim: i32) -> i64 {
2327 if desc.is_null() || dim < 1 {
2328 return 0;
2329 }
2330 let d = unsafe { &*desc };
2331 let idx = (dim - 1) as usize;
2332 if idx < d.rank as usize {
2333 d.dims[idx].upper_bound
2334 } else {
2335 0
2336 }
2337 }
2338
2339 /// ALLOCATED(array) — check if array is allocated (returns 1 or 0).
2340 #[no_mangle]
2341 pub extern "C" fn afs_array_allocated(desc: *const ArrayDescriptor) -> i32 {
2342 if desc.is_null() {
2343 return 0;
2344 }
2345 unsafe { (*desc).is_allocated() as i32 }
2346 }
2347
2348 fn logical_desc_value(desc: &ArrayDescriptor, index: usize) -> bool {
2349 if desc.base_addr.is_null() || desc.rank < 1 {
2350 return false;
2351 }
2352 let stride = desc.dims[0].stride.max(1) as usize;
2353 let elem_size = desc.elem_size.max(1) as usize;
2354 let byte_offset = index.saturating_mul(stride).saturating_mul(elem_size);
2355 let ptr = unsafe { desc.base_addr.add(byte_offset) };
2356 unsafe { *ptr != 0 }
2357 }
2358
2359 /// ANY(array) for logical arrays — return 1 when any element is true.
2360 #[no_mangle]
2361 pub extern "C" fn afs_array_any_logical(desc: *const ArrayDescriptor) -> i32 {
2362 if desc.is_null() {
2363 return 0;
2364 }
2365 let d = unsafe { &*desc };
2366 let n = d.total_elements().max(0) as usize;
2367 for i in 0..n {
2368 if logical_desc_value(d, i) {
2369 return 1;
2370 }
2371 }
2372 0
2373 }
2374
2375 /// ALL(array) for logical arrays — return 1 when every element is true.
2376 #[no_mangle]
2377 pub extern "C" fn afs_array_all_logical(desc: *const ArrayDescriptor) -> i32 {
2378 if desc.is_null() {
2379 return 0;
2380 }
2381 let d = unsafe { &*desc };
2382 let n = d.total_elements().max(0) as usize;
2383 for i in 0..n {
2384 if !logical_desc_value(d, i) {
2385 return 0;
2386 }
2387 }
2388 1
2389 }
2390
2391 /// COUNT(array) for logical arrays — number of true elements.
2392 #[no_mangle]
2393 pub extern "C" fn afs_array_count_logical(desc: *const ArrayDescriptor) -> i32 {
2394 if desc.is_null() {
2395 return 0;
2396 }
2397 let d = unsafe { &*desc };
2398 let n = d.total_elements().max(0) as usize;
2399 let mut count = 0i32;
2400 for i in 0..n {
2401 if logical_desc_value(d, i) {
2402 count += 1;
2403 }
2404 }
2405 count
2406 }
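
// Sketch of the rank-agnostic logical reductions on a logical(1) vector. The
// elem_size-1 allocation and direct byte writes are an assumption about how
// default logicals are stored; the helpers themselves only require
// "non-zero means .true.".
#[cfg(test)]
mod logical_reduction_sketch {
    use super::*;

    #[test]
    fn any_all_count_on_a_byte_mask() {
        let mut mask = ArrayDescriptor::zeroed();
        afs_allocate_1d(&mut mask, 1, 4);
        unsafe {
            let p = mask.base_addr;
            *p.add(0) = 1; // .true.
            *p.add(1) = 0; // .false.
            *p.add(2) = 1; // .true.
            *p.add(3) = 0; // .false.
        }
        assert_eq!(afs_array_any_logical(&mask), 1);
        assert_eq!(afs_array_all_logical(&mask), 0);
        assert_eq!(afs_array_count_logical(&mask), 2);
        afs_deallocate_array(&mut mask, ptr::null_mut());
    }
}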
2407
2408 /// COUNT(mask, DIM=k) — reduce along dimension k, allocate `dst` with
2409 /// rank `mask.rank - 1` and extents = mask extents minus the reduction
2410 /// dim, fill with per-slice counts of true elements (i32). Caller passes
2411 /// a zeroed 384-byte descriptor; this helper populates it. Surfaced in
2412 /// stdlib_stats var_mask_2_*: `n = count(mask, dim)` where n is rank-1
2413 /// (and a real array — caller does the int→real conversion after).
2414 /// Without this helper, count(mask, dim) lowered to the rank-0 helper
2415 /// and returned a single int, which the compiler then passed as the
2416 /// source descriptor pointer to afs_assign_allocatable, crashing with
2417 /// a misaligned-pointer dereference (address 0x3 = the count value).
2418 // The column-major stride loops below intentionally use indexed access
2419 // across `extents`, `idx`, `dst_running_stride`, and `s.dims` together
2420 // with a separately-incrementing `dk` counter — clippy's
2421 // `enumerate().take(rank)` rewrite doesn't apply cleanly without
2422 // duplicating the index plumbing.
2423 #[allow(clippy::needless_range_loop)]
2424 #[no_mangle]
2425 pub extern "C" fn afs_array_count_logical_dim(
2426 src: *const ArrayDescriptor,
2427 dim: i32,
2428 dst: *mut ArrayDescriptor,
2429 ) {
2430 if src.is_null() || dst.is_null() {
2431 return;
2432 }
2433 let s = unsafe { &*src };
2434 let d = unsafe { &mut *dst };
2435 if s.base_addr.is_null() {
2436 return;
2437 }
2438 if !d.is_allocated() {
2439 let new_rank = (s.rank - 1).max(0);
2440 let mut dim_buf: [DimDescriptor; 15] = [DimDescriptor {
2441 lower_bound: 0,
2442 upper_bound: 0,
2443 stride: 0,
2444 }; 15];
2445 let mut k = 0usize;
2446 let mut acc: i64 = 1;
2447 for i in 0..s.rank as usize {
2448 if i + 1 == dim as usize {
2449 continue;
2450 }
2451 let extent = s.dims[i].extent();
2452 dim_buf[k].lower_bound = 1;
2453 dim_buf[k].upper_bound = extent;
2454 dim_buf[k].stride = acc;
2455 acc *= extent;
2456 k += 1;
2457 }
2458 let dim_ptr = if new_rank > 0 {
2459 dim_buf.as_ptr()
2460 } else {
2461 ptr::null()
2462 };
2463 let mut stat: i32 = 0;
2464 // Result is integer(int32) per F2018 §16.9.46 default kind.
2465 afs_allocate_array(dst, 4, new_rank, dim_ptr, &mut stat);
2466 if stat != 0 || d.base_addr.is_null() {
2467 return;
2468 }
2469 }
2470 let dst_total = d.total_elements() as usize;
2471 let buf = d.base_addr as *mut i32;
2472 for i in 0..dst_total {
2473 unsafe {
2474 *buf.add(i) = 0;
2475 }
2476 }
2477     // Walk the mask in column-major order. logical_desc_value expects a flat
2478     // element index (it applies the dim-0 stride and elem_size itself), so we
2479     // compute column-major flat indices here rather than use
2480     // for_each_reduce_along_dim (which gives byte offsets — not what logical_desc_value wants).
2481 let rank = s.rank as usize;
2482 if rank == 0 {
2483 return;
2484 }
2485 let reduce_dim_idx = dim as usize - 1;
2486 if reduce_dim_idx >= rank {
2487 return;
2488 }
2489 let mut extents: [i64; 15] = [0; 15];
2490 let mut dst_running_stride: [i64; 15] = [0; 15];
2491 let mut k = 0usize;
2492 let mut acc = 1i64;
2493 for i in 0..rank {
2494 extents[i] = s.dims[i].extent();
2495 if i == reduce_dim_idx {
2496 continue;
2497 }
2498 dst_running_stride[k] = acc;
2499 acc *= extents[i];
2500 k += 1;
2501 }
2502 let mut idx: [i64; 15] = [0; 15];
2503 let total = (0..rank).map(|i| extents[i]).product::<i64>();
2504 if total <= 0 {
2505 return;
2506 }
2507 for _ in 0..total {
2508 // Flat (column-major) index into the source mask.
2509 let mut src_flat: i64 = 0;
2510 let mut src_stride: i64 = 1;
2511 for d_i in 0..rank {
2512 src_flat += idx[d_i] * src_stride;
2513 src_stride *= extents[d_i];
2514 }
2515 let mut dst_flat: i64 = 0;
2516 let mut dk = 0usize;
2517 for d_i in 0..rank {
2518 if d_i != reduce_dim_idx {
2519 dst_flat += idx[d_i] * dst_running_stride[dk];
2520 dk += 1;
2521 }
2522 }
2523 if logical_desc_value(s, src_flat as usize) {
2524 unsafe {
2525 *buf.add(dst_flat as usize) += 1;
2526 }
2527 }
2528 for d_i in 0..rank {
2529 idx[d_i] += 1;
2530 if idx[d_i] < extents[d_i] {
2531 break;
2532 }
2533 idx[d_i] = 0;
2534 }
2535 }
2536 }
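
// End-to-end sketch of the DIM form on a 2x3 logical(1) mask. It assumes
// afs_allocate_array lays the mask out contiguously in column-major order with
// dims[0].stride == 1, which is what the allocate_like test above asserts for
// freshly allocated descriptors.
#[cfg(test)]
mod count_dim_sketch {
    use super::*;

    #[test]
    fn count_along_dim_1_of_a_2x3_mask() {
        let mut mask = ArrayDescriptor::zeroed();
        let dims = [
            DimDescriptor { lower_bound: 1, upper_bound: 2, stride: 1 },
            DimDescriptor { lower_bound: 1, upper_bound: 3, stride: 1 },
        ];
        afs_allocate_array(&mut mask, 1, 2, dims.as_ptr(), ptr::null_mut());
        // Column-major bytes: column 1 = [1, 0], column 2 = [1, 1], column 3 = [0, 0].
        let bytes: [u8; 6] = [1, 0, 1, 1, 0, 0];
        unsafe {
            for (i, b) in bytes.iter().enumerate() {
                *mask.base_addr.add(i) = *b;
            }
        }

        let mut counts = ArrayDescriptor::zeroed();
        afs_array_count_logical_dim(&mask, 1, &mut counts);
        assert_eq!(counts.total_elements(), 3);
        unsafe {
            let p = counts.base_addr as *const i32;
            assert_eq!(*p.add(0), 1);
            assert_eq!(*p.add(1), 2);
            assert_eq!(*p.add(2), 0);
        }

        afs_deallocate_array(&mut counts, ptr::null_mut());
        afs_deallocate_array(&mut mask, ptr::null_mut());
    }
}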
2537
2538 /// NORM2(array) — Euclidean norm `sqrt(sum(x**2))` (real(8)).
2539 /// F2018 §16.9.158. Respects strides for non-contiguous sections.
2540 #[no_mangle]
2541 pub extern "C" fn afs_array_norm2_real8(desc: *const ArrayDescriptor) -> f64 {
2542 if desc.is_null() {
2543 return 0.0;
2544 }
2545 let d = unsafe { &*desc };
2546 if d.base_addr.is_null() {
2547 return 0.0;
2548 }
2549 let n = d.total_elements() as usize;
2550 let stride = d.dims[0].stride.max(1) as usize;
2551 let ptr = d.base_addr as *const f64;
2552 let mut acc = 0.0_f64;
2553 for i in 0..n {
2554 let v = unsafe { *ptr.add(i * stride) };
2555 acc += v * v;
2556 }
2557 acc.sqrt()
2558 }
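
// Quick numeric sketch: NORM2 of the real(8) vector [3, 4] is 5. Uses only the
// 1-D allocation pattern the tests above rely on.
#[cfg(test)]
mod norm2_sketch {
    use super::*;

    #[test]
    fn norm2_of_3_4_is_5() {
        let mut v = ArrayDescriptor::zeroed();
        afs_allocate_1d(&mut v, 8, 2);
        unsafe {
            let p = v.base_addr as *mut f64;
            *p.add(0) = 3.0;
            *p.add(1) = 4.0;
        }
        assert_eq!(afs_array_norm2_real8(&v), 5.0);
        afs_deallocate_array(&mut v, ptr::null_mut());
    }
}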
2559
2560 /// NORM2(array) — Euclidean norm `sqrt(sum(x**2))` (real(4)).
2561 #[no_mangle]
2562 pub extern "C" fn afs_array_norm2_real4(desc: *const ArrayDescriptor) -> f32 {
2563 if desc.is_null() {
2564 return 0.0;
2565 }
2566 let d = unsafe { &*desc };
2567 if d.base_addr.is_null() {
2568 return 0.0;
2569 }
2570 let n = d.total_elements() as usize;
2571 let stride = d.dims[0].stride.max(1) as usize;
2572 let ptr = d.base_addr as *const f32;
2573 let mut acc = 0.0_f64;
2574 for i in 0..n {
2575 let v = unsafe { *ptr.add(i * stride) } as f64;
2576 acc += v * v;
2577 }
2578 acc.sqrt() as f32
2579 }
2580
2581 /// SUM(array) — sum all elements (real version).
2582 /// Dispatches on `elem_size` so real(4) and real(8) arrays both sum
2583 /// correctly. Returns f64 (callers downcast for real(4) destinations).
2584 /// Respects strides for non-contiguous sections.
2585 #[no_mangle]
2586 pub extern "C" fn afs_array_sum_real8(desc: *const ArrayDescriptor) -> f64 {
2587 if desc.is_null() {
2588 return 0.0;
2589 }
2590 let d = unsafe { &*desc };
2591 if d.base_addr.is_null() {
2592 return 0.0;
2593 }
2594 let n = d.total_elements() as usize;
2595 let stride = d.dims[0].stride.max(1) as usize;
2596 let mut sum: f64 = 0.0;
2597 if d.elem_size == 4 {
2598 let ptr = d.base_addr as *const f32;
2599 for i in 0..n {
2600 sum += unsafe { *ptr.add(i * stride) } as f64;
2601 }
2602 } else {
2603 let ptr = d.base_addr as *const f64;
2604 for i in 0..n {
2605 sum += unsafe { *ptr.add(i * stride) };
2606 }
2607 }
2608 sum
2609 }
2610
2611 /// Walk every element of an n-dimensional array `src`, computing the
2612 /// flat byte offset of each element relative to `src.base_addr` and
2613 /// the corresponding flat dst index after collapsing dimension
2614 /// `reduce_dim` (1-based) — the dst array has rank `src.rank - 1` with
2615 /// extents copied from src skipping the reduction dim.
2616 ///
2617 /// The closure `accum(byte_offset, dst_flat_idx)` is invoked once per
2618 /// element. Caller supplies the accumulator/store logic for whatever
2619 /// reduction is being computed (sum, product, maxval, minval, etc.).
2620 fn for_each_reduce_along_dim<F: FnMut(usize, usize)>(
2621 src: &ArrayDescriptor,
2622 reduce_dim: i32,
2623 mut accum: F,
2624 ) {
2625 let rank = src.rank as usize;
2626 if rank == 0 {
2627 return;
2628 }
2629 let reduce_dim_idx = reduce_dim as usize - 1;
2630 if reduce_dim_idx >= rank {
2631 return;
2632 }
2633 let mut extents: [i64; 15] = [0; 15];
2634 let mut strides: [i64; 15] = [0; 15];
2635 // Layout of dst dims (rank - 1) — extents from src minus reduce_dim;
2636 // computed running stride for column-major dst layout.
2637 let mut dst_extents: [i64; 15] = [0; 15];
2638 let mut dst_running_stride: [i64; 15] = [0; 15];
2639 let mut k = 0usize;
2640 let mut acc = 1i64;
2641 for i in 0..rank {
2642 extents[i] = src.dims[i].extent();
2643 strides[i] = src.dims[i].stride.max(1);
2644 if i == reduce_dim_idx {
2645 continue;
2646 }
2647 dst_extents[k] = extents[i];
2648 dst_running_stride[k] = acc;
2649 acc *= extents[i];
2650 k += 1;
2651 }
2652 let mut idx: [i64; 15] = [0; 15];
2653 let total = (0..rank).map(|i| extents[i]).product::<i64>();
2654 if total <= 0 {
2655 return;
2656 }
2657 for _ in 0..total {
2658 let mut byte_off: i64 = 0;
2659 let mut dst_flat: i64 = 0;
2660 let mut dk = 0usize;
2661 for d in 0..rank {
2662 byte_off += idx[d] * strides[d] * src.elem_size;
2663 if d != reduce_dim_idx {
2664 dst_flat += idx[d] * dst_running_stride[dk];
2665 dk += 1;
2666 }
2667 }
2668 accum(byte_off as usize, dst_flat as usize);
2669 // Increment idx in column-major order (innermost = idx[0]).
2670 for d in 0..rank {
2671 idx[d] += 1;
2672 if idx[d] < extents[d] {
2673 break;
2674 }
2675 idx[d] = 0;
2676 }
2677 }
2678 }
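
// A pure index-walk sketch (no buffer needed — the helper never touches
// base_addr): reduce a hand-built 2x3 descriptor along dim 1 and record each
// (byte_offset, dst_flat) pair. The stride/extent values below are chosen by
// hand purely for illustration.
#[cfg(test)]
mod reduce_walk_sketch {
    use super::*;

    #[test]
    fn two_by_three_collapses_onto_three_slots() {
        let mut src = ArrayDescriptor::zeroed();
        src.rank = 2;
        src.elem_size = 8;
        src.dims[0] = DimDescriptor { lower_bound: 1, upper_bound: 2, stride: 1 };
        src.dims[1] = DimDescriptor { lower_bound: 1, upper_bound: 3, stride: 2 };

        let mut visits = Vec::new();
        for_each_reduce_along_dim(&src, 1, |byte_off, dst_flat| visits.push((byte_off, dst_flat)));

        // Column-major visit order (i0, i1) = (0,0) (1,0) (0,1) (1,1) (0,2) (1,2);
        // byte_off = (i0*1 + i1*2) * 8, dst_flat = i1.
        let expected: Vec<(usize, usize)> =
            vec![(0, 0), (8, 0), (16, 1), (24, 1), (32, 2), (40, 2)];
        assert_eq!(visits, expected);
    }
}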
2679
2680 /// SUM(array, DIM=k) — reduce along dimension k, allocate `dst` with
2681 /// rank `src.rank - 1` and extents = src extents minus the reduction
2682 /// dim, then write the per-slice sums into dst. Caller passes a
2683 /// zeroed 384-byte descriptor; this helper populates rank/dims/flags
2684 /// and mallocs the result buffer. Real version (real4 + real8
2685 /// dispatching on `src.elem_size`).
2686 #[no_mangle]
2687 pub extern "C" fn afs_array_sum_real8_dim(
2688 src: *const ArrayDescriptor,
2689 dim: i32,
2690 dst: *mut ArrayDescriptor,
2691 ) {
2692 if src.is_null() || dst.is_null() {
2693 return;
2694 }
2695 let s = unsafe { &*src };
2696 let d = unsafe { &mut *dst };
2697 if s.base_addr.is_null() {
2698 return;
2699 }
2700 if !d.is_allocated() {
2701 let new_rank = (s.rank - 1).max(0);
2702 let mut dim_buf: [DimDescriptor; 15] = [DimDescriptor {
2703 lower_bound: 0,
2704 upper_bound: 0,
2705 stride: 0,
2706 }; 15];
2707 let mut k = 0usize;
2708 let mut acc: i64 = 1;
2709 for i in 0..s.rank as usize {
2710 if i + 1 == dim as usize {
2711 continue;
2712 }
2713 let extent = s.dims[i].extent();
2714 dim_buf[k].lower_bound = 1;
2715 dim_buf[k].upper_bound = extent;
2716 dim_buf[k].stride = acc;
2717 acc *= extent;
2718 k += 1;
2719 }
2720 let dim_ptr = if new_rank > 0 {
2721 dim_buf.as_ptr()
2722 } else {
2723 ptr::null()
2724 };
2725 let mut stat: i32 = 0;
2726 afs_allocate_array(dst, s.elem_size, new_rank, dim_ptr, &mut stat);
2727 if stat != 0 || d.base_addr.is_null() {
2728 return;
2729 }
2730 }
2731 let dst_total = d.total_elements() as usize;
2732 if s.elem_size == 4 {
2733 let buf = d.base_addr as *mut f32;
2734 for i in 0..dst_total {
2735 unsafe {
2736 *buf.add(i) = 0.0;
2737 }
2738 }
2739 let src_ptr = s.base_addr as *const u8;
2740 for_each_reduce_along_dim(s, dim, |byte_off, dst_flat| {
2741 let v = unsafe { *(src_ptr.add(byte_off) as *const f32) };
2742 unsafe {
2743 *buf.add(dst_flat) += v;
2744 }
2745 });
2746 } else {
2747 let buf = d.base_addr as *mut f64;
2748 for i in 0..dst_total {
2749 unsafe {
2750 *buf.add(i) = 0.0;
2751 }
2752 }
2753 let src_ptr = s.base_addr as *const u8;
2754 for_each_reduce_along_dim(s, dim, |byte_off, dst_flat| {
2755 let v = unsafe { *(src_ptr.add(byte_off) as *const f64) };
2756 unsafe {
2757 *buf.add(dst_flat) += v;
2758 }
2759 });
2760 }
2761 }
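
// End-to-end sketch: column sums of a 2x3 real(8) matrix. Assumes the
// canonical column-major layout that afs_allocate_array produces (the
// allocate_like test above asserts stride[0] == 1, stride[1] == extent[0]).
#[cfg(test)]
mod sum_dim_sketch {
    use super::*;

    #[test]
    fn column_sums_of_a_2x3_real8_matrix() {
        let mut a = ArrayDescriptor::zeroed();
        let dims = [
            DimDescriptor { lower_bound: 1, upper_bound: 2, stride: 1 },
            DimDescriptor { lower_bound: 1, upper_bound: 3, stride: 2 },
        ];
        afs_allocate_array(&mut a, 8, 2, dims.as_ptr(), ptr::null_mut());
        // Column-major data: columns are [1, 2], [3, 4], [5, 6].
        let vals = [1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0];
        unsafe {
            let p = a.base_addr as *mut f64;
            for (i, v) in vals.iter().enumerate() {
                *p.add(i) = *v;
            }
        }

        let mut sums = ArrayDescriptor::zeroed();
        afs_array_sum_real8_dim(&a, 1, &mut sums);
        assert_eq!(sums.total_elements(), 3);
        unsafe {
            let p = sums.base_addr as *const f64;
            assert_eq!(*p.add(0), 3.0); // 1 + 2
            assert_eq!(*p.add(1), 7.0); // 3 + 4
            assert_eq!(*p.add(2), 11.0); // 5 + 6
        }

        afs_deallocate_array(&mut sums, ptr::null_mut());
        afs_deallocate_array(&mut a, ptr::null_mut());
    }
}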
2762
2763 /// SUM(array, DIM=k) — integer version, dispatching on
2764 /// `src.elem_size` (1/2/4/8). Result element width matches
2765 /// `src.elem_size`. Auto-allocates `dst` if not already allocated.
2766 #[no_mangle]
2767 pub extern "C" fn afs_array_sum_int_dim(
2768 src: *const ArrayDescriptor,
2769 dim: i32,
2770 dst: *mut ArrayDescriptor,
2771 ) {
2772 if src.is_null() || dst.is_null() {
2773 return;
2774 }
2775 let s = unsafe { &*src };
2776 let d = unsafe { &mut *dst };
2777 if s.base_addr.is_null() {
2778 return;
2779 }
2780 if !d.is_allocated() {
2781 let new_rank = (s.rank - 1).max(0);
2782 let mut dim_buf: [DimDescriptor; 15] = [DimDescriptor {
2783 lower_bound: 0,
2784 upper_bound: 0,
2785 stride: 0,
2786 }; 15];
2787 let mut k = 0usize;
2788 let mut acc: i64 = 1;
2789 for i in 0..s.rank as usize {
2790 if i + 1 == dim as usize {
2791 continue;
2792 }
2793 let extent = s.dims[i].extent();
2794 dim_buf[k].lower_bound = 1;
2795 dim_buf[k].upper_bound = extent;
2796 dim_buf[k].stride = acc;
2797 acc *= extent;
2798 k += 1;
2799 }
2800 let dim_ptr = if new_rank > 0 {
2801 dim_buf.as_ptr()
2802 } else {
2803 ptr::null()
2804 };
2805 let mut stat: i32 = 0;
2806 afs_allocate_array(dst, s.elem_size, new_rank, dim_ptr, &mut stat);
2807 if stat != 0 || d.base_addr.is_null() {
2808 return;
2809 }
2810 }
2811 let dst_total = d.total_elements() as usize;
2812 let src_ptr = s.base_addr as *const u8;
2813 match s.elem_size {
2814 1 => {
2815 let buf = d.base_addr as *mut i8;
2816 for i in 0..dst_total {
2817 unsafe {
2818 *buf.add(i) = 0;
2819 }
2820 }
2821 for_each_reduce_along_dim(s, dim, |byte_off, dst_flat| {
2822 let v = unsafe { *(src_ptr.add(byte_off) as *const i8) };
2823 unsafe {
2824 *buf.add(dst_flat) = (*buf.add(dst_flat)).wrapping_add(v);
2825 }
2826 });
2827 }
2828 2 => {
2829 let buf = d.base_addr as *mut i16;
2830 for i in 0..dst_total {
2831 unsafe {
2832 *buf.add(i) = 0;
2833 }
2834 }
2835 for_each_reduce_along_dim(s, dim, |byte_off, dst_flat| {
2836 let v = unsafe { *(src_ptr.add(byte_off) as *const i16) };
2837 unsafe {
2838 *buf.add(dst_flat) = (*buf.add(dst_flat)).wrapping_add(v);
2839 }
2840 });
2841 }
2842 4 => {
2843 let buf = d.base_addr as *mut i32;
2844 for i in 0..dst_total {
2845 unsafe {
2846 *buf.add(i) = 0;
2847 }
2848 }
2849 for_each_reduce_along_dim(s, dim, |byte_off, dst_flat| {
2850 let v = unsafe { *(src_ptr.add(byte_off) as *const i32) };
2851 unsafe {
2852 *buf.add(dst_flat) = (*buf.add(dst_flat)).wrapping_add(v);
2853 }
2854 });
2855 }
2856 _ => {
2857 let buf = d.base_addr as *mut i64;
2858 for i in 0..dst_total {
2859 unsafe {
2860 *buf.add(i) = 0;
2861 }
2862 }
2863 for_each_reduce_along_dim(s, dim, |byte_off, dst_flat| {
2864 let v = unsafe { *(src_ptr.add(byte_off) as *const i64) };
2865 unsafe {
2866 *buf.add(dst_flat) = (*buf.add(dst_flat)).wrapping_add(v);
2867 }
2868 });
2869 }
2870 }
2871 }
2872
2873 /// Walk every source element along all dims (column-major), invoking
2874 /// `accum(src_byte_off, mask_byte_off, dst_flat)` so the caller can
2875 /// honor both the source's and the mask's per-dim strides without
2876 /// reimplementing the index machinery. `dst_flat` indexes the
2877 /// rank-(N-1) output that excludes `reduce_dim`.
2878 fn for_each_reduce_along_dim_with_mask<F: FnMut(usize, usize, usize)>(
2879 src: &ArrayDescriptor,
2880 mask: &ArrayDescriptor,
2881 reduce_dim: i32,
2882 mut accum: F,
2883 ) {
2884 let rank = src.rank as usize;
2885 if rank == 0 {
2886 return;
2887 }
2888 let reduce_dim_idx = reduce_dim as usize - 1;
2889 if reduce_dim_idx >= rank {
2890 return;
2891 }
2892 let mut extents: [i64; 15] = [0; 15];
2893 let mut s_strides: [i64; 15] = [0; 15];
2894 let mut m_strides: [i64; 15] = [0; 15];
2895 let mut dst_running_stride: [i64; 15] = [0; 15];
2896 let mut k = 0usize;
2897 let mut acc = 1i64;
2898 for i in 0..rank {
2899 extents[i] = src.dims[i].extent();
2900 s_strides[i] = src.dims[i].stride.max(1);
2901 m_strides[i] = if (i as i32) < mask.rank {
2902 mask.dims[i].stride.max(1)
2903 } else {
2904 1
2905 };
2906 if i == reduce_dim_idx {
2907 continue;
2908 }
2909 dst_running_stride[k] = acc;
2910 acc *= extents[i];
2911 k += 1;
2912 }
2913 let mut idx: [i64; 15] = [0; 15];
2914 let total = (0..rank).map(|i| extents[i]).product::<i64>();
2915 if total <= 0 {
2916 return;
2917 }
2918 let m_elem = mask.elem_size.max(1);
2919 for _ in 0..total {
2920 let mut s_byte_off: i64 = 0;
2921 let mut m_byte_off: i64 = 0;
2922 let mut dst_flat: i64 = 0;
2923 let mut dk = 0usize;
2924 for d in 0..rank {
2925 s_byte_off += idx[d] * s_strides[d] * src.elem_size;
2926 m_byte_off += idx[d] * m_strides[d] * m_elem;
2927 if d != reduce_dim_idx {
2928 dst_flat += idx[d] * dst_running_stride[dk];
2929 dk += 1;
2930 }
2931 }
2932 accum(s_byte_off as usize, m_byte_off as usize, dst_flat as usize);
2933 for d in 0..rank {
2934 idx[d] += 1;
2935 if idx[d] < extents[d] {
2936 break;
2937 }
2938 idx[d] = 0;
2939 }
2940 }
2941 }
2942
2943 unsafe fn mask_byte_is_true(mask: &ArrayDescriptor, byte_off: usize) -> bool {
2944 let p = mask.base_addr.add(byte_off);
2945 match mask.elem_size {
2946 1 => *p != 0,
2947 2 => *(p as *const u16) != 0,
2948 4 => *(p as *const u32) != 0,
2949 8 => *(p as *const u64) != 0,
2950 _ => *p != 0,
2951 }
2952 }
2953
2954 /// SUM(array, DIM=k, MASK=mask) — real version. Auto-allocates `dst`
2955 /// to rank-(N-1) on first call. Walks both array and mask using each
2956 /// descriptor's own per-dim strides; treats any non-zero mask byte as
2957 /// `.true.` (matches `mask_at`).
2958 #[no_mangle]
2959 pub extern "C" fn afs_array_sum_real8_dim_mask(
2960 src: *const ArrayDescriptor,
2961 dim: i32,
2962 dst: *mut ArrayDescriptor,
2963 mask: *const ArrayDescriptor,
2964 ) {
2965 if src.is_null() || dst.is_null() || mask.is_null() {
2966 return;
2967 }
2968 let s = unsafe { &*src };
2969 let d = unsafe { &mut *dst };
2970 let m = unsafe { &*mask };
2971 if s.base_addr.is_null() || m.base_addr.is_null() {
2972 return;
2973 }
2974 if !d.is_allocated() {
2975 let new_rank = (s.rank - 1).max(0);
2976 let mut dim_buf: [DimDescriptor; 15] = [DimDescriptor {
2977 lower_bound: 0,
2978 upper_bound: 0,
2979 stride: 0,
2980 }; 15];
2981 let mut k = 0usize;
2982 let mut acc: i64 = 1;
2983 for i in 0..s.rank as usize {
2984 if i + 1 == dim as usize {
2985 continue;
2986 }
2987 let extent = s.dims[i].extent();
2988 dim_buf[k].lower_bound = 1;
2989 dim_buf[k].upper_bound = extent;
2990 dim_buf[k].stride = acc;
2991 acc *= extent;
2992 k += 1;
2993 }
2994 let dim_ptr = if new_rank > 0 {
2995 dim_buf.as_ptr()
2996 } else {
2997 ptr::null()
2998 };
2999 let mut stat: i32 = 0;
3000 afs_allocate_array(dst, s.elem_size, new_rank, dim_ptr, &mut stat);
3001 if stat != 0 || d.base_addr.is_null() {
3002 return;
3003 }
3004 }
3005 let dst_total = d.total_elements() as usize;
3006 let src_ptr = s.base_addr as *const u8;
3007 if s.elem_size == 4 {
3008 let buf = d.base_addr as *mut f32;
3009 for i in 0..dst_total {
3010 unsafe {
3011 *buf.add(i) = 0.0;
3012 }
3013 }
3014 for_each_reduce_along_dim_with_mask(s, m, dim, |sb, mb, df| {
3015 if unsafe { mask_byte_is_true(m, mb) } {
3016 let v = unsafe { *(src_ptr.add(sb) as *const f32) };
3017 unsafe {
3018 *buf.add(df) += v;
3019 }
3020 }
3021 });
3022 } else {
3023 let buf = d.base_addr as *mut f64;
3024 for i in 0..dst_total {
3025 unsafe {
3026 *buf.add(i) = 0.0;
3027 }
3028 }
3029 for_each_reduce_along_dim_with_mask(s, m, dim, |sb, mb, df| {
3030 if unsafe { mask_byte_is_true(m, mb) } {
3031 let v = unsafe { *(src_ptr.add(sb) as *const f64) };
3032 unsafe {
3033 *buf.add(df) += v;
3034 }
3035 }
3036 });
3037 }
3038 }
3039
3040 /// SUM(array, DIM=k, MASK=mask) — integer version. Dispatches on
3041 /// `src.elem_size` (1/2/4/8); auto-allocates `dst` and walks both
3042 /// descriptors with their own per-dim strides.
3043 #[no_mangle]
3044 pub extern "C" fn afs_array_sum_int_dim_mask(
3045 src: *const ArrayDescriptor,
3046 dim: i32,
3047 dst: *mut ArrayDescriptor,
3048 mask: *const ArrayDescriptor,
3049 ) {
3050 if src.is_null() || dst.is_null() || mask.is_null() {
3051 return;
3052 }
3053 let s = unsafe { &*src };
3054 let d = unsafe { &mut *dst };
3055 let m = unsafe { &*mask };
3056 if s.base_addr.is_null() || m.base_addr.is_null() {
3057 return;
3058 }
3059 if !d.is_allocated() {
3060 let new_rank = (s.rank - 1).max(0);
3061 let mut dim_buf: [DimDescriptor; 15] = [DimDescriptor {
3062 lower_bound: 0,
3063 upper_bound: 0,
3064 stride: 0,
3065 }; 15];
3066 let mut k = 0usize;
3067 let mut acc: i64 = 1;
3068 for i in 0..s.rank as usize {
3069 if i + 1 == dim as usize {
3070 continue;
3071 }
3072 let extent = s.dims[i].extent();
3073 dim_buf[k].lower_bound = 1;
3074 dim_buf[k].upper_bound = extent;
3075 dim_buf[k].stride = acc;
3076 acc *= extent;
3077 k += 1;
3078 }
3079 let dim_ptr = if new_rank > 0 {
3080 dim_buf.as_ptr()
3081 } else {
3082 ptr::null()
3083 };
3084 let mut stat: i32 = 0;
3085 afs_allocate_array(dst, s.elem_size, new_rank, dim_ptr, &mut stat);
3086 if stat != 0 || d.base_addr.is_null() {
3087 return;
3088 }
3089 }
3090 let dst_total = d.total_elements() as usize;
3091 let src_ptr = s.base_addr as *const u8;
3092 macro_rules! sum_dim_mask_kind {
3093 ($t:ty) => {{
3094 let buf = d.base_addr as *mut $t;
3095 for i in 0..dst_total {
3096 unsafe {
3097 *buf.add(i) = 0;
3098 }
3099 }
3100 for_each_reduce_along_dim_with_mask(s, m, dim, |sb, mb, df| {
3101 if unsafe { mask_byte_is_true(m, mb) } {
3102 let v = unsafe { *(src_ptr.add(sb) as *const $t) };
3103 unsafe {
3104 *buf.add(df) = (*buf.add(df)).wrapping_add(v);
3105 }
3106 }
3107 });
3108 }};
3109 }
3110 match s.elem_size {
3111 1 => sum_dim_mask_kind!(i8),
3112 2 => sum_dim_mask_kind!(i16),
3113 8 => sum_dim_mask_kind!(i64),
3114 _ => sum_dim_mask_kind!(i32),
3115 }
3116 }
3117
3118 /// SUM(array) — sum all elements (integer version).
3119 /// Dispatches on `elem_size` so integer(1/2/4/8) arrays all sum correctly.
3120 /// Respects strides for non-contiguous sections.
3121 #[no_mangle]
3122 pub extern "C" fn afs_array_sum_int(desc: *const ArrayDescriptor) -> i64 {
3123 if desc.is_null() {
3124 return 0;
3125 }
3126 let d = unsafe { &*desc };
3127 if d.base_addr.is_null() {
3128 return 0;
3129 }
3130 let n = d.total_elements() as usize;
3131 let stride = d.dims[0].stride.max(1) as usize;
3132 let mut sum: i64 = 0;
3133 match d.elem_size {
3134 1 => {
3135 let ptr = d.base_addr as *const i8;
3136 for i in 0..n {
3137 sum = sum.wrapping_add(unsafe { *ptr.add(i * stride) } as i64);
3138 }
3139 }
3140 2 => {
3141 let ptr = d.base_addr as *const i16;
3142 for i in 0..n {
3143 sum = sum.wrapping_add(unsafe { *ptr.add(i * stride) } as i64);
3144 }
3145 }
3146 8 => {
3147 let ptr = d.base_addr as *const i64;
3148 for i in 0..n {
3149 sum = sum.wrapping_add(unsafe { *ptr.add(i * stride) });
3150 }
3151 }
3152 _ => {
3153 let ptr = d.base_addr as *const i32;
3154 for i in 0..n {
3155 sum = sum.wrapping_add(unsafe { *ptr.add(i * stride) } as i64);
3156 }
3157 }
3158 }
3159 sum
3160 }
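
// Minimal usage sketch: sum a freshly allocated integer(4) vector. Mirrors the
// allocation pattern already used by the tests above; no extra assumptions.
#[cfg(test)]
mod sum_int_sketch {
    use super::*;

    #[test]
    fn sums_a_small_i32_vector() {
        let mut desc = ArrayDescriptor::zeroed();
        afs_allocate_1d(&mut desc, 4, 4);
        unsafe {
            let p = desc.base_addr as *mut i32;
            for i in 0..4 {
                *p.add(i) = (i as i32) + 1; // 1, 2, 3, 4
            }
        }
        assert_eq!(afs_array_sum_int(&desc), 10);
        afs_deallocate_array(&mut desc, ptr::null_mut());
    }
}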
3161
3162 /// Read one logical element from `mask` at logical index `i` (zero-based),
3163 /// honoring `mask.elem_size` (kinds 1/2/4/8; other widths fall back to a byte
3164 /// test). Fortran maps `.true.` to a non-zero stored value; any non-zero word is true.
3165 unsafe fn mask_at(mask: &ArrayDescriptor, i: usize, stride: usize) -> bool {
3166 let off = i * stride * mask.elem_size.max(1) as usize;
3167 let p = mask.base_addr.add(off);
3168 let es = mask.elem_size as usize;
3169 match es {
3170 1 => *p != 0,
3171 2 => *(p as *const u16) != 0,
3172 4 => *(p as *const u32) != 0,
3173 8 => *(p as *const u64) != 0,
3174 _ => *p != 0,
3175 }
3176 }
3177
3178 /// SUM(array, mask=mask) — sum elements where `mask(i)` is true (real).
3179 /// Width-dispatched on the array's elem_size; mask is read with its own
3180 /// kind from `mask.elem_size`.
3181 #[no_mangle]
3182 pub extern "C" fn afs_array_sum_real8_mask(
3183 desc: *const ArrayDescriptor,
3184 mask: *const ArrayDescriptor,
3185 ) -> f64 {
3186 if desc.is_null() || mask.is_null() {
3187 return 0.0;
3188 }
3189 let d = unsafe { &*desc };
3190 let m = unsafe { &*mask };
3191 if d.base_addr.is_null() || m.base_addr.is_null() {
3192 return 0.0;
3193 }
3194 let n = d.total_elements() as usize;
3195 let stride_d = d.dims[0].stride.max(1) as usize;
3196 let stride_m = m.dims[0].stride.max(1) as usize;
3197 let mut sum: f64 = 0.0;
3198 if d.elem_size == 4 {
3199 let ptr = d.base_addr as *const f32;
3200 for i in 0..n {
3201 if unsafe { mask_at(m, i, stride_m) } {
3202 sum += unsafe { *ptr.add(i * stride_d) } as f64;
3203 }
3204 }
3205 } else {
3206 let ptr = d.base_addr as *const f64;
3207 for i in 0..n {
3208 if unsafe { mask_at(m, i, stride_m) } {
3209 sum += unsafe { *ptr.add(i * stride_d) };
3210 }
3211 }
3212 }
3213 sum
3214 }
3215
3216 /// SUM(array, mask=mask) — integer arrays. Dispatches on elem_size like
3217 /// the unmasked entry; returns i64 for any input kind.
3218 #[no_mangle]
3219 pub extern "C" fn afs_array_sum_int_mask(
3220 desc: *const ArrayDescriptor,
3221 mask: *const ArrayDescriptor,
3222 ) -> i64 {
3223 if desc.is_null() || mask.is_null() {
3224 return 0;
3225 }
3226 let d = unsafe { &*desc };
3227 let mk = unsafe { &*mask };
3228 if d.base_addr.is_null() || mk.base_addr.is_null() {
3229 return 0;
3230 }
3231 let n = d.total_elements() as usize;
3232 let stride_d = d.dims[0].stride.max(1) as usize;
3233 let stride_m = mk.dims[0].stride.max(1) as usize;
3234 let mut sum: i64 = 0;
3235 macro_rules! sum_kind {
3236 ($t:ty) => {{
3237 let ptr = d.base_addr as *const $t;
3238 for i in 0..n {
3239 if unsafe { mask_at(mk, i, stride_m) } {
3240 sum = sum.wrapping_add(unsafe { *ptr.add(i * stride_d) } as i64);
3241 }
3242 }
3243 }};
3244 }
3245 match d.elem_size {
3246 1 => sum_kind!(i8),
3247 2 => sum_kind!(i16),
3248 8 => sum_kind!(i64),
3249 _ => sum_kind!(i32),
3250 }
3251 sum
3252 }
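
// Masked-sum sketch: an integer(4) vector summed under a logical(1) mask. The
// mask allocation and byte writes mirror the logical_reduction_sketch above.
#[cfg(test)]
mod masked_sum_sketch {
    use super::*;

    #[test]
    fn sums_only_where_mask_is_true() {
        let mut data = ArrayDescriptor::zeroed();
        let mut mask = ArrayDescriptor::zeroed();
        afs_allocate_1d(&mut data, 4, 4);
        afs_allocate_1d(&mut mask, 1, 4);
        unsafe {
            let dp = data.base_addr as *mut i32;
            for i in 0..4 {
                *dp.add(i) = 10 * (i as i32 + 1); // 10, 20, 30, 40
            }
            let mp = mask.base_addr;
            *mp.add(0) = 1;
            *mp.add(1) = 0;
            *mp.add(2) = 1;
            *mp.add(3) = 0;
        }
        assert_eq!(afs_array_sum_int_mask(&data, &mask), 40); // 10 + 30
        afs_deallocate_array(&mut data, ptr::null_mut());
        afs_deallocate_array(&mut mask, ptr::null_mut());
    }
}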
3253
3254 /// PRODUCT(array) — product of all elements (real version).
3255 /// Dispatches on `elem_size`; returns f64 for both real(4) and real(8).
3256 #[no_mangle]
3257 pub extern "C" fn afs_array_product_real8(desc: *const ArrayDescriptor) -> f64 {
3258 if desc.is_null() {
3259 return 1.0;
3260 }
3261 let d = unsafe { &*desc };
3262 if d.base_addr.is_null() {
3263 return 1.0;
3264 }
3265 let n = d.total_elements() as usize;
3266 let stride = d.dims[0].stride.max(1) as usize;
3267 let mut prod: f64 = 1.0;
3268 if d.elem_size == 4 {
3269 let ptr = d.base_addr as *const f32;
3270 for i in 0..n {
3271 prod *= unsafe { *ptr.add(i * stride) } as f64;
3272 }
3273 } else {
3274 let ptr = d.base_addr as *const f64;
3275 for i in 0..n {
3276 prod *= unsafe { *ptr.add(i * stride) };
3277 }
3278 }
3279 prod
3280 }
3281
3282 /// PRODUCT(array) — product of all elements (integer version).
3283 /// Dispatches on `elem_size` so integer(1/2/4/8) arrays multiply correctly.
3284 #[no_mangle]
3285 pub extern "C" fn afs_array_product_int(desc: *const ArrayDescriptor) -> i64 {
3286 if desc.is_null() {
3287 return 1;
3288 }
3289 let d = unsafe { &*desc };
3290 if d.base_addr.is_null() {
3291 return 1;
3292 }
3293 let n = d.total_elements() as usize;
3294 if n == 0 {
3295 return 1;
3296 }
3297 let stride = d.dims[0].stride.max(1) as usize;
3298 let mut prod: i64 = 1;
3299 match d.elem_size {
3300 1 => {
3301 let ptr = d.base_addr as *const i8;
3302 for i in 0..n {
3303 prod = prod.wrapping_mul(unsafe { *ptr.add(i * stride) } as i64);
3304 }
3305 }
3306 2 => {
3307 let ptr = d.base_addr as *const i16;
3308 for i in 0..n {
3309 prod = prod.wrapping_mul(unsafe { *ptr.add(i * stride) } as i64);
3310 }
3311 }
3312 8 => {
3313 let ptr = d.base_addr as *const i64;
3314 for i in 0..n {
3315 prod = prod.wrapping_mul(unsafe { *ptr.add(i * stride) });
3316 }
3317 }
3318 _ => {
3319 let ptr = d.base_addr as *const i32;
3320 for i in 0..n {
3321 prod = prod.wrapping_mul(unsafe { *ptr.add(i * stride) } as i64);
3322 }
3323 }
3324 }
3325 prod
3326 }
3327
3328 /// PRODUCT(array, mask=mask) — masked product (real). Dispatches on
3329 /// elem_size and reads mask via `mask_at` for any logical kind.
3330 #[no_mangle]
3331 pub extern "C" fn afs_array_product_real8_mask(
3332 desc: *const ArrayDescriptor,
3333 mask: *const ArrayDescriptor,
3334 ) -> f64 {
3335 if desc.is_null() || mask.is_null() {
3336 return 1.0;
3337 }
3338 let d = unsafe { &*desc };
3339 let m = unsafe { &*mask };
3340 if d.base_addr.is_null() || m.base_addr.is_null() {
3341 return 1.0;
3342 }
3343 let n = d.total_elements() as usize;
3344 let stride_d = d.dims[0].stride.max(1) as usize;
3345 let stride_m = m.dims[0].stride.max(1) as usize;
3346 let mut prod: f64 = 1.0;
3347 if d.elem_size == 4 {
3348 let ptr = d.base_addr as *const f32;
3349 for i in 0..n {
3350 if unsafe { mask_at(m, i, stride_m) } {
3351 prod *= unsafe { *ptr.add(i * stride_d) } as f64;
3352 }
3353 }
3354 } else {
3355 let ptr = d.base_addr as *const f64;
3356 for i in 0..n {
3357 if unsafe { mask_at(m, i, stride_m) } {
3358 prod *= unsafe { *ptr.add(i * stride_d) };
3359 }
3360 }
3361 }
3362 prod
3363 }
3364
3365 /// PRODUCT(array, mask=mask) — masked product (integer). Dispatches on
3366 /// elem_size; returns i64 regardless of input kind.
3367 #[no_mangle]
3368 pub extern "C" fn afs_array_product_int_mask(
3369 desc: *const ArrayDescriptor,
3370 mask: *const ArrayDescriptor,
3371 ) -> i64 {
3372 if desc.is_null() || mask.is_null() {
3373 return 1;
3374 }
3375 let d = unsafe { &*desc };
3376 let mk = unsafe { &*mask };
3377 if d.base_addr.is_null() || mk.base_addr.is_null() {
3378 return 1;
3379 }
3380 let n = d.total_elements() as usize;
3381 let stride_d = d.dims[0].stride.max(1) as usize;
3382 let stride_m = mk.dims[0].stride.max(1) as usize;
3383 let mut prod: i64 = 1;
3384 macro_rules! prod_kind {
3385 ($t:ty) => {{
3386 let ptr = d.base_addr as *const $t;
3387 for i in 0..n {
3388 if unsafe { mask_at(mk, i, stride_m) } {
3389 prod = prod.wrapping_mul(unsafe { *ptr.add(i * stride_d) } as i64);
3390 }
3391 }
3392 }};
3393 }
3394 match d.elem_size {
3395 1 => prod_kind!(i8),
3396 2 => prod_kind!(i16),
3397 8 => prod_kind!(i64),
3398 _ => prod_kind!(i32),
3399 }
3400 prod
3401 }
3402
3403 /// MAXVAL(array, mask=mask) — masked max (real). Returns -inf when no
3404 /// element is selected.
3405 #[no_mangle]
3406 pub extern "C" fn afs_array_maxval_real8_mask(
3407 desc: *const ArrayDescriptor,
3408 mask: *const ArrayDescriptor,
3409 ) -> f64 {
3410 if desc.is_null() || mask.is_null() {
3411 return f64::NEG_INFINITY;
3412 }
3413 let d = unsafe { &*desc };
3414 let m = unsafe { &*mask };
3415 if d.base_addr.is_null() || m.base_addr.is_null() {
3416 return f64::NEG_INFINITY;
3417 }
3418 let n = d.total_elements() as usize;
3419 if n == 0 {
3420 return f64::NEG_INFINITY;
3421 }
3422 let stride_d = d.dims[0].stride.max(1) as usize;
3423 let stride_m = m.dims[0].stride.max(1) as usize;
3424 let mut best = f64::NEG_INFINITY;
3425 if d.elem_size == 4 {
3426 let ptr = d.base_addr as *const f32;
3427 for i in 0..n {
3428 if unsafe { mask_at(m, i, stride_m) } {
3429 let v = unsafe { *ptr.add(i * stride_d) } as f64;
3430 if v > best {
3431 best = v;
3432 }
3433 }
3434 }
3435 } else {
3436 let ptr = d.base_addr as *const f64;
3437 for i in 0..n {
3438 if unsafe { mask_at(m, i, stride_m) } {
3439 let v = unsafe { *ptr.add(i * stride_d) };
3440 if v > best {
3441 best = v;
3442 }
3443 }
3444 }
3445 }
3446 best
3447 }
3448
3449 /// MINVAL(array, mask=mask) — masked min (real).
3450 #[no_mangle]
3451 pub extern "C" fn afs_array_minval_real8_mask(
3452 desc: *const ArrayDescriptor,
3453 mask: *const ArrayDescriptor,
3454 ) -> f64 {
3455 if desc.is_null() || mask.is_null() {
3456 return f64::INFINITY;
3457 }
3458 let d = unsafe { &*desc };
3459 let m = unsafe { &*mask };
3460 if d.base_addr.is_null() || m.base_addr.is_null() {
3461 return f64::INFINITY;
3462 }
3463 let n = d.total_elements() as usize;
3464 if n == 0 {
3465 return f64::INFINITY;
3466 }
3467 let stride_d = d.dims[0].stride.max(1) as usize;
3468 let stride_m = m.dims[0].stride.max(1) as usize;
3469 let mut best = f64::INFINITY;
3470 if d.elem_size == 4 {
3471 let ptr = d.base_addr as *const f32;
3472 for i in 0..n {
3473 if unsafe { mask_at(m, i, stride_m) } {
3474 let v = unsafe { *ptr.add(i * stride_d) } as f64;
3475 if v < best {
3476 best = v;
3477 }
3478 }
3479 }
3480 } else {
3481 let ptr = d.base_addr as *const f64;
3482 for i in 0..n {
3483 if unsafe { mask_at(m, i, stride_m) } {
3484 let v = unsafe { *ptr.add(i * stride_d) };
3485 if v < best {
3486 best = v;
3487 }
3488 }
3489 }
3490 }
3491 best
3492 }
3493
3494 /// MAXVAL(array, mask=mask) — masked max (integer).
3495 #[no_mangle]
3496 pub extern "C" fn afs_array_maxval_int_mask(
3497 desc: *const ArrayDescriptor,
3498 mask: *const ArrayDescriptor,
3499 ) -> i64 {
3500 if desc.is_null() || mask.is_null() {
3501 return i64::MIN;
3502 }
3503 let d = unsafe { &*desc };
3504 let mk = unsafe { &*mask };
3505 if d.base_addr.is_null() || mk.base_addr.is_null() {
3506 return i64::MIN;
3507 }
3508 let n = d.total_elements() as usize;
3509 if n == 0 {
3510 return i64::MIN;
3511 }
3512 let stride_d = d.dims[0].stride.max(1) as usize;
3513 let stride_m = mk.dims[0].stride.max(1) as usize;
3514 let mut best = i64::MIN;
3515 macro_rules! max_kind {
3516 ($t:ty) => {{
3517 let ptr = d.base_addr as *const $t;
3518 for i in 0..n {
3519 if unsafe { mask_at(mk, i, stride_m) } {
3520 let v = unsafe { *ptr.add(i * stride_d) } as i64;
3521 if v > best {
3522 best = v;
3523 }
3524 }
3525 }
3526 }};
3527 }
3528 match d.elem_size {
3529 1 => max_kind!(i8),
3530 2 => max_kind!(i16),
3531 8 => max_kind!(i64),
3532 _ => max_kind!(i32),
3533 }
3534 best
3535 }
3536
3537 /// MINVAL(array, mask=mask) — masked min (integer).
3538 #[no_mangle]
3539 pub extern "C" fn afs_array_minval_int_mask(
3540 desc: *const ArrayDescriptor,
3541 mask: *const ArrayDescriptor,
3542 ) -> i64 {
3543 if desc.is_null() || mask.is_null() {
3544 return i64::MAX;
3545 }
3546 let d = unsafe { &*desc };
3547 let mk = unsafe { &*mask };
3548 if d.base_addr.is_null() || mk.base_addr.is_null() {
3549 return i64::MAX;
3550 }
3551 let n = d.total_elements() as usize;
3552 if n == 0 {
3553 return i64::MAX;
3554 }
3555 let stride_d = d.dims[0].stride.max(1) as usize;
3556 let stride_m = mk.dims[0].stride.max(1) as usize;
3557 let mut best = i64::MAX;
3558 macro_rules! min_kind {
3559 ($t:ty) => {{
3560 let ptr = d.base_addr as *const $t;
3561 for i in 0..n {
3562 if unsafe { mask_at(mk, i, stride_m) } {
3563 let v = unsafe { *ptr.add(i * stride_d) } as i64;
3564 if v < best {
3565 best = v;
3566 }
3567 }
3568 }
3569 }};
3570 }
3571 match d.elem_size {
3572 1 => min_kind!(i8),
3573 2 => min_kind!(i16),
3574 8 => min_kind!(i64),
3575 _ => min_kind!(i32),
3576 }
3577 best
3578 }
3579
3580 /// MAXVAL(array) — maximum element (real version). Dispatches on
3581 /// `elem_size`; returns f64 for both real(4) and real(8).
3582 /// Respects strides.
3583 #[no_mangle]
3584 pub extern "C" fn afs_array_maxval_real8(desc: *const ArrayDescriptor) -> f64 {
3585 if desc.is_null() {
3586 return f64::NEG_INFINITY;
3587 }
3588 let d = unsafe { &*desc };
3589 if d.base_addr.is_null() {
3590 return f64::NEG_INFINITY;
3591 }
3592 let n = d.total_elements() as usize;
3593 if n == 0 {
3594 return f64::NEG_INFINITY;
3595 }
3596 let stride = d.dims[0].stride.max(1) as usize;
3597 if d.elem_size == 4 {
3598 let ptr = d.base_addr as *const f32;
3599 let mut max = unsafe { *ptr } as f64;
3600 for i in 1..n {
3601 let v = unsafe { *ptr.add(i * stride) } as f64;
3602 if v > max {
3603 max = v;
3604 }
3605 }
3606 max
3607 } else {
3608 let ptr = d.base_addr as *const f64;
3609 let mut max = unsafe { *ptr };
3610 for i in 1..n {
3611 let v = unsafe { *ptr.add(i * stride) };
3612 if v > max {
3613 max = v;
3614 }
3615 }
3616 max
3617 }
3618 }
3619
3620 /// MINVAL(array) — minimum element (real version). Dispatches on
3621 /// `elem_size`; returns f64 for both real(4) and real(8).
3622 /// Respects strides.
3623 #[no_mangle]
3624 pub extern "C" fn afs_array_minval_real8(desc: *const ArrayDescriptor) -> f64 {
3625 if desc.is_null() {
3626 return f64::INFINITY;
3627 }
3628 let d = unsafe { &*desc };
3629 if d.base_addr.is_null() {
3630 return f64::INFINITY;
3631 }
3632 let n = d.total_elements() as usize;
3633 if n == 0 {
3634 return f64::INFINITY;
3635 }
3636 let stride = d.dims[0].stride.max(1) as usize;
3637 if d.elem_size == 4 {
3638 let ptr = d.base_addr as *const f32;
3639 let mut min = unsafe { *ptr } as f64;
3640 for i in 1..n {
3641 let v = unsafe { *ptr.add(i * stride) } as f64;
3642 if v < min {
3643 min = v;
3644 }
3645 }
3646 min
3647 } else {
3648 let ptr = d.base_addr as *const f64;
3649 let mut min = unsafe { *ptr };
3650 for i in 1..n {
3651 let v = unsafe { *ptr.add(i * stride) };
3652 if v < min {
3653 min = v;
3654 }
3655 }
3656 min
3657 }
3658 }
3659
3660 /// MAXVAL(array) — maximum element (integer version).
3661 /// Dispatches on `elem_size` so integer(1/2/4/8) arrays read correctly.
3662 /// Returns i64 so all kinds fit; codegen truncates to result kind.
3663 #[no_mangle]
3664 pub extern "C" fn afs_array_maxval_int(desc: *const ArrayDescriptor) -> i64 {
3665 if desc.is_null() {
3666 return i64::MIN;
3667 }
3668 let d = unsafe { &*desc };
3669 if d.base_addr.is_null() {
3670 return i64::MIN;
3671 }
3672 let n = d.total_elements() as usize;
3673 if n == 0 {
3674 return i64::MIN;
3675 }
3676 let stride = d.dims[0].stride.max(1) as usize;
3677 match d.elem_size {
3678 1 => {
3679 let ptr = d.base_addr as *const i8;
3680 let mut max = unsafe { *ptr } as i64;
3681 for i in 1..n {
3682 let v = unsafe { *ptr.add(i * stride) } as i64;
3683 if v > max {
3684 max = v;
3685 }
3686 }
3687 max
3688 }
3689 2 => {
3690 let ptr = d.base_addr as *const i16;
3691 let mut max = unsafe { *ptr } as i64;
3692 for i in 1..n {
3693 let v = unsafe { *ptr.add(i * stride) } as i64;
3694 if v > max {
3695 max = v;
3696 }
3697 }
3698 max
3699 }
3700 8 => {
3701 let ptr = d.base_addr as *const i64;
3702 let mut max = unsafe { *ptr };
3703 for i in 1..n {
3704 let v = unsafe { *ptr.add(i * stride) };
3705 if v > max {
3706 max = v;
3707 }
3708 }
3709 max
3710 }
3711 _ => {
3712 let ptr = d.base_addr as *const i32;
3713 let mut max = unsafe { *ptr } as i64;
3714 for i in 1..n {
3715 let v = unsafe { *ptr.add(i * stride) } as i64;
3716 if v > max {
3717 max = v;
3718 }
3719 }
3720 max
3721 }
3722 }
3723 }
3724
3725 /// MINVAL(array) — minimum element (integer version).
3726 /// Dispatches on `elem_size` so integer(1/2/4/8) arrays read correctly.
3727 /// Returns i64 so all kinds fit; codegen truncates to result kind.
3728 #[no_mangle]
3729 pub extern "C" fn afs_array_minval_int(desc: *const ArrayDescriptor) -> i64 {
3730 if desc.is_null() {
3731 return i64::MAX;
3732 }
3733 let d = unsafe { &*desc };
3734 if d.base_addr.is_null() {
3735 return i64::MAX;
3736 }
3737 let n = d.total_elements() as usize;
3738 if n == 0 {
3739 return i64::MAX;
3740 }
3741 let stride = d.dims[0].stride.max(1) as usize;
3742 match d.elem_size {
3743 1 => {
3744 let ptr = d.base_addr as *const i8;
3745 let mut min = unsafe { *ptr } as i64;
3746 for i in 1..n {
3747 let v = unsafe { *ptr.add(i * stride) } as i64;
3748 if v < min {
3749 min = v;
3750 }
3751 }
3752 min
3753 }
3754 2 => {
3755 let ptr = d.base_addr as *const i16;
3756 let mut min = unsafe { *ptr } as i64;
3757 for i in 1..n {
3758 let v = unsafe { *ptr.add(i * stride) } as i64;
3759 if v < min {
3760 min = v;
3761 }
3762 }
3763 min
3764 }
3765 8 => {
3766 let ptr = d.base_addr as *const i64;
3767 let mut min = unsafe { *ptr };
3768 for i in 1..n {
3769 let v = unsafe { *ptr.add(i * stride) };
3770 if v < min {
3771 min = v;
3772 }
3773 }
3774 min
3775 }
3776 _ => {
3777 let ptr = d.base_addr as *const i32;
3778 let mut min = unsafe { *ptr } as i64;
3779 for i in 1..n {
3780 let v = unsafe { *ptr.add(i * stride) } as i64;
3781 if v < min {
3782 min = v;
3783 }
3784 }
3785 min
3786 }
3787 }
3788 }
3789
3790 /// MAXLOC(array, dim=1) for rank-1 input — returns the 1-based index of the first
3791 /// maximum element. F2018 §16.9.130. real(4) variant; real(8) and the elem_size-dispatched integer variant follow.
3792 #[no_mangle]
3793 pub extern "C" fn afs_array_maxloc_real4(desc: *const ArrayDescriptor) -> i32 {
3794 if desc.is_null() {
3795 return 0;
3796 }
3797 let d = unsafe { &*desc };
3798 if d.base_addr.is_null() {
3799 return 0;
3800 }
3801 let n = d.total_elements() as usize;
3802 if n == 0 {
3803 return 0;
3804 }
3805 let stride = d.dims[0].stride.max(1) as usize;
3806 let ptr = d.base_addr as *const f32;
3807 let mut max = unsafe { *ptr };
3808 let mut idx = 0usize;
3809 for i in 1..n {
3810 let v = unsafe { *ptr.add(i * stride) };
3811 if v > max {
3812 max = v;
3813 idx = i;
3814 }
3815 }
3816 (idx as i32) + 1
3817 }
3818
3819 #[no_mangle]
3820 pub extern "C" fn afs_array_maxloc_real8(desc: *const ArrayDescriptor) -> i32 {
3821 if desc.is_null() {
3822 return 0;
3823 }
3824 let d = unsafe { &*desc };
3825 if d.base_addr.is_null() {
3826 return 0;
3827 }
3828 let n = d.total_elements() as usize;
3829 if n == 0 {
3830 return 0;
3831 }
3832 let stride = d.dims[0].stride.max(1) as usize;
3833 let ptr = d.base_addr as *const f64;
3834 let mut max = unsafe { *ptr };
3835 let mut idx = 0usize;
3836 for i in 1..n {
3837 let v = unsafe { *ptr.add(i * stride) };
3838 if v > max {
3839 max = v;
3840 idx = i;
3841 }
3842 }
3843 (idx as i32) + 1
3844 }
3845
3846 #[no_mangle]
3847 pub extern "C" fn afs_array_maxloc_int(desc: *const ArrayDescriptor) -> i32 {
3848 if desc.is_null() {
3849 return 0;
3850 }
3851 let d = unsafe { &*desc };
3852 if d.base_addr.is_null() {
3853 return 0;
3854 }
3855 let n = d.total_elements() as usize;
3856 if n == 0 {
3857 return 0;
3858 }
3859 let stride = d.dims[0].stride.max(1) as usize;
3860 let mut idx = 0usize;
3861 match d.elem_size {
3862 1 => {
3863 let ptr = d.base_addr as *const i8;
3864 let mut max = unsafe { *ptr };
3865 for i in 1..n {
3866 let v = unsafe { *ptr.add(i * stride) };
3867 if v > max {
3868 max = v;
3869 idx = i;
3870 }
3871 }
3872 }
3873 2 => {
3874 let ptr = d.base_addr as *const i16;
3875 let mut max = unsafe { *ptr };
3876 for i in 1..n {
3877 let v = unsafe { *ptr.add(i * stride) };
3878 if v > max {
3879 max = v;
3880 idx = i;
3881 }
3882 }
3883 }
3884 8 => {
3885 let ptr = d.base_addr as *const i64;
3886 let mut max = unsafe { *ptr };
3887 for i in 1..n {
3888 let v = unsafe { *ptr.add(i * stride) };
3889 if v > max {
3890 max = v;
3891 idx = i;
3892 }
3893 }
3894 }
3895 _ => {
3896 let ptr = d.base_addr as *const i32;
3897 let mut max = unsafe { *ptr };
3898 for i in 1..n {
3899 let v = unsafe { *ptr.add(i * stride) };
3900 if v > max {
3901 max = v;
3902 idx = i;
3903 }
3904 }
3905 }
3906 }
3907 (idx as i32) + 1
3908 }
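
// MAXLOC sketch: ties keep the first occurrence because the scan uses a strict
// `>` comparison, and the returned index is 1-based.
#[cfg(test)]
mod maxloc_sketch {
    use super::*;

    #[test]
    fn first_maximum_wins_and_result_is_one_based() {
        let mut v = ArrayDescriptor::zeroed();
        afs_allocate_1d(&mut v, 4, 4);
        unsafe {
            let p = v.base_addr as *mut i32;
            *p.add(0) = 3;
            *p.add(1) = 9;
            *p.add(2) = 1;
            *p.add(3) = 9; // tie with element 2 — not reported
        }
        assert_eq!(afs_array_maxloc_int(&v), 2);
        afs_deallocate_array(&mut v, ptr::null_mut());
    }
}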
3909
3910 /// MINLOC(array) for rank-1 input — analogous to MAXLOC.
3911 #[no_mangle]
3912 pub extern "C" fn afs_array_minloc_real4(desc: *const ArrayDescriptor) -> i32 {
3913 if desc.is_null() {
3914 return 0;
3915 }
3916 let d = unsafe { &*desc };
3917 if d.base_addr.is_null() {
3918 return 0;
3919 }
3920 let n = d.total_elements() as usize;
3921 if n == 0 {
3922 return 0;
3923 }
3924 let stride = d.dims[0].stride.max(1) as usize;
3925 let ptr = d.base_addr as *const f32;
3926 let mut min = unsafe { *ptr };
3927 let mut idx = 0usize;
3928 for i in 1..n {
3929 let v = unsafe { *ptr.add(i * stride) };
3930 if v < min {
3931 min = v;
3932 idx = i;
3933 }
3934 }
3935 (idx as i32) + 1
3936 }
3937
3938 #[no_mangle]
3939 pub extern "C" fn afs_array_minloc_real8(desc: *const ArrayDescriptor) -> i32 {
3940 if desc.is_null() {
3941 return 0;
3942 }
3943 let d = unsafe { &*desc };
3944 if d.base_addr.is_null() {
3945 return 0;
3946 }
3947 let n = d.total_elements() as usize;
3948 if n == 0 {
3949 return 0;
3950 }
3951 let stride = d.dims[0].stride.max(1) as usize;
3952 let ptr = d.base_addr as *const f64;
3953 let mut min = unsafe { *ptr };
3954 let mut idx = 0usize;
3955 for i in 1..n {
3956 let v = unsafe { *ptr.add(i * stride) };
3957 if v < min {
3958 min = v;
3959 idx = i;
3960 }
3961 }
3962 (idx as i32) + 1
3963 }
3964
3965 /// TRANSPOSE(source, result) — matrix transpose. The entry point is named for real(8) but dispatches on the source's element width.
3966 /// source is (m x n); the result descriptor is populated here and its data allocated as (n x m).
3967 #[no_mangle]
3968 pub extern "C" fn afs_transpose_real8(
3969 source: *const ArrayDescriptor,
3970 result: *mut ArrayDescriptor,
3971 ) {
3972 if source.is_null() || result.is_null() {
3973 return;
3974 }
3975 let src = unsafe { &*source };
3976 if src.rank < 2 || src.base_addr.is_null() {
3977 return;
3978 }
3979
3980 let m = src.dims[0].extent() as usize;
3981 let n = src.dims[1].extent() as usize;
3982 let elem_size = src.elem_size.max(1);
3983
3984 // Allocate result as (n x m) using the source's element width so
3985 // real(4) and real(8) (and complex(4)/(8) when routed here) all
3986 // get the right buffer size and stride.
3987 afs_allocate_1d(result, elem_size, (n * m) as i64);
3988 let res = unsafe { &mut *result };
3989 res.rank = 2;
3990 res.dims[0] = DimDescriptor {
3991 lower_bound: 1,
3992 upper_bound: n as i64,
3993 stride: 1,
3994 };
3995 res.dims[1] = DimDescriptor {
3996 lower_bound: 1,
3997 upper_bound: m as i64,
3998 stride: 1,
3999 };
4000
4001 // Fortran arrays are column-major: source A(i,j) at offset j*m+i for
4002 // an m-row source; result B = transpose(A) has n rows, so B(j,i) at
4003 // offset i*n+j. The previous formulas were swapped (rp[j*m+i] =
4004 // sp[i*n+j]) which used row-major indexing on both sides; for any
4005 // non-square source this produced a scrambled output that's neither
4006 // the transpose nor the original. Surfaced in stdlib_stats cov_2_*
4007 // where `matmul(transpose(center), center)` returned all zeros — the
4008 // mis-strided transpose left the matrix multiply consuming the wrong
4009 // lanes, and the elements summed to 0 by accident on the toy input.
4010 if elem_size == 4 {
4011 let sp = src.base_addr as *const f32;
4012 let rp = res.base_addr as *mut f32;
4013 for i in 0..m {
4014 for j in 0..n {
4015 unsafe {
4016 *rp.add(i * n + j) = *sp.add(j * m + i);
4017 }
4018 }
4019 }
4020 } else if elem_size == 8 {
4021 let sp = src.base_addr as *const f64;
4022 let rp = res.base_addr as *mut f64;
4023 for i in 0..m {
4024 for j in 0..n {
4025 unsafe {
4026 *rp.add(i * n + j) = *sp.add(j * m + i);
4027 }
4028 }
4029 }
4030 } else {
4031 // Generic byte-level copy for other widths (complex(4)=8 already
4032 // handled above as f64 lanes; complex(8)=16 falls here).
4033 let sb = elem_size as usize;
4034 let sp = src.base_addr;
4035 let rp = res.base_addr;
4036 for i in 0..m {
4037 for j in 0..n {
4038 unsafe {
4039 let src_off = (j * m + i) * sb;
4040 let dst_off = (i * n + j) * sb;
4041 core::ptr::copy_nonoverlapping(sp.add(src_off), rp.add(dst_off), sb);
4042 }
4043 }
4044 }
4045 }
4046 }
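
// Illustrative sketch (not part of the runtime): the column-major offset
// arithmetic used by the transpose loops above, checked on a plain Vec rather
// than an ArrayDescriptor. A is m x n with A(i,j) stored at j*m + i; B =
// transpose(A) is n x m with B(j,i) stored at i*n + j, so the copy is
// rp[i*n + j] = sp[j*m + i].
#[cfg(test)]
mod transpose_layout_tests {
    #[test]
    fn column_major_transpose_offsets() {
        let (m, n) = (2usize, 3usize);
        // A = [1 3 5; 2 4 6] in column-major storage.
        let a = [1.0f64, 2.0, 3.0, 4.0, 5.0, 6.0];
        let mut b = vec![0.0f64; n * m];
        for i in 0..m {
            for j in 0..n {
                b[i * n + j] = a[j * m + i];
            }
        }
        // B = transpose(A) = [1 2; 3 4; 5 6], column-major: 1 3 5 2 4 6.
        assert_eq!(b, vec![1.0, 3.0, 5.0, 2.0, 4.0, 6.0]);
    }
}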
4047
4048 /// MATMUL(a, b, result) — matrix multiplication (real version; handles real(4) and real(8) via the source element width).
4049 /// a is (m x k), b is (k x n), result is (m x n).
4050 #[no_mangle]
4051 pub extern "C" fn afs_matmul_real8(
4052 a: *const ArrayDescriptor,
4053 b: *const ArrayDescriptor,
4054 result: *mut ArrayDescriptor,
4055 ) {
4056 if a.is_null() || b.is_null() || result.is_null() {
4057 return;
4058 }
4059 let da = unsafe { &*a };
4060 let db = unsafe { &*b };
4061 if da.base_addr.is_null() || db.base_addr.is_null() {
4062 return;
4063 }
4064
4065 let m = da.dims[0].extent() as usize;
4066 let k = if da.rank >= 2 {
4067 da.dims[1].extent() as usize
4068 } else {
4069 1
4070 };
4071 let n = if db.rank >= 2 {
4072 db.dims[1].extent() as usize
4073 } else {
4074 db.dims[0].extent() as usize
4075 };
4076 let elem_size = da.elem_size.max(1);
4077
4078 // Allocate result using the source element width so real(4) and
4079 // real(8) inputs both produce correctly-sized output buffers.
4080 afs_allocate_1d(result, elem_size, (m * n) as i64);
4081 let res = unsafe { &mut *result };
4082 res.rank = 2;
4083 res.dims[0] = DimDescriptor {
4084 lower_bound: 1,
4085 upper_bound: m as i64,
4086 stride: 1,
4087 };
4088 res.dims[1] = DimDescriptor {
4089 lower_bound: 1,
4090 upper_bound: n as i64,
4091 stride: 1,
4092 };
4093
4094 // Fortran is column-major: A(m,k) stores A(i,l) at l*m + i,
4095 // B(k,n) stores B(l,j) at j*k + l, C(m,n) stores C(i,j) at j*m + i.
4096 if elem_size == 4 {
4097 let ap = da.base_addr as *const f32;
4098 let bp = db.base_addr as *const f32;
4099 let rp = res.base_addr as *mut f32;
4100 for j in 0..n {
4101 for i in 0..m {
4102 let mut sum: f64 = 0.0;
4103 for l in 0..k {
4104 let a_val = unsafe { *ap.add(l * m + i) } as f64;
4105 let b_val = unsafe { *bp.add(j * k + l) } as f64;
4106 sum += a_val * b_val;
4107 }
4108 unsafe {
4109 *rp.add(j * m + i) = sum as f32;
4110 }
4111 }
4112 }
4113 } else {
4114 let ap = da.base_addr as *const f64;
4115 let bp = db.base_addr as *const f64;
4116 let rp = res.base_addr as *mut f64;
4117 for j in 0..n {
4118 for i in 0..m {
4119 let mut sum: f64 = 0.0;
4120 for l in 0..k {
4121 let a_val = unsafe { *ap.add(l * m + i) };
4122 let b_val = unsafe { *bp.add(j * k + l) };
4123 sum += a_val * b_val;
4124 }
4125 unsafe {
4126 *rp.add(j * m + i) = sum;
4127 }
4128 }
4129 }
4130 }
4131 }
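
// Illustrative sketch (not part of the runtime): the column-major MATMUL
// indexing used above, on plain slices. A(i,l) lives at l*m + i, B(l,j) at
// j*k + l, C(i,j) at j*m + i. The 2x2 identity times an arbitrary 2x2 matrix
// must give that matrix back.
#[cfg(test)]
mod matmul_layout_tests {
    #[test]
    fn identity_times_matrix_is_matrix() {
        let (m, k, n) = (2usize, 2usize, 2usize);
        let a = [1.0f64, 0.0, 0.0, 1.0]; // identity, column-major
        let b = [5.0f64, 6.0, 7.0, 8.0]; // [5 7; 6 8], column-major
        let mut c = vec![0.0f64; m * n];
        for j in 0..n {
            for i in 0..m {
                let mut sum = 0.0;
                for l in 0..k {
                    sum += a[l * m + i] * b[j * k + l];
                }
                c[j * m + i] = sum;
            }
        }
        assert_eq!(c, b.to_vec());
    }
}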
4132
4133 /// MATMUL(a, b, result) — matrix multiplication (complex version).
4134 /// elem_size 8 → complex(4) (two f32 lanes); elem_size 16 → complex(8) (two f64 lanes).
4135 #[no_mangle]
4136 pub extern "C" fn afs_matmul_complex(
4137 a: *const ArrayDescriptor,
4138 b: *const ArrayDescriptor,
4139 result: *mut ArrayDescriptor,
4140 ) {
4141 if a.is_null() || b.is_null() || result.is_null() {
4142 return;
4143 }
4144 let da = unsafe { &*a };
4145 let db = unsafe { &*b };
4146 if da.base_addr.is_null() || db.base_addr.is_null() {
4147 return;
4148 }
4149
4150 let m = da.dims[0].extent() as usize;
4151 let k = if da.rank >= 2 {
4152 da.dims[1].extent() as usize
4153 } else {
4154 1
4155 };
4156 let n = if db.rank >= 2 {
4157 db.dims[1].extent() as usize
4158 } else {
4159 db.dims[0].extent() as usize
4160 };
4161 let elem_size = da.elem_size.max(8);
4162
4163 afs_allocate_1d(result, elem_size, (m * n) as i64);
4164 let res = unsafe { &mut *result };
4165 res.rank = 2;
4166 res.dims[0] = DimDescriptor {
4167 lower_bound: 1,
4168 upper_bound: m as i64,
4169 stride: 1,
4170 };
4171 res.dims[1] = DimDescriptor {
4172 lower_bound: 1,
4173 upper_bound: n as i64,
4174 stride: 1,
4175 };
4176
4177 if elem_size == 8 {
4178 // complex(4): pairs of f32 (re, im).
4179 let ap = da.base_addr as *const f32;
4180 let bp = db.base_addr as *const f32;
4181 let rp = res.base_addr as *mut f32;
4182 for j in 0..n {
4183 for i in 0..m {
4184 let mut sum_re: f64 = 0.0;
4185 let mut sum_im: f64 = 0.0;
4186 for l in 0..k {
4187 let a_re = unsafe { *ap.add(2 * (l * m + i)) } as f64;
4188 let a_im = unsafe { *ap.add(2 * (l * m + i) + 1) } as f64;
4189 let b_re = unsafe { *bp.add(2 * (j * k + l)) } as f64;
4190 let b_im = unsafe { *bp.add(2 * (j * k + l) + 1) } as f64;
4191 sum_re += a_re * b_re - a_im * b_im;
4192 sum_im += a_re * b_im + a_im * b_re;
4193 }
4194 unsafe {
4195 *rp.add(2 * (j * m + i)) = sum_re as f32;
4196 *rp.add(2 * (j * m + i) + 1) = sum_im as f32;
4197 }
4198 }
4199 }
4200 } else {
4201 // complex(8): pairs of f64 (re, im).
4202 let ap = da.base_addr as *const f64;
4203 let bp = db.base_addr as *const f64;
4204 let rp = res.base_addr as *mut f64;
4205 for j in 0..n {
4206 for i in 0..m {
4207 let mut sum_re: f64 = 0.0;
4208 let mut sum_im: f64 = 0.0;
4209 for l in 0..k {
4210 let a_re = unsafe { *ap.add(2 * (l * m + i)) };
4211 let a_im = unsafe { *ap.add(2 * (l * m + i) + 1) };
4212 let b_re = unsafe { *bp.add(2 * (j * k + l)) };
4213 let b_im = unsafe { *bp.add(2 * (j * k + l) + 1) };
4214 sum_re += a_re * b_re - a_im * b_im;
4215 sum_im += a_re * b_im + a_im * b_re;
4216 }
4217 unsafe {
4218 *rp.add(2 * (j * m + i)) = sum_re;
4219 *rp.add(2 * (j * m + i) + 1) = sum_im;
4220 }
4221 }
4222 }
4223 }
4224 }
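
// Illustrative sketch (not part of the runtime): the interleaved (re, im) lane
// layout and the complex multiply-accumulate used above, on a plain slice.
// Element e of a complex buffer occupies lanes 2*e and 2*e + 1.
#[cfg(test)]
mod complex_lane_tests {
    #[test]
    fn complex_multiply_accumulate() {
        // (1 + 2i) * (3 + 4i) = (1*3 - 2*4) + (1*4 + 2*3)i = -5 + 10i
        let a = [1.0f64, 2.0]; // one element: re at lane 0, im at lane 1
        let b = [3.0f64, 4.0];
        let e = 0usize;
        let (a_re, a_im) = (a[2 * e], a[2 * e + 1]);
        let (b_re, b_im) = (b[2 * e], b[2 * e + 1]);
        let sum_re = a_re * b_re - a_im * b_im;
        let sum_im = a_re * b_im + a_im * b_re;
        assert_eq!((sum_re, sum_im), (-5.0, 10.0));
    }
}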
4225
4226 /// MATMUL(a, b, result) — matrix multiplication (integer(4) version).
4227 #[no_mangle]
4228 pub extern "C" fn afs_matmul_int(
4229 a: *const ArrayDescriptor,
4230 b: *const ArrayDescriptor,
4231 result: *mut ArrayDescriptor,
4232 ) {
4233 if a.is_null() || b.is_null() || result.is_null() {
4234 return;
4235 }
4236 let da = unsafe { &*a };
4237 let db = unsafe { &*b };
4238 if da.base_addr.is_null() || db.base_addr.is_null() {
4239 return;
4240 }
4241
4242 let m = da.dims[0].extent() as usize;
4243 let k = if da.rank >= 2 {
4244 da.dims[1].extent() as usize
4245 } else {
4246 1
4247 };
4248 let n = if db.rank >= 2 {
4249 db.dims[1].extent() as usize
4250 } else {
4251 db.dims[0].extent() as usize
4252 };
4253
4254 let ap = da.base_addr as *const i32;
4255 let bp = db.base_addr as *const i32;
4256
4257 afs_allocate_1d(result, 4, (m * n) as i64);
4258 let res = unsafe { &mut *result };
4259 res.rank = 2;
4260 res.dims[0] = DimDescriptor {
4261 lower_bound: 1,
4262 upper_bound: m as i64,
4263 stride: 1,
4264 };
4265 res.dims[1] = DimDescriptor {
4266 lower_bound: 1,
4267 upper_bound: n as i64,
4268 stride: 1,
4269 };
4270 let rp = res.base_addr as *mut i32;
4271
4272 // Fortran is column-major: A(m,k) stores A(i,l) at l*m + i,
4273 // B(k,n) stores B(l,j) at j*k + l, C(m,n) stores C(i,j) at j*m + i.
4274 for j in 0..n {
4275 for i in 0..m {
4276 let mut sum: i64 = 0;
4277 for l in 0..k {
4278 let a_val = unsafe { *ap.add(l * m + i) } as i64;
4279 let b_val = unsafe { *bp.add(j * k + l) } as i64;
4280 sum += a_val * b_val;
4281 }
4282 unsafe {
4283 *rp.add(j * m + i) = sum as i32;
4284 }
4285 }
4286 }
4287 }
4288
4289 /// TRANSPOSE(source, result) — matrix transpose (integer entry point; copies byte-wise at the source element width, so any non-derived element type round-trips).
4290 #[no_mangle]
4291 pub extern "C" fn afs_transpose_int(source: *const ArrayDescriptor, result: *mut ArrayDescriptor) {
4292 if source.is_null() || result.is_null() {
4293 return;
4294 }
4295 let src = unsafe { &*source };
4296 if src.rank < 2 || src.base_addr.is_null() {
4297 return;
4298 }
4299
4300 let m = src.dims[0].extent() as usize;
4301 let n = src.dims[1].extent() as usize;
4302 let elem_size = src.elem_size.max(1) as usize;
4303 let sp = src.base_addr as *const u8;
4304
4305 // Allocate the result with the same per-element width so callers using
4306 // complex (8/16-byte), integer(8) (8-byte), integer(2)/(1) etc. all
4307 // round-trip without truncation. The previous always-i32 path silently
4308 // dropped the upper bytes of every element for non-32-bit types.
4309 let dim0 = DimDescriptor {
4310 lower_bound: 1,
4311 upper_bound: n as i64,
4312 stride: 1,
4313 };
4314 let dim1 = DimDescriptor {
4315 lower_bound: 1,
4316 upper_bound: m as i64,
4317 stride: 1,
4318 };
4319 let dims = [dim0, dim1];
4320 afs_allocate_array(result, elem_size as i64, 2, dims.as_ptr(), ptr::null_mut());
4321 let res = unsafe { &mut *result };
4322 let rp = res.base_addr;
4323
4324 // Column-major: source A(i,j) at offset j*m+i; dest B(j,i) at i*n+j.
4325 // See afs_transpose_real8 for the full root-cause note.
4326 for i in 0..m {
4327 for j in 0..n {
4328 let src_off = (j * m + i) * elem_size;
4329 let dst_off = (i * n + j) * elem_size;
4330 unsafe {
4331 core::ptr::copy_nonoverlapping(sp.add(src_off), rp.add(dst_off), elem_size);
4332 }
4333 }
4334 }
4335 }
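
// Illustrative sketch (not part of the runtime): why the transpose above copies
// elem_size bytes per element instead of going through i32. A byte-level copy
// preserves an integer(8) value whose upper half is nonzero; the old
// truncate-to-i32 path would have dropped it.
#[cfg(test)]
mod transpose_width_tests {
    #[test]
    fn byte_copy_preserves_wide_elements() {
        let value: i64 = 0x1234_5678_9abc_def0;
        let src = value.to_ne_bytes();
        let mut dst = [0u8; 8];
        dst.copy_from_slice(&src); // elem_size = 8 bytes moved as one unit
        assert_eq!(i64::from_ne_bytes(dst), value);
        // Truncating through i32 keeps only the low 32 bits.
        assert_ne!((value as i32) as i64, value);
    }
}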
4336
4337 /// CONJG over a complex array: allocate result with the same shape and
4338 /// element size, copy the real lane verbatim and negate the imag lane.
4339 /// Handles complex(sp) (8-byte) and complex(dp) (16-byte) by reading the
4340 /// per-element width from the descriptor.
4341 #[no_mangle]
4342 pub extern "C" fn afs_array_conjg(source: *const ArrayDescriptor, result: *mut ArrayDescriptor) {
4343 if source.is_null() || result.is_null() {
4344 return;
4345 }
4346 let src = unsafe { &*source };
4347 if src.base_addr.is_null() {
4348 return;
4349 }
4350 afs_allocate_like(result, source, ptr::null_mut());
4351 let res = unsafe { &mut *result };
4352 let elem_size = src.elem_size.max(1) as usize;
4353 let lane = elem_size / 2;
4354 let total = src.total_elements() as usize;
4355 let sp = src.base_addr as *const u8;
4356 let rp = res.base_addr;
4357 if elem_size == 8 {
4358 // complex(sp): two f32 lanes per element
4359 for i in 0..total {
4360 let off = i * 8;
4361 unsafe {
4362 let re = *(sp.add(off) as *const f32);
4363 let im = *(sp.add(off + lane) as *const f32);
4364 *(rp.add(off) as *mut f32) = re;
4365 *(rp.add(off + lane) as *mut f32) = -im;
4366 }
4367 }
4368 } else if elem_size == 16 {
4369 // complex(dp): two f64 lanes per element
4370 for i in 0..total {
4371 let off = i * 16;
4372 unsafe {
4373 let re = *(sp.add(off) as *const f64);
4374 let im = *(sp.add(off + lane) as *const f64);
4375 *(rp.add(off) as *mut f64) = re;
4376 *(rp.add(off + lane) as *mut f64) = -im;
4377 }
4378 }
4379 } else {
4380 // Non-complex element width: byte-copy (degenerates to identity).
4381 for i in 0..total {
4382 let off = i * elem_size;
4383 unsafe {
4384 core::ptr::copy_nonoverlapping(sp.add(off), rp.add(off), elem_size);
4385 }
4386 }
4387 }
4388 }
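
// Illustrative sketch (not part of the runtime): the lane offsets used by
// afs_array_conjg, shown on a plain byte buffer holding a single complex(dp)
// value instead of a descriptor. to_ne_bytes/from_ne_bytes stand in for the
// raw pointer reads and keep the sketch endian-agnostic.
#[cfg(test)]
mod conjg_lane_tests {
    #[test]
    fn negating_the_imag_lane_conjugates() {
        // One complex(dp) element: real lane in bytes 0..8, imag in 8..16.
        let mut buf = [0u8; 16];
        buf[..8].copy_from_slice(&1.5f64.to_ne_bytes());
        buf[8..].copy_from_slice(&(-2.25f64).to_ne_bytes());

        // Conjugate: copy lane 0 verbatim, negate lane 1.
        let mut out = [0u8; 16];
        out[..8].copy_from_slice(&buf[..8]);
        let mut im_bytes = [0u8; 8];
        im_bytes.copy_from_slice(&buf[8..]);
        out[8..].copy_from_slice(&(-f64::from_ne_bytes(im_bytes)).to_ne_bytes());

        let mut re_bytes = [0u8; 8];
        re_bytes.copy_from_slice(&out[..8]);
        im_bytes.copy_from_slice(&out[8..]);
        assert_eq!(f64::from_ne_bytes(re_bytes), 1.5);
        assert_eq!(f64::from_ne_bytes(im_bytes), 2.25);
    }
}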
4389
4390 /// AIMAG over a complex array: produce a real array of the same shape
4391 /// whose elements are the imaginary lanes of the source. Result has
4392 /// HALF the source elem_size (complex(sp) 8B → real(sp) 4B; complex(dp)
4393 /// 16B → real(dp) 8B), so we allocate fresh dims rather than using
4394 /// `afs_allocate_like`.
4395 #[no_mangle]
4396 pub extern "C" fn afs_array_aimag(source: *const ArrayDescriptor, result: *mut ArrayDescriptor) {
4397 if source.is_null() || result.is_null() {
4398 return;
4399 }
4400 let src = unsafe { &*source };
4401 if src.base_addr.is_null() {
4402 return;
4403 }
4404 let elem_size = src.elem_size.max(1) as usize;
4405 let lane = elem_size / 2;
4406 let mut dims = [DimDescriptor::default(); MAX_RANK];
4407 for (i, dim) in dims.iter_mut().enumerate().take(src.rank as usize) {
4408 *dim = DimDescriptor {
4409 lower_bound: src.dims[i].lower_bound,
4410 upper_bound: src.dims[i].upper_bound,
4411 stride: 1,
4412 };
4413 }
4414 let dims_ptr = if src.rank > 0 {
4415 dims.as_ptr()
4416 } else {
4417 ptr::null()
4418 };
4419 afs_allocate_array(result, lane as i64, src.rank, dims_ptr, ptr::null_mut());
4420
4421 let res = unsafe { &mut *result };
4422 let total = src.total_elements() as usize;
4423 let sp_buf = src.base_addr as *const u8;
4424 let rp_buf = res.base_addr;
4425 if elem_size == 8 {
4426 for i in 0..total {
4427 unsafe {
4428 let im = *(sp_buf.add(i * 8 + 4) as *const f32);
4429 *(rp_buf.add(i * 4) as *mut f32) = im;
4430 }
4431 }
4432 } else if elem_size == 16 {
4433 for i in 0..total {
4434 unsafe {
4435 let im = *(sp_buf.add(i * 16 + 8) as *const f64);
4436 *(rp_buf.add(i * 8) as *mut f64) = im;
4437 }
4438 }
4439 }
4440 }
4441
4442 /// ABS over a complex array: produce a real array of the same shape
4443 /// whose elements are |z| = sqrt(re*re + im*im). Result has HALF the
4444 /// source elem_size; mirror the allocation strategy from `afs_array_aimag`.
4445 #[no_mangle]
4446 pub extern "C" fn afs_array_abs_complex(
4447 source: *const ArrayDescriptor,
4448 result: *mut ArrayDescriptor,
4449 ) {
4450 if source.is_null() || result.is_null() {
4451 return;
4452 }
4453 let src = unsafe { &*source };
4454 if src.base_addr.is_null() {
4455 return;
4456 }
4457 let elem_size = src.elem_size.max(1) as usize;
4458 let lane = elem_size / 2;
4459 let mut dims = [DimDescriptor::default(); MAX_RANK];
4460 for (i, dim) in dims.iter_mut().enumerate().take(src.rank as usize) {
4461 *dim = DimDescriptor {
4462 lower_bound: src.dims[i].lower_bound,
4463 upper_bound: src.dims[i].upper_bound,
4464 stride: 1,
4465 };
4466 }
4467 let dims_ptr = if src.rank > 0 {
4468 dims.as_ptr()
4469 } else {
4470 ptr::null()
4471 };
4472 afs_allocate_array(result, lane as i64, src.rank, dims_ptr, ptr::null_mut());
4473
4474 let res = unsafe { &mut *result };
4475 let total = src.total_elements() as usize;
4476 let sp_buf = src.base_addr as *const u8;
4477 let rp_buf = res.base_addr;
4478 if elem_size == 8 {
4479 for i in 0..total {
4480 unsafe {
4481 let re = *(sp_buf.add(i * 8) as *const f32);
4482 let im = *(sp_buf.add(i * 8 + 4) as *const f32);
4483 *(rp_buf.add(i * 4) as *mut f32) = (re * re + im * im).sqrt();
4484 }
4485 }
4486 } else if elem_size == 16 {
4487 for i in 0..total {
4488 unsafe {
4489 let re = *(sp_buf.add(i * 16) as *const f64);
4490 let im = *(sp_buf.add(i * 16 + 8) as *const f64);
4491 *(rp_buf.add(i * 8) as *mut f64) = (re * re + im * im).sqrt();
4492 }
4493 }
4494 }
4495 }
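
// Illustrative sketch (not part of the runtime): |z| for the interleaved layout
// used above. The routine computes sqrt(re*re + im*im) directly; f64::hypot
// would be more robust against overflow for huge components, but the
// straightforward form shown here matches what the runtime does.
#[cfg(test)]
mod abs_complex_tests {
    #[test]
    fn abs_of_three_four_is_five() {
        let z = [3.0f64, 4.0]; // (re, im) lanes of one complex(dp) element
        let abs = (z[0] * z[0] + z[1] * z[1]).sqrt();
        assert_eq!(abs, 5.0);
    }
}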
4496
4497 /// F2018 §16.9.43 CMPLX(re, im, kind) over real arrays.
4498 ///
4499 /// Allocates a complex(out_lane_bytes) result of the same shape as `re_source`
4500 /// and writes one element per source element with re[i] in lane 0 and
4501 /// im[i] (or 0 when im_source is null) in lane 1. Handles cross-kind
4502 /// inputs (real(sp) ↔ real(dp)) by reading the per-side elem_size.
4503 ///
4504 /// `out_lane_bytes` is 4 (single) or 8 (double); result elem_size is
4505 /// `2 * out_lane_bytes`.
4506 #[no_mangle]
4507 pub extern "C" fn afs_array_cmplx(
4508 re_source: *const ArrayDescriptor,
4509 im_source: *const ArrayDescriptor,
4510 out_lane_bytes: i32,
4511 result: *mut ArrayDescriptor,
4512 ) {
4513 if re_source.is_null() || result.is_null() {
4514 return;
4515 }
4516 let re = unsafe { &*re_source };
4517 if re.base_addr.is_null() {
4518 return;
4519 }
4520 let im_opt = if im_source.is_null() {
4521 None
4522 } else {
4523 let im = unsafe { &*im_source };
4524 if im.base_addr.is_null() {
4525 None
4526 } else {
4527 Some(im)
4528 }
4529 };
4530 let lane = out_lane_bytes.max(4) as usize;
4531 let elem_size = 2 * lane;
4532 let mut dims = [DimDescriptor::default(); MAX_RANK];
4533 for (i, dim) in dims.iter_mut().enumerate().take(re.rank as usize) {
4534 *dim = DimDescriptor {
4535 lower_bound: re.dims[i].lower_bound,
4536 upper_bound: re.dims[i].upper_bound,
4537 stride: 1,
4538 };
4539 }
4540 let dims_ptr = if re.rank > 0 {
4541 dims.as_ptr()
4542 } else {
4543 ptr::null()
4544 };
4545 afs_allocate_array(result, elem_size as i64, re.rank, dims_ptr, ptr::null_mut());
4546
4547 let res = unsafe { &mut *result };
4548 let total = re.total_elements() as usize;
4549 let re_elem = re.elem_size.max(1) as usize;
4550 let im_elem = im_opt.map(|im| im.elem_size.max(1) as usize).unwrap_or(0);
4551 let re_buf = re.base_addr as *const u8;
4552 let im_buf = im_opt
4553 .map(|im| im.base_addr as *const u8)
4554 .unwrap_or(ptr::null());
4555 let rp_buf = res.base_addr;
4556 for i in 0..total {
4557 let dst_off = i * elem_size;
4558 unsafe {
4559 // Read real lane as f64 from source kind.
4560 let r_val: f64 = match re_elem {
4561 4 => *(re_buf.add(i * 4) as *const f32) as f64,
4562 8 => *(re_buf.add(i * 8) as *const f64),
4563 _ => 0.0,
4564 };
4565 // Read imag lane (zero when source absent).
4566 let i_val: f64 = if im_buf.is_null() {
4567 0.0
4568 } else {
4569 match im_elem {
4570 4 => *(im_buf.add(i * 4) as *const f32) as f64,
4571 8 => *(im_buf.add(i * 8) as *const f64),
4572 _ => 0.0,
4573 }
4574 };
4575 // Write per result kind.
4576 match lane {
4577 4 => {
4578 *(rp_buf.add(dst_off) as *mut f32) = r_val as f32;
4579 *(rp_buf.add(dst_off + 4) as *mut f32) = i_val as f32;
4580 }
4581 8 => {
4582 *(rp_buf.add(dst_off) as *mut f64) = r_val;
4583 *(rp_buf.add(dst_off + 8) as *mut f64) = i_val;
4584 }
4585 _ => {}
4586 }
4587 }
4588 }
4589 }
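
// Illustrative sketch (not part of the runtime): how CMPLX interleaves the two
// real inputs into (re, im) lane pairs, shown on plain f32 slices for the
// single-precision (lane = 4 bytes) case.
#[cfg(test)]
mod cmplx_lane_tests {
    #[test]
    fn cmplx_interleaves_re_and_im() {
        let re = [1.0f32, 2.0, 3.0];
        let im = [10.0f32, 20.0, 30.0];
        let mut out = vec![0.0f32; 2 * re.len()];
        for i in 0..re.len() {
            out[2 * i] = re[i]; // lane 0: real part
            out[2 * i + 1] = im[i]; // lane 1: imaginary part
        }
        assert_eq!(out, vec![1.0, 10.0, 2.0, 20.0, 3.0, 30.0]);
    }
}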
4590
4591 /// F2018 §16.9.144 PACK(ARRAY, MASK [, VECTOR]).
4592 ///
4593 /// Walks `source` and `mask` element-by-element (mask is interpreted
4594 /// element-wise, regardless of source rank, since shapes must conform
4595 /// per the standard). Each source element whose mask element is true
4596 /// is copied into a fresh rank-1 result descriptor.
4597 ///
4598 /// `vector` is optional; when non-null, the result inherits its size
4599 /// (element count) and elements past the masked-true count are filled
4600 /// from `vector`. Otherwise the result size is the count of true
4601 /// values in the mask.
4602 ///
4603 /// `mask` is a Fortran logical array; its element width is whatever its
4604 /// descriptor reports (logical(1)..logical(8)). A zero element means false, anything else means true.
4605 ///
4606 /// The element copy is byte-level via `elem_size` so this works for
4607 /// any non-derived element type (integer/real/complex/logical/character
4608 /// of any kind).
4609 #[no_mangle]
4610 pub extern "C" fn afs_array_pack(
4611 source: *const ArrayDescriptor,
4612 mask: *const ArrayDescriptor,
4613 vector: *const ArrayDescriptor,
4614 result: *mut ArrayDescriptor,
4615 ) {
4616 if source.is_null() || mask.is_null() || result.is_null() {
4617 return;
4618 }
4619 let src = unsafe { &*source };
4620 let msk = unsafe { &*mask };
4621 if src.base_addr.is_null() || msk.base_addr.is_null() {
4622 return;
4623 }
4624 let elem_size = src.elem_size.max(1) as usize;
4625 let total = src.total_elements() as usize;
4626 let mask_total = msk.total_elements() as usize;
4627 let pairs = total.min(mask_total);
4628 let mask_elem = msk.elem_size.max(1) as usize;
4629
4630 // First pass: count true values in the mask, reading at the mask's own
4631 // elem_size — a `logical :: m(:)` now reaches us with elem_size=1
4632 // (matching its storage), whereas the previous fixed 4-byte read stepped
4633 // four elements at a time and ran past the end of the buffer.
4634 let mut true_count: i64 = 0;
4635 for i in 0..pairs {
4636 if unsafe { mask_byte_is_true(msk, i * mask_elem) } {
4637 true_count += 1;
4638 }
4639 }
4640
4641 // Result size: vector's size if provided, else count of trues.
4642 let result_n = if !vector.is_null() {
4643 let vec = unsafe { &*vector };
4644 vec.total_elements()
4645 } else {
4646 true_count
4647 };
4648
4649 // Allocate rank-1 result descriptor.
4650 let dim = DimDescriptor {
4651 lower_bound: 1,
4652 upper_bound: result_n,
4653 stride: 1,
4654 };
4655 let dim_ptr = &dim as *const DimDescriptor;
4656 afs_allocate_array(result, elem_size as i64, 1, dim_ptr, ptr::null_mut());
4657
4658 let res = unsafe { &mut *result };
4659 let sp = src.base_addr as *const u8;
4660 let rp = res.base_addr;
4661
4662 // Second pass: emit masked-true source elements into result.
4663 let mut out_idx: usize = 0;
4664 for i in 0..pairs {
4665 if unsafe { mask_byte_is_true(msk, i * mask_elem) } {
4666 unsafe {
4667 core::ptr::copy_nonoverlapping(
4668 sp.add(i * elem_size),
4669 rp.add(out_idx * elem_size),
4670 elem_size,
4671 );
4672 }
4673 out_idx += 1;
4674 }
4675 }
4676
4677 // Pad the tail from `vector` (if provided and result_n > true_count).
4678 if !vector.is_null() {
4679 let vec = unsafe { &*vector };
4680 if !vec.base_addr.is_null() {
4681 let vp = vec.base_addr as *const u8;
4682 let tail_start = out_idx;
4683 let tail_end = result_n as usize;
4684 for j in tail_start..tail_end {
4685 unsafe {
4686 core::ptr::copy_nonoverlapping(
4687 vp.add(j * elem_size),
4688 rp.add(j * elem_size),
4689 elem_size,
4690 );
4691 }
4692 }
4693 }
4694 }
4695 }
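
// Illustrative sketch (not part of the runtime): PACK's two-pass shape. With a
// VECTOR argument, the result takes VECTOR's size and positions beyond the
// masked-true count are filled from VECTOR — mirrored here on plain slices
// instead of descriptors.
#[cfg(test)]
mod pack_semantics_tests {
    #[test]
    fn pack_with_vector_pads_the_tail() {
        let source = [1i32, 2, 3, 4, 5];
        let mask = [false, true, false, true, false];
        let vector = [-1i32, -2, -3, -4];

        // Result size comes from VECTOR when it is present.
        let mut result = vec![0i32; vector.len()];
        let mut out_idx = 0usize;
        for (i, &keep) in mask.iter().enumerate() {
            if keep {
                result[out_idx] = source[i];
                out_idx += 1;
            }
        }
        // Tail beyond the true count is taken from VECTOR at the same positions.
        for j in out_idx..vector.len() {
            result[j] = vector[j];
        }
        assert_eq!(result, vec![2, 4, -3, -4]);
    }
}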
4696
4697 /// F2018 §16.9.163: RESHAPE(SOURCE, SHAPE [, PAD, ORDER]).
4698 ///
4699 /// Allocates a fresh result descriptor of rank = size(shape) and
4700 /// element-fills it from `source` in array-element order. When
4701 /// `order` is supplied (a permutation of 1..rank), the result is traversed
4702 /// in permuted subscript order — subscript `order(1)` varies fastest, then
4703 /// `order(2)`, and so on — while the source is read in its natural
4704 /// element order. When the result has more elements
4705 /// than the source, the tail is filled cyclically from `pad`.
4706 ///
4707 /// Shape and order arrays are i32 or i64 — read both via the same
4708 /// 64-bit-extended path keyed off the descriptor's elem_size.
4709 #[no_mangle]
4710 pub extern "C" fn afs_array_reshape(
4711 source: *const ArrayDescriptor,
4712 shape: *const ArrayDescriptor,
4713 order: *const ArrayDescriptor,
4714 pad: *const ArrayDescriptor,
4715 result: *mut ArrayDescriptor,
4716 ) {
4717 if source.is_null() || shape.is_null() || result.is_null() {
4718 return;
4719 }
4720 let src = unsafe { &*source };
4721 let shp = unsafe { &*shape };
4722 if src.base_addr.is_null() || shp.base_addr.is_null() {
4723 return;
4724 }
4725 let rank = shp.total_elements() as usize;
4726 if rank == 0 || rank > MAX_RANK {
4727 return;
4728 }
4729
4730 // Read shape extents into a fixed-size array.
4731 let read_int_at = |buf: *const u8, idx: usize, elem_size: usize| -> i64 {
4732 unsafe {
4733 match elem_size {
4734 4 => *(buf.add(idx * 4) as *const i32) as i64,
4735 8 => *(buf.add(idx * 8) as *const i64),
4736 _ => 0,
4737 }
4738 }
4739 };
4740 let shape_buf = shp.base_addr as *const u8;
4741 let shape_elem = shp.elem_size.max(1) as usize;
4742 let mut extents: [i64; MAX_RANK] = [0; MAX_RANK];
4743 for (i, slot) in extents.iter_mut().enumerate().take(rank) {
4744 *slot = read_int_at(shape_buf, i, shape_elem).max(0);
4745 }
4746
4747 // Build dim descriptors and allocate result.
4748 let mut dims = [DimDescriptor::default(); MAX_RANK];
4749 for i in 0..rank {
4750 dims[i] = DimDescriptor {
4751 lower_bound: 1,
4752 upper_bound: extents[i],
4753 stride: 1,
4754 };
4755 }
4756 let elem_size = src.elem_size.max(1);
4757 afs_allocate_array(
4758 result,
4759 elem_size,
4760 rank as i32,
4761 dims.as_ptr(),
4762 ptr::null_mut(),
4763 );
4764 let res = unsafe { &mut *result };
4765 if res.base_addr.is_null() {
4766 return;
4767 }
4768
4769 let total: i64 = extents.iter().take(rank).copied().product();
4770 let total_usize = total as usize;
4771 let src_total = src.total_elements() as usize;
4772 let elem_size_usize = elem_size as usize;
4773
4774 // Read order (identity if absent).
4775 let mut order_perm: [usize; MAX_RANK] = [0; MAX_RANK];
4776 let order_present = !order.is_null() && unsafe { (*order).rank > 0 };
4777 if order_present {
4778 let ord = unsafe { &*order };
4779 let ord_buf = ord.base_addr as *const u8;
4780 let ord_elem = ord.elem_size.max(1) as usize;
4781 let ord_count = ord.total_elements() as usize;
4782 for (i, slot) in order_perm.iter_mut().enumerate().take(rank.min(ord_count)) {
4783 // Convert from 1-based Fortran to 0-based.
4784 *slot = (read_int_at(ord_buf, i, ord_elem) - 1).max(0) as usize;
4785 }
4786 } else {
4787 for (i, slot) in order_perm.iter_mut().enumerate().take(rank) {
4788 *slot = i;
4789 }
4790 }
4791
4792 let pad_present = !pad.is_null() && unsafe { (*pad).total_elements() > 0 };
4793 let (pad_buf, pad_total) = if pad_present {
4794 let p = unsafe { &*pad };
4795 (p.base_addr as *const u8, p.total_elements() as usize)
4796 } else {
4797 (ptr::null(), 0)
4798 };
4799
4800 let sp = src.base_addr as *const u8;
4801 let rp = res.base_addr;
4802
4803 // Walk the fill sequence: `linear` is both the source read position and
4804 // the position in the permuted traversal of the result. Decompose it into
4805 // subscripts over the permuted extents, then apply `order_perm` to recover
4806 // the result subscript and its column-major storage offset.
4807 for linear in 0..total_usize {
4808 // Natural multi-dim subscript: column-major over extents in
4809 // logical order, where logical extents follow the permutation
4810 // (logical_dim k = extents[order_perm[k]]).
4811 let mut idx = linear;
4812 let mut logical_subs: [i64; MAX_RANK] = [0; MAX_RANK];
4813 for k in 0..rank {
4814 let logical_extent = extents[order_perm[k]].max(1) as usize;
4815 logical_subs[k] = (idx % logical_extent) as i64;
4816 idx /= logical_extent;
4817 }
4818 // Translate into result subscript: result_subs[order_perm[k]] = logical_subs[k]
4819 let mut result_subs: [i64; MAX_RANK] = [0; MAX_RANK];
4820 for k in 0..rank {
4821 result_subs[order_perm[k]] = logical_subs[k];
4822 }
4823 // Compute result linear (column-major over result extents).
4824 let mut result_linear: usize = 0;
4825 let mut multiplier: usize = 1;
4826 for k in 0..rank {
4827 result_linear += (result_subs[k] as usize) * multiplier;
4828 multiplier *= extents[k].max(1) as usize;
4829 }
4830 // Source element: element `linear` of the flattened source; past the end, cycle through `pad`; with no pad (an error per the standard) fall back to element 0 rather than read out of bounds.
4831 let src_off = if linear < src_total {
4832 linear * elem_size_usize
4833 } else if pad_total > 0 {
4834 ((linear - src_total) % pad_total) * elem_size_usize
4835 } else {
4836 0
4837 };
4838 let from = if linear < src_total || pad_total == 0 {
4839 sp
4840 } else {
4841 pad_buf
4842 };
4843 unsafe {
4844 core::ptr::copy_nonoverlapping(
4845 from.add(src_off),
4846 rp.add(result_linear * elem_size_usize),
4847 elem_size_usize,
4848 );
4849 }
4850 }
4851 }
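
// Illustrative sketch (not part of the runtime): RESHAPE with ORDER, on plain
// slices. With shape = [2, 3] and order = [2, 1], result subscript 2 varies
// fastest, so source elements 1..6 walk the result row by row instead of
// column by column. The decomposition mirrors the loop above.
#[cfg(test)]
mod reshape_order_tests {
    #[test]
    fn order_permutes_the_fill_direction() {
        let source = [1i32, 2, 3, 4, 5, 6];
        let extents = [2usize, 3usize]; // result shape (2, 3)
        let order = [1usize, 0usize]; // Fortran ORDER = [2, 1], zero-based here
        let mut result = vec![0i32; 6];

        for (linear, &val) in source.iter().enumerate() {
            // Decompose `linear` over the permuted (logical) extents.
            let mut idx = linear;
            let mut subs = [0usize; 2];
            for k in 0..2 {
                let extent = extents[order[k]];
                subs[order[k]] = idx % extent;
                idx /= extent;
            }
            // Column-major offset into the result's own extents.
            result[subs[1] * extents[0] + subs[0]] = val;
        }
        // reshape([1..6], [2,3], order=[2,1]) = [1 2 3; 4 5 6],
        // stored column-major as 1 4 2 5 3 6.
        assert_eq!(result, vec![1, 4, 2, 5, 3, 6]);
    }
}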
4852
4853 /// DOT_PRODUCT(a, b) — vector dot product (real(8) version).
4854 /// Respects strides for non-contiguous array sections.
4855 #[no_mangle]
4856 pub extern "C" fn afs_dot_product_real8(
4857 a: *const ArrayDescriptor,
4858 b: *const ArrayDescriptor,
4859 ) -> f64 {
4860 if a.is_null() || b.is_null() {
4861 return 0.0;
4862 }
4863 let da = unsafe { &*a };
4864 let db = unsafe { &*b };
4865 if da.base_addr.is_null() || db.base_addr.is_null() {
4866 return 0.0;
4867 }
4868 let n = da.dims[0].extent().min(db.dims[0].extent()) as usize;
4869 let stride_a = da.dims[0].stride.max(1) as usize;
4870 let stride_b = db.dims[0].stride.max(1) as usize;
4871 let pa = da.base_addr as *const f64;
4872 let pb = db.base_addr as *const f64;
4873 let mut dot = 0.0;
4874 for i in 0..n {
4875 dot += unsafe { *pa.add(i * stride_a) * *pb.add(i * stride_b) };
4876 }
4877 dot
4878 }
4879
4880 /// DOT_PRODUCT(a, b) — vector dot product (real(4) version).
4881 #[no_mangle]
4882 pub extern "C" fn afs_dot_product_real4(
4883 a: *const ArrayDescriptor,
4884 b: *const ArrayDescriptor,
4885 ) -> f32 {
4886 if a.is_null() || b.is_null() {
4887 return 0.0;
4888 }
4889 let da = unsafe { &*a };
4890 let db = unsafe { &*b };
4891 if da.base_addr.is_null() || db.base_addr.is_null() {
4892 return 0.0;
4893 }
4894 let n = da.dims[0].extent().min(db.dims[0].extent()) as usize;
4895 let stride_a = da.dims[0].stride.max(1) as usize;
4896 let stride_b = db.dims[0].stride.max(1) as usize;
4897 let pa = da.base_addr as *const f32;
4898 let pb = db.base_addr as *const f32;
4899 let mut dot = 0.0;
4900 for i in 0..n {
4901 dot += unsafe { *pa.add(i * stride_a) * *pb.add(i * stride_b) };
4902 }
4903 dot
4904 }
4905
4906 /// DOT_PRODUCT(a, b) — vector dot product (integer(4) version).
4907 #[no_mangle]
4908 pub extern "C" fn afs_dot_product_int(a: *const ArrayDescriptor, b: *const ArrayDescriptor) -> i64 {
4909 if a.is_null() || b.is_null() {
4910 return 0;
4911 }
4912 let da = unsafe { &*a };
4913 let db = unsafe { &*b };
4914 if da.base_addr.is_null() || db.base_addr.is_null() {
4915 return 0;
4916 }
4917 let n = da.dims[0].extent().min(db.dims[0].extent()) as usize;
4918 let stride_a = da.dims[0].stride.max(1) as usize;
4919 let stride_b = db.dims[0].stride.max(1) as usize;
4920 let pa = da.base_addr as *const i32;
4921 let pb = db.base_addr as *const i32;
4922 let mut dot: i64 = 0;
4923 for i in 0..n {
4924 dot += unsafe { (*pa.add(i * stride_a) as i64) * (*pb.add(i * stride_b) as i64) };
4925 }
4926 dot
4927 }
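
// Illustrative sketch (not part of the runtime): how the stride factors in the
// DOT_PRODUCT routines let a non-contiguous section (e.g. v(1:5:2)) share a
// buffer with the full array — element i of the section lives at i * stride in
// the underlying storage. Shown on plain slices instead of descriptors.
#[cfg(test)]
mod dot_product_stride_tests {
    #[test]
    fn strided_dot_product_skips_elements() {
        let a = [1.0f64, 9.0, 2.0, 9.0, 3.0]; // section a(1:5:2) = [1, 2, 3]
        let b = [4.0f64, 5.0, 6.0];
        let (n, stride_a, stride_b) = (3usize, 2usize, 1usize);
        let mut dot = 0.0;
        for i in 0..n {
            dot += a[i * stride_a] * b[i * stride_b];
        }
        // 1*4 + 2*5 + 3*6 = 32
        assert_eq!(dot, 32.0);
    }
}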
4928