| 1 | module ferp_mmap |
| 2 | !> Memory-mapped file I/O for FERP |
| 3 | !> Uses POSIX mmap for efficient file reading |
| 4 | !> Uses SIMD for fast newline scanning on ARM64 |
| 5 | use ferp_kinds |
| 6 | use ferp_simd |
| 7 | use, intrinsic :: iso_c_binding |
| 8 | implicit none |
| 9 | private |
| 10 | |
| 11 | public :: mmap_file_t |
| 12 | public :: mmap_open, mmap_close, mmap_get_line |
| 13 | public :: line_info_t, line_batch_t |
| 14 | public :: BATCH_SIZE |
| 15 | |
| 16 | ! POSIX constants |
| 17 | integer(c_int), parameter :: PROT_READ = 1 |
| 18 | integer(c_int), parameter :: MAP_PRIVATE = 2 |
| 19 | integer(c_int), parameter :: MAP_FAILED = -1 |
| 20 | |
| 21 | ! Batch processing constants |
| 22 | integer, parameter :: BATCH_SIZE = 256 ! Number of lines per batch |
| 23 | |
| 24 | !> Line info for batch processing (pointers into mmap'd memory) |
| 25 | type :: line_info_t |
| 26 | integer(c_size_t) :: start_pos = 0 ! Start position in mmap (0-based) |
| 27 | integer(c_size_t) :: length = 0 ! Length of line (excluding newline) |
| 28 | integer :: line_num = 0 ! Line number (1-based) |
| 29 | integer(i64) :: byte_off = 0 ! Byte offset in file |
| 30 | end type line_info_t |
| 31 | |
| 32 | !> Batch of line info for bulk processing |
| 33 | type :: line_batch_t |
| 34 | type(line_info_t) :: lines(BATCH_SIZE) |
| 35 | integer :: count = 0 ! Number of valid lines in batch |
| 36 | end type line_batch_t |
| 37 | |
| 38 | ! C interfaces |
| 39 | interface |
| 40 | function c_open(pathname, flags) bind(C, name="open") |
| 41 | import :: c_char, c_int |
| 42 | character(kind=c_char), intent(in) :: pathname(*) |
| 43 | integer(c_int), value :: flags |
| 44 | integer(c_int) :: c_open |
| 45 | end function c_open |
| 46 | |
| 47 | function c_close(fd) bind(C, name="close") |
| 48 | import :: c_int |
| 49 | integer(c_int), value :: fd |
| 50 | integer(c_int) :: c_close |
| 51 | end function c_close |
| 52 | |
| 53 | function c_mmap(addr, length, prot, flags, fd, offset) bind(C, name="mmap") |
| 54 | import :: c_ptr, c_size_t, c_int, c_long |
| 55 | type(c_ptr), value :: addr |
| 56 | integer(c_size_t), value :: length |
| 57 | integer(c_int), value :: prot |
| 58 | integer(c_int), value :: flags |
| 59 | integer(c_int), value :: fd |
| 60 | integer(c_long), value :: offset |
| 61 | type(c_ptr) :: c_mmap |
| 62 | end function c_mmap |
| 63 | |
| 64 | function c_munmap(addr, length) bind(C, name="munmap") |
| 65 | import :: c_ptr, c_size_t, c_int |
| 66 | type(c_ptr), value :: addr |
| 67 | integer(c_size_t), value :: length |
| 68 | integer(c_int) :: c_munmap |
| 69 | end function c_munmap |
| 70 | |
| 71 | function c_fstat(fd, statbuf) bind(C, name="fstat") |
| 72 | import :: c_int, c_ptr |
| 73 | integer(c_int), value :: fd |
| 74 | type(c_ptr), value :: statbuf |
| 75 | integer(c_int) :: c_fstat |
| 76 | end function c_fstat |
| 77 | |
| 78 | function c_lseek(fd, offset, whence) bind(C, name="lseek") |
| 79 | import :: c_int, c_long |
| 80 | integer(c_int), value :: fd |
| 81 | integer(c_long), value :: offset |
| 82 | integer(c_int), value :: whence |
| 83 | integer(c_long) :: c_lseek |
| 84 | end function c_lseek |
| 85 | end interface |
| 86 | |
| 87 | ! lseek whence values |
| 88 | integer(c_int), parameter :: SEEK_SET = 0 |
| 89 | integer(c_int), parameter :: SEEK_END = 2 |
| 90 | |
| 91 | !> Memory-mapped file type |
| 92 | type :: mmap_file_t |
| 93 | type(c_ptr) :: data = c_null_ptr |
| 94 | integer(c_size_t) :: size = 0 |
| 95 | integer(c_size_t) :: pos = 0 ! Current position in file |
| 96 | integer :: line_number = 0 |
| 97 | integer(i64) :: byte_offset = 0 |
| 98 | logical :: is_open = .false. |
| 99 | character(len=max_path_len) :: filename = '' |
| 100 | contains |
| 101 | procedure :: open => mmap_open_method |
| 102 | procedure :: close => mmap_close_method |
| 103 | procedure :: read_line => mmap_read_line |
| 104 | procedure :: read_lines_batch => mmap_read_lines_batch |
| 105 | procedure :: get_line_text => mmap_get_line_text |
| 106 | procedure :: reset => mmap_reset |
| 107 | end type mmap_file_t |
| 108 | |
| 109 | contains |
| 110 | |
| 111 | function mmap_open(filename, mfile) result(success) |
| 112 | !> Open a file with memory mapping |
| 113 | character(len=*), intent(in) :: filename |
| 114 | type(mmap_file_t), intent(out) :: mfile |
| 115 | logical :: success |
| 116 | |
| 117 | integer(c_int) :: fd, istat |
| 118 | integer(c_int), parameter :: O_RDONLY = 0 |
| 119 | integer(c_size_t) :: file_size |
| 120 | integer(c_long) :: size_long |
| 121 | |
| 122 | success = .false. |
| 123 | mfile%is_open = .false. |
| 124 | mfile%filename = filename |
| 125 | |
| 126 | ! Open file |
| 127 | fd = c_open(trim(filename) // c_null_char, O_RDONLY) |
| 128 | if (fd < 0) return |
| 129 | |
| 130 | ! Get file size via lseek to end |
| 131 | size_long = c_lseek(fd, 0_c_long, SEEK_END) |
| 132 | if (size_long < 0) then |
| 133 | istat = c_close(fd) |
| 134 | return |
| 135 | end if |
| 136 | file_size = int(size_long, c_size_t) |
| 137 | |
| 138 | ! Seek back to beginning |
| 139 | size_long = c_lseek(fd, 0_c_long, SEEK_SET) |
| 140 | |
| 141 | if (file_size == 0) then |
| 142 | istat = c_close(fd) |
| 143 | mfile%size = 0 |
| 144 | mfile%is_open = .true. |
| 145 | success = .true. |
| 146 | return |
| 147 | end if |
| 148 | |
| 149 | ! Memory map the file |
| 150 | mfile%data = c_mmap(c_null_ptr, file_size, PROT_READ, MAP_PRIVATE, fd, 0_c_long) |
| 151 | istat = c_close(fd) ! Can close fd after mmap |
| 152 | |
| 153 | if (.not. c_associated(mfile%data)) return |
| 154 | |
| 155 | mfile%size = file_size |
| 156 | mfile%pos = 0 |
| 157 | mfile%line_number = 0 |
| 158 | mfile%byte_offset = 0 |
| 159 | mfile%is_open = .true. |
| 160 | success = .true. |
| 161 | |
| 162 | end function mmap_open |
| 163 | |
| 164 | function mmap_open_method(this, filename) result(success) |
| 165 | class(mmap_file_t), intent(inout) :: this |
| 166 | character(len=*), intent(in) :: filename |
| 167 | logical :: success |
| 168 | success = mmap_open(filename, this) |
| 169 | end function mmap_open_method |
| 170 | |
| 171 | subroutine mmap_close(mfile) |
| 172 | !> Close memory-mapped file |
| 173 | type(mmap_file_t), intent(inout) :: mfile |
| 174 | |
| 175 | integer(c_int) :: istat |
| 176 | |
| 177 | if (mfile%is_open .and. c_associated(mfile%data)) then |
| 178 | istat = c_munmap(mfile%data, mfile%size) |
| 179 | end if |
| 180 | |
| 181 | mfile%data = c_null_ptr |
| 182 | mfile%size = 0 |
| 183 | mfile%pos = 0 |
| 184 | mfile%is_open = .false. |
| 185 | |
| 186 | end subroutine mmap_close |
| 187 | |
| 188 | subroutine mmap_close_method(this) |
| 189 | class(mmap_file_t), intent(inout) :: this |
| 190 | call mmap_close(this) |
| 191 | end subroutine mmap_close_method |
| 192 | |
| 193 | subroutine mmap_reset(this) |
| 194 | !> Reset to beginning of file |
| 195 | class(mmap_file_t), intent(inout) :: this |
| 196 | this%pos = 0 |
| 197 | this%line_number = 0 |
| 198 | this%byte_offset = 0 |
| 199 | end subroutine mmap_reset |
| 200 | |
| 201 | function mmap_get_line(mfile, line, line_num, byte_off) result(success) |
| 202 | !> Get next line from memory-mapped file |
| 203 | type(mmap_file_t), intent(inout) :: mfile |
| 204 | character(len=:), allocatable, intent(out) :: line |
| 205 | integer, intent(out) :: line_num |
| 206 | integer(i64), intent(out) :: byte_off |
| 207 | logical :: success |
| 208 | |
| 209 | success = mfile%read_line(line, line_num, byte_off) |
| 210 | end function mmap_get_line |
| 211 | |
| 212 | function mmap_read_line(this, line, line_num, byte_off) result(success) |
| 213 | !> Read next line from memory-mapped file (SIMD-accelerated newline scanning) |
| 214 | class(mmap_file_t), intent(inout) :: this |
| 215 | character(len=:), allocatable, intent(out) :: line |
| 216 | integer, intent(out) :: line_num |
| 217 | integer(i64), intent(out) :: byte_off |
| 218 | logical :: success |
| 219 | |
| 220 | character(len=1, kind=c_char), pointer :: file_data(:) |
| 221 | integer(c_size_t) :: start_pos, end_pos, line_len |
| 222 | integer(c_int64_t) :: newline_pos |
| 223 | integer :: i |
| 224 | |
| 225 | success = .false. |
| 226 | line_num = 0 |
| 227 | byte_off = 0 |
| 228 | if (allocated(line)) deallocate(line) |
| 229 | |
| 230 | if (.not. this%is_open) return |
| 231 | if (this%pos >= this%size) return |
| 232 | |
| 233 | ! Map the C pointer to a Fortran character array |
| 234 | call c_f_pointer(this%data, file_data, [this%size]) |
| 235 | |
| 236 | ! Find start and end of line |
| 237 | start_pos = this%pos + 1 ! 1-based for Fortran |
| 238 | |
| 239 | ! Use SIMD to find newline (16 bytes at a time on ARM64) |
| 240 | newline_pos = simd_find_char_ptr(this%data, int(this%size, c_int64_t), & |
| 241 | int(this%pos, c_int64_t), char(10)) |
| 242 | |
| 243 | if (newline_pos < 0) then |
| 244 | ! No newline found - rest of file is the line |
| 245 | end_pos = this%size + 1 |
| 246 | else |
| 247 | ! Found newline - convert from 0-indexed to 1-indexed |
| 248 | end_pos = int(newline_pos, c_size_t) + 1 |
| 249 | end if |
| 250 | |
| 251 | ! Calculate line length (excluding newline, but preserving CR like grep) |
| 252 | line_len = end_pos - start_pos |
| 253 | |
| 254 | ! Allocate and copy line |
| 255 | if (line_len > 0) then |
| 256 | allocate(character(len=line_len) :: line) |
| 257 | do i = 1, int(line_len) |
| 258 | line(i:i) = file_data(start_pos + i - 1) |
| 259 | end do |
| 260 | else |
| 261 | line = '' |
| 262 | end if |
| 263 | |
| 264 | ! Update state |
| 265 | this%line_number = this%line_number + 1 |
| 266 | line_num = this%line_number |
| 267 | byte_off = int(this%pos, i64) |
| 268 | |
| 269 | ! Move past the newline |
| 270 | if (end_pos <= this%size) then |
| 271 | this%pos = end_pos ! Position after newline (0-based) |
| 272 | else |
| 273 | this%pos = this%size |
| 274 | end if |
| 275 | this%byte_offset = int(this%pos, i64) |
| 276 | |
| 277 | success = .true. |
| 278 | |
| 279 | end function mmap_read_line |
| 280 | |
| 281 | function mmap_read_lines_batch(this, batch) result(success) |
| 282 | !> Read up to BATCH_SIZE lines from memory-mapped file |
| 283 | !> Returns line positions without copying data (zero-copy batch read) |
| 284 | class(mmap_file_t), intent(inout) :: this |
| 285 | type(line_batch_t), intent(out) :: batch |
| 286 | logical :: success |
| 287 | |
| 288 | character(len=1, kind=c_char), pointer :: file_data(:) |
| 289 | integer(c_size_t) :: start_pos, end_pos, line_len |
| 290 | integer(c_int64_t) :: newline_pos |
| 291 | integer :: i |
| 292 | |
| 293 | success = .false. |
| 294 | batch%count = 0 |
| 295 | |
| 296 | if (.not. this%is_open) return |
| 297 | if (this%pos >= this%size) return |
| 298 | if (.not. c_associated(this%data)) return |
| 299 | |
| 300 | ! Map the C pointer to a Fortran character array |
| 301 | call c_f_pointer(this%data, file_data, [this%size]) |
| 302 | |
| 303 | ! Read up to BATCH_SIZE lines |
| 304 | do i = 1, BATCH_SIZE |
| 305 | if (this%pos >= this%size) exit |
| 306 | |
| 307 | start_pos = this%pos ! 0-based position |
| 308 | |
| 309 | ! Use SIMD to find newline |
| 310 | newline_pos = simd_find_char_ptr(this%data, int(this%size, c_int64_t), & |
| 311 | int(this%pos, c_int64_t), char(10)) |
| 312 | |
| 313 | if (newline_pos < 0) then |
| 314 | ! No newline found - rest of file is the line |
| 315 | end_pos = this%size |
| 316 | else |
| 317 | end_pos = int(newline_pos, c_size_t) |
| 318 | end if |
| 319 | |
| 320 | ! Calculate line length (excluding newline, but preserving CR like grep) |
| 321 | line_len = end_pos - start_pos |
| 322 | |
| 323 | ! Store line info |
| 324 | batch%count = batch%count + 1 |
| 325 | batch%lines(batch%count)%start_pos = start_pos |
| 326 | batch%lines(batch%count)%length = line_len |
| 327 | this%line_number = this%line_number + 1 |
| 328 | batch%lines(batch%count)%line_num = this%line_number |
| 329 | batch%lines(batch%count)%byte_off = int(start_pos, i64) |
| 330 | |
| 331 | ! Move past the newline |
| 332 | if (end_pos < this%size) then |
| 333 | this%pos = end_pos + 1 ! Position after newline (0-based) |
| 334 | else |
| 335 | this%pos = this%size |
| 336 | end if |
| 337 | this%byte_offset = int(this%pos, i64) |
| 338 | end do |
| 339 | |
| 340 | success = (batch%count > 0) |
| 341 | |
| 342 | end function mmap_read_lines_batch |
| 343 | |
| 344 | function mmap_get_line_text(this, info) result(line) |
| 345 | !> Extract line text from mmap'd memory given line info |
| 346 | class(mmap_file_t), intent(in) :: this |
| 347 | type(line_info_t), intent(in) :: info |
| 348 | character(len=:), allocatable :: line |
| 349 | |
| 350 | character(len=1, kind=c_char), pointer :: file_data(:) |
| 351 | integer :: i |
| 352 | |
| 353 | if (.not. this%is_open .or. .not. c_associated(this%data)) then |
| 354 | line = '' |
| 355 | return |
| 356 | end if |
| 357 | |
| 358 | if (info%length == 0) then |
| 359 | line = '' |
| 360 | return |
| 361 | end if |
| 362 | |
| 363 | ! Map the C pointer to a Fortran character array |
| 364 | call c_f_pointer(this%data, file_data, [this%size]) |
| 365 | |
| 366 | ! Allocate and copy line (start_pos is 0-based, array is 1-based) |
| 367 | allocate(character(len=info%length) :: line) |
| 368 | do i = 1, int(info%length) |
| 369 | line(i:i) = file_data(info%start_pos + i) |
| 370 | end do |
| 371 | |
| 372 | end function mmap_get_line_text |
| 373 | |
| 374 | end module ferp_mmap |
| 375 |