| 1 | module ferp_io |
| 2 | !> File I/O handling for FERP |
| 3 | !> Supports dynamic line length (no fixed limit) |
| 4 | !> Uses memory-mapped I/O for improved performance on files |
| 5 | use ferp_kinds |
| 6 | use ferp_mmap |
| 7 | use, intrinsic :: iso_fortran_env, only: input_unit, error_unit, iostat_end, iostat_eor |
| 8 | implicit none |
| 9 | private |
| 10 | |
| 11 | public :: input_source |
| 12 | public :: SOURCE_STDIN, SOURCE_FILE, SOURCE_MMAP |
| 13 | public :: check_binary_file |
| 14 | ! Re-export batch types from ferp_mmap |
| 15 | public :: line_info_t, line_batch_t, BATCH_SIZE |
| 16 | |
| 17 | integer, parameter :: SOURCE_STDIN = 1 |
| 18 | integer, parameter :: SOURCE_FILE = 2 |
| 19 | integer, parameter :: SOURCE_MMAP = 3 |
| 20 | |
| 21 | type :: input_source |
| 22 | integer :: source_type = SOURCE_STDIN |
| 23 | integer :: unit_num = input_unit |
| 24 | character(len=max_path_len) :: filename = '(standard input)' |
| 25 | logical :: is_open = .false. |
| 26 | integer(i64) :: byte_offset = 0 |
| 27 | integer :: line_number = 0 |
| 28 | logical :: is_binary = .false. |
| 29 | logical :: eof_reached = .false. |
| 30 | logical :: null_data_mode = .false. |
| 31 | type(mmap_file_t) :: mmap_file ! Memory-mapped file handle |
| 32 | contains |
| 33 | procedure :: open => source_open |
| 34 | procedure :: close => source_close |
| 35 | procedure :: read_line_dynamic => source_read_line_dynamic |
| 36 | procedure :: read_line_null_dynamic => source_read_line_null_dynamic |
| 37 | procedure :: read_lines_batch => source_read_lines_batch |
| 38 | procedure :: get_line_text => source_get_line_text |
| 39 | procedure :: check_binary => source_check_binary |
| 40 | end type input_source |
| 41 | |
| 42 | contains |
| 43 | |
| 44 | function source_open(this, filename, suppress_errors, null_data) result(success) |
| 45 | !> Open a file or stdin for reading |
| 46 | class(input_source), intent(inout) :: this |
| 47 | character(len=*), intent(in) :: filename |
| 48 | logical, intent(in), optional :: suppress_errors |
| 49 | logical, intent(in), optional :: null_data |
| 50 | logical :: success |
| 51 | |
| 52 | integer :: ios |
| 53 | character(len=256) :: errmsg |
| 54 | logical :: quiet |
| 55 | |
| 56 | quiet = .false. |
| 57 | if (present(suppress_errors)) quiet = suppress_errors |
| 58 | |
| 59 | this%null_data_mode = .false. |
| 60 | if (present(null_data)) this%null_data_mode = null_data |
| 61 | |
| 62 | success = .false. |
| 63 | |
| 64 | ! Reset state (but preserve is_binary if already set by caller) |
| 65 | this%byte_offset = 0 |
| 66 | this%line_number = 0 |
| 67 | this%eof_reached = .false. |
| 68 | |
| 69 | ! Handle stdin |
| 70 | if (filename == '-' .or. len_trim(filename) == 0) then |
| 71 | this%source_type = SOURCE_STDIN |
| 72 | this%unit_num = input_unit |
| 73 | this%filename = '(standard input)' |
| 74 | this%is_open = .true. |
| 75 | success = .true. |
| 76 | return |
| 77 | end if |
| 78 | |
| 79 | this%filename = filename |
| 80 | |
| 81 | ! For null-data mode, use stream access (can't use mmap easily) |
| 82 | if (this%null_data_mode) then |
| 83 | this%source_type = SOURCE_FILE |
| 84 | open(newunit=this%unit_num, file=filename, status='old', action='read', & |
| 85 | access='stream', form='unformatted', iostat=ios, iomsg=errmsg) |
| 86 | if (ios /= 0) then |
| 87 | if (.not. quiet) then |
| 88 | write(error_unit, '(A)') 'ferp: ' // trim(filename) // ': ' // trim(errmsg) |
| 89 | end if |
| 90 | return |
| 91 | end if |
| 92 | this%is_open = .true. |
| 93 | success = .true. |
| 94 | return |
| 95 | end if |
| 96 | |
| 97 | ! Try memory-mapped I/O first (fastest for regular files) |
| 98 | if (this%mmap_file%open(filename)) then |
| 99 | this%source_type = SOURCE_MMAP |
| 100 | this%is_open = .true. |
| 101 | success = .true. |
| 102 | return |
| 103 | end if |
| 104 | |
| 105 | ! Fall back to standard Fortran I/O |
| 106 | this%source_type = SOURCE_FILE |
| 107 | open(newunit=this%unit_num, file=filename, status='old', action='read', & |
| 108 | iostat=ios, iomsg=errmsg) |
| 109 | |
| 110 | if (ios /= 0) then |
| 111 | if (.not. quiet) then |
| 112 | write(error_unit, '(A)') 'ferp: ' // trim(filename) // ': ' // trim(errmsg) |
| 113 | end if |
| 114 | return |
| 115 | end if |
| 116 | |
| 117 | this%is_open = .true. |
| 118 | success = .true. |
| 119 | |
| 120 | end function source_open |
| 121 | |
| 122 | subroutine source_close(this) |
| 123 | !> Close the input source |
| 124 | class(input_source), intent(inout) :: this |
| 125 | |
| 126 | if (this%is_open) then |
| 127 | if (this%source_type == SOURCE_FILE) then |
| 128 | close(this%unit_num) |
| 129 | else if (this%source_type == SOURCE_MMAP) then |
| 130 | call this%mmap_file%close() |
| 131 | end if |
| 132 | end if |
| 133 | |
| 134 | this%is_open = .false. |
| 135 | end subroutine source_close |
| 136 | |
| 137 | function source_read_line_dynamic(this, line, line_num, byte_off) result(success) |
| 138 | !> Read a line from the input source with dynamic allocation |
| 139 | !> Uses mmap for files, standard I/O for stdin |
| 140 | class(input_source), intent(inout) :: this |
| 141 | character(len=:), allocatable, intent(out) :: line |
| 142 | integer, intent(out) :: line_num |
| 143 | integer(i64), intent(out) :: byte_off |
| 144 | logical :: success |
| 145 | |
| 146 | integer :: ios |
| 147 | integer :: line_len |
| 148 | ! Use a generous fixed buffer for reading (64KB handles most lines) |
| 149 | integer, parameter :: READ_BUFFER_SIZE = 65536 |
| 150 | character(len=READ_BUFFER_SIZE) :: buffer |
| 151 | |
| 152 | success = .false. |
| 153 | line_num = 0 |
| 154 | byte_off = 0 |
| 155 | if (allocated(line)) deallocate(line) |
| 156 | |
| 157 | if (.not. this%is_open .or. this%eof_reached) return |
| 158 | |
| 159 | ! Use mmap for memory-mapped files (fastest path) |
| 160 | if (this%source_type == SOURCE_MMAP) then |
| 161 | success = this%mmap_file%read_line(line, line_num, byte_off) |
| 162 | if (.not. success) this%eof_reached = .true. |
| 163 | return |
| 164 | end if |
| 165 | |
| 166 | ! Standard Fortran I/O for stdin and fallback |
| 167 | read(this%unit_num, '(A)', iostat=ios) buffer |
| 168 | |
| 169 | if (ios == iostat_end) then |
| 170 | this%eof_reached = .true. |
| 171 | return |
| 172 | end if |
| 173 | |
| 174 | if (ios /= 0) then |
| 175 | ! Read error |
| 176 | return |
| 177 | end if |
| 178 | |
| 179 | ! Get line length (preserve CR like grep) |
| 180 | line_len = len_trim(buffer) |
| 181 | |
| 182 | ! Allocate result string trimmed to actual length |
| 183 | if (line_len > 0) then |
| 184 | line = buffer(1:line_len) |
| 185 | else |
| 186 | line = '' |
| 187 | end if |
| 188 | |
| 189 | ! Update state |
| 190 | this%line_number = this%line_number + 1 |
| 191 | line_num = this%line_number |
| 192 | byte_off = this%byte_offset |
| 193 | |
| 194 | ! Update byte offset (line length + newline) |
| 195 | this%byte_offset = this%byte_offset + int(line_len, i64) + 1_i64 |
| 196 | |
| 197 | success = .true. |
| 198 | |
| 199 | end function source_read_line_dynamic |
| 200 | |
| 201 | function source_read_line_null_dynamic(this, line, line_num, byte_off) result(success) |
| 202 | !> Read a NUL-terminated line from the input source (for -z mode) |
| 203 | !> Line buffer grows automatically to accommodate any line length |
| 204 | class(input_source), intent(inout) :: this |
| 205 | character(len=:), allocatable, intent(out) :: line |
| 206 | integer, intent(out) :: line_num |
| 207 | integer(i64), intent(out) :: byte_off |
| 208 | logical :: success |
| 209 | |
| 210 | integer :: ios, pos, capacity |
| 211 | character(len=1) :: ch |
| 212 | character(len=:), allocatable :: new_buf |
| 213 | |
| 214 | success = .false. |
| 215 | line_num = 0 |
| 216 | byte_off = 0 |
| 217 | if (allocated(line)) deallocate(line) |
| 218 | |
| 219 | if (.not. this%is_open .or. this%eof_reached) return |
| 220 | |
| 221 | ! Start with initial buffer |
| 222 | capacity = initial_line_len |
| 223 | allocate(character(len=capacity) :: line) |
| 224 | pos = 0 |
| 225 | |
| 226 | ! Read byte by byte until NUL or EOF |
| 227 | do |
| 228 | ! Use formatted read for stdin, unformatted for files |
| 229 | if (this%source_type == SOURCE_STDIN) then |
| 230 | read(this%unit_num, '(A1)', iostat=ios, advance='no') ch |
| 231 | else |
| 232 | read(this%unit_num, iostat=ios) ch |
| 233 | end if |
| 234 | |
| 235 | if (ios == iostat_end) then |
| 236 | this%eof_reached = .true. |
| 237 | if (pos > 0) then |
| 238 | exit ! Return what we have |
| 239 | else |
| 240 | deallocate(line) |
| 241 | return |
| 242 | end if |
| 243 | end if |
| 244 | |
| 245 | if (ios /= 0) then |
| 246 | deallocate(line) |
| 247 | return |
| 248 | end if |
| 249 | |
| 250 | ! Check for NUL terminator |
| 251 | if (ch == char(0)) exit |
| 252 | |
| 253 | ! Skip carriage returns (for Windows line endings in data) |
| 254 | if (ch == char(13)) cycle |
| 255 | |
| 256 | ! Convert embedded newlines to space |
| 257 | if (ch == char(10)) ch = ' ' |
| 258 | |
| 259 | ! Grow buffer if needed |
| 260 | if (pos >= capacity) then |
| 261 | capacity = capacity * 2 |
| 262 | allocate(character(len=capacity) :: new_buf) |
| 263 | if (pos > 0) new_buf(1:pos) = line(1:pos) |
| 264 | call move_alloc(new_buf, line) |
| 265 | end if |
| 266 | |
| 267 | ! Add character to line |
| 268 | pos = pos + 1 |
| 269 | line(pos:pos) = ch |
| 270 | end do |
| 271 | |
| 272 | ! Trim to actual length |
| 273 | if (pos > 0) then |
| 274 | new_buf = line(1:pos) |
| 275 | call move_alloc(new_buf, line) |
| 276 | else |
| 277 | line = '' |
| 278 | end if |
| 279 | |
| 280 | ! Update state |
| 281 | this%line_number = this%line_number + 1 |
| 282 | line_num = this%line_number |
| 283 | byte_off = this%byte_offset |
| 284 | |
| 285 | ! Update byte offset (record length + NUL) |
| 286 | this%byte_offset = this%byte_offset + int(len(line), i64) + 1_i64 |
| 287 | |
| 288 | success = .true. |
| 289 | |
| 290 | end function source_read_line_null_dynamic |
| 291 | |
| 292 | function check_binary_file(filename) result(is_binary) |
| 293 | !> Check if a file is binary by looking for NUL bytes or non-text chars |
| 294 | !> This must be called BEFORE the file is opened for reading |
| 295 | character(len=*), intent(in) :: filename |
| 296 | logical :: is_binary |
| 297 | |
| 298 | integer, parameter :: CHECK_SIZE = 8192 |
| 299 | character(len=CHECK_SIZE) :: buffer |
| 300 | integer :: ios, i, check_unit, bytes_read |
| 301 | integer :: char_code |
| 302 | logical :: file_exists |
| 303 | |
| 304 | is_binary = .false. |
| 305 | |
| 306 | ! Open file in stream mode to check for binary content |
| 307 | inquire(file=filename, exist=file_exists) |
| 308 | if (.not. file_exists) return |
| 309 | |
| 310 | open(newunit=check_unit, file=filename, status='old', action='read', & |
| 311 | access='stream', form='unformatted', iostat=ios) |
| 312 | if (ios /= 0) return |
| 313 | |
| 314 | ! Initialize buffer to spaces |
| 315 | buffer = '' |
| 316 | |
| 317 | read(check_unit, iostat=ios) buffer |
| 318 | close(check_unit) |
| 319 | |
| 320 | ! Determine how many bytes were actually read |
| 321 | bytes_read = CHECK_SIZE |
| 322 | if (ios == iostat_end) then |
| 323 | ! File was smaller than buffer - find actual length |
| 324 | do i = CHECK_SIZE, 1, -1 |
| 325 | if (buffer(i:i) /= char(0)) then |
| 326 | bytes_read = i |
| 327 | exit |
| 328 | end if |
| 329 | end do |
| 330 | else if (ios /= 0) then |
| 331 | return |
| 332 | end if |
| 333 | |
| 334 | ! Check each byte for binary indicators |
| 335 | do i = 1, bytes_read |
| 336 | char_code = ichar(buffer(i:i)) |
| 337 | |
| 338 | ! NUL byte is definitive binary indicator |
| 339 | if (char_code == 0) then |
| 340 | is_binary = .true. |
| 341 | return |
| 342 | end if |
| 343 | |
| 344 | ! Non-printable control chars (except common text ones) |
| 345 | ! Allow: tab (9), newline (10), carriage return (13), form feed (12) |
| 346 | if (char_code < 32 .and. char_code /= 9 .and. char_code /= 10 & |
| 347 | .and. char_code /= 13 .and. char_code /= 12) then |
| 348 | is_binary = .true. |
| 349 | return |
| 350 | end if |
| 351 | end do |
| 352 | |
| 353 | end function check_binary_file |
| 354 | |
| 355 | subroutine source_check_binary(this) |
| 356 | !> Check if the source is binary (wrapper that calls check_binary_file) |
| 357 | !> NOTE: This only works if called BEFORE the file is opened |
| 358 | class(input_source), intent(inout) :: this |
| 359 | |
| 360 | this%is_binary = .false. |
| 361 | if (this%source_type == SOURCE_STDIN) return |
| 362 | |
| 363 | this%is_binary = check_binary_file(trim(this%filename)) |
| 364 | |
| 365 | end subroutine source_check_binary |
| 366 | |
| 367 | function source_read_lines_batch(this, batch) result(success) |
| 368 | !> Read multiple lines as a batch (wrapper for mmap batch read) |
| 369 | !> Only works for mmap sources; returns false for stdin/file |
| 370 | class(input_source), intent(inout) :: this |
| 371 | type(line_batch_t), intent(out) :: batch |
| 372 | logical :: success |
| 373 | |
| 374 | success = .false. |
| 375 | batch%count = 0 |
| 376 | |
| 377 | if (.not. this%is_open .or. this%eof_reached) return |
| 378 | |
| 379 | ! Only mmap sources support batch reading |
| 380 | if (this%source_type == SOURCE_MMAP) then |
| 381 | success = this%mmap_file%read_lines_batch(batch) |
| 382 | if (.not. success) this%eof_reached = .true. |
| 383 | end if |
| 384 | |
| 385 | end function source_read_lines_batch |
| 386 | |
| 387 | function source_get_line_text(this, info) result(line) |
| 388 | !> Get line text from mmap given line info (wrapper) |
| 389 | class(input_source), intent(in) :: this |
| 390 | type(line_info_t), intent(in) :: info |
| 391 | character(len=:), allocatable :: line |
| 392 | |
| 393 | if (this%source_type == SOURCE_MMAP) then |
| 394 | line = this%mmap_file%get_line_text(info) |
| 395 | else |
| 396 | line = '' |
| 397 | end if |
| 398 | |
| 399 | end function source_get_line_text |
| 400 | |
| 401 | end module ferp_io |
| 402 |