| 1 | //! Fixed-form (F77) Fortran lexer. |
| 2 | //! |
| 3 | //! Two-pass approach: |
| 4 | //! 1. Preprocess lines: identify comments, extract labels, join continuations, |
| 5 | //! strip columns 73+, handle tab-form extension. |
| 6 | //! 2. Tokenize each logical statement body, handling whitespace insensitivity |
| 7 | //! and Hollerith constants. |
| 8 | //! |
| 9 | //! Produces the same Token types as the free-form lexer. |
| 10 | |
| 11 | use super::{is_keyword, is_known_dot_op, LexError, Position, Span, Token, TokenKind}; |
| 12 | |
| 13 | /// Tokenize fixed-form Fortran source. |
| 14 | pub fn tokenize_fixed(src: &str, file_id: u32) -> Result<Vec<Token>, LexError> { |
| 15 | let statements = preprocess_lines(src, file_id); |
| 16 | let mut tokens = Vec::new(); |
| 17 | |
| 18 | for stmt in &statements { |
| 19 | match stmt { |
| 20 | FixedLine::Comment { text, span } => { |
| 21 | tokens.push(Token { |
| 22 | kind: TokenKind::Comment, |
| 23 | text: text.clone(), |
| 24 | span: *span, |
| 25 | }); |
| 26 | tokens.push(Token { |
| 27 | kind: TokenKind::Newline, |
| 28 | text: "\n".into(), |
| 29 | span: *span, |
| 30 | }); |
| 31 | } |
| 32 | FixedLine::Statement { |
| 33 | label, |
| 34 | body, |
| 35 | start_line, |
| 36 | file_id: fid, |
| 37 | } => { |
| 38 | // Emit label as integer literal if present. |
| 39 | if let Some(label_text) = label { |
| 40 | let label_trimmed = label_text.trim(); |
| 41 | if !label_trimmed.is_empty() { |
| 42 | tokens.push(Token { |
| 43 | kind: TokenKind::IntegerLiteral, |
| 44 | text: label_trimmed.to_string(), |
| 45 | span: Span { |
| 46 | file_id: *fid, |
| 47 | start: Position { |
| 48 | line: *start_line, |
| 49 | col: 1, |
| 50 | }, |
| 51 | end: Position { |
| 52 | line: *start_line, |
| 53 | col: 6, |
| 54 | }, |
| 55 | }, |
| 56 | }); |
| 57 | } |
| 58 | } |
| 59 | |
| 60 | // Tokenize the body with the whitespace-insensitive scanner. |
| 61 | let body_tokens = tokenize_body(body, *fid, *start_line)?; |
| 62 | tokens.extend(body_tokens); |
| 63 | |
| 64 | tokens.push(Token { |
| 65 | kind: TokenKind::Newline, |
| 66 | text: "\n".into(), |
| 67 | span: Span { |
| 68 | file_id: *fid, |
| 69 | start: Position { |
| 70 | line: *start_line, |
| 71 | col: 1, |
| 72 | }, |
| 73 | end: Position { |
| 74 | line: *start_line, |
| 75 | col: 1, |
| 76 | }, |
| 77 | }, |
| 78 | }); |
| 79 | } |
| 80 | FixedLine::Blank { span } => { |
| 81 | tokens.push(Token { |
| 82 | kind: TokenKind::Newline, |
| 83 | text: "\n".into(), |
| 84 | span: *span, |
| 85 | }); |
| 86 | } |
| 87 | } |
| 88 | } |
| 89 | |
| 90 | tokens.push(Token { |
| 91 | kind: TokenKind::Eof, |
| 92 | text: String::new(), |
| 93 | span: Span { |
| 94 | file_id, |
| 95 | start: Position { |
| 96 | line: src.lines().count() as u32 + 1, |
| 97 | col: 1, |
| 98 | }, |
| 99 | end: Position { |
| 100 | line: src.lines().count() as u32 + 1, |
| 101 | col: 1, |
| 102 | }, |
| 103 | }, |
| 104 | }); |
| 105 | |
| 106 | Ok(tokens) |
| 107 | } |
| 108 | |
/// Build an "unexpected character" message for the character at byte
/// offset `pos` in `text`.
///
/// Decodes the full character (UTF-8 aware) instead of casting a single
/// byte, so multi-byte characters are reported intact; falls back to `'?'`
/// when `pos` is out of range or not on a character boundary.
fn unexpected_char_message(text: &str, pos: usize, context: &str) -> String {
    let ch = text
        .get(pos..)
        .and_then(|rest| rest.chars().next())
        .unwrap_or('?');
    format!("{context}: '{ch}'")
}
| 113 | |
| 114 | // ---- Whitespace-insensitive body tokenizer ---- |
| 115 | |
/// Tokenize a fixed-form statement body with whitespace insensitivity.
///
/// Three-phase approach:
/// 1. Protect Hollerith constants (nH...) by converting to string literals before stripping
/// 2. Strip all whitespace outside string literals
/// 3. Tokenize with keyword-splitting: longest keyword prefix match at letter runs
///
/// Span columns are computed as `stripped offset + 7` (column 7 is where a
/// fixed-form body begins), so once whitespace has been stripped they are
/// approximate with respect to the original source line.
fn tokenize_body(body: &str, file_id: u32, line: u32) -> Result<Vec<Token>, LexError> {
    // Phase 1: Convert Hollerith constants to string literals (preserves their spaces).
    let hollerith_protected = protect_hollerith(body);
    // Phase 2: Strip whitespace outside string literals.
    let stripped = strip_whitespace_outside_strings(&hollerith_protected);
    let bytes = stripped.as_bytes();
    let mut tokens = Vec::new();
    let mut pos = 0;

    // Phase 3: dispatch on the first byte of each lexeme. Branch order
    // matters: the BOZ check must run before the identifier scan (else
    // `B'101'` would lex as identifier `B`), and the dot branch must
    // distinguish `.5` (number) from `.EQ.` (operator).
    while pos < bytes.len() {
        let col = (pos as u32) + 7;
        let start = Position { line, col };
        let ch = bytes[pos];

        // Comment (! to end): consumes the remainder of the statement.
        if ch == b'!' {
            tokens.push(Token {
                kind: TokenKind::Comment,
                text: stripped[pos..].to_string(),
                span: Span {
                    file_id,
                    start,
                    end: Position {
                        line,
                        col: col + (bytes.len() - pos) as u32,
                    },
                },
            });
            break;
        }

        // String literal.
        if ch == b'\'' || ch == b'"' {
            let (tok, consumed) = lex_fixed_string(&stripped, pos, file_id, line)?;
            tokens.push(tok);
            pos += consumed;
            continue;
        }

        // Dot-operator or real starting with dot (`.5`).
        if ch == b'.' {
            if pos + 1 < bytes.len() && bytes[pos + 1].is_ascii_digit() {
                let (tok, consumed) = lex_fixed_number(&stripped, pos, file_id, line);
                tokens.push(tok);
                pos += consumed;
            } else {
                let (tok, consumed) = lex_fixed_dot_op(&stripped, pos, file_id, line)?;
                tokens.push(tok);
                pos += consumed;
            }
            continue;
        }

        // Number (integer or real).
        if ch.is_ascii_digit() {
            let (tok, consumed) = lex_fixed_number(&stripped, pos, file_id, line);
            tokens.push(tok);
            pos += consumed;
            continue;
        }

        // BOZ literal: B/O/Z followed by quote (checked before identifiers).
        if matches!(ch, b'B' | b'b' | b'O' | b'o' | b'Z' | b'z')
            && pos + 1 < bytes.len()
            && matches!(bytes[pos + 1], b'\'' | b'"')
        {
            let (tok, consumed) = lex_fixed_boz(&stripped, pos, file_id, line)?;
            tokens.push(tok);
            pos += consumed;
            continue;
        }

        // Letter — keyword or identifier with fixed-form prefix splitting.
        // Prior tokens are passed so the splitter can judge the context.
        if ch.is_ascii_alphabetic() || ch == b'_' {
            let (tok, consumed) =
                lex_fixed_ident_or_keyword(&stripped, pos, file_id, line, &tokens);
            tokens.push(tok);
            pos += consumed;
            continue;
        }

        // Operators and punctuation (errors on anything unrecognized).
        let (tok, consumed) = lex_fixed_punct(&stripped, pos, file_id, line)?;
        tokens.push(tok);
        pos += consumed;
    }

    Ok(tokens)
}
| 211 | |
/// Convert Hollerith constants (nH...) to quoted string literals BEFORE whitespace stripping.
/// This preserves spaces inside Hollerith content: `6H HELLO` → `' HELLO'`.
///
/// Existing string literals are copied verbatim (a doubled quote is the
/// escape form), so an `nH` sequence inside one is never rewritten. The
/// Hollerith count is taken in bytes; if it would overrun the line — or
/// would split a multi-byte UTF-8 character — the scanned digits are
/// emitted unchanged instead of panicking. All copying is done with string
/// slices so multi-byte characters survive intact (a per-byte `as char`
/// copy would mangle them).
fn protect_hollerith(body: &str) -> String {
    let bytes = body.as_bytes();
    let mut result = String::with_capacity(body.len());
    let mut i = 0;

    while i < bytes.len() {
        // Inside a string literal: copy the whole literal verbatim.
        if bytes[i] == b'\'' || bytes[i] == b'"' {
            let quote = bytes[i];
            let lit_start = i;
            i += 1;
            while i < bytes.len() {
                if bytes[i] == quote {
                    i += 1;
                    if i < bytes.len() && bytes[i] == quote {
                        i += 1; // doubled quote: still inside the literal
                    } else {
                        break; // closing quote consumed
                    }
                } else {
                    i += 1;
                }
            }
            result.push_str(&body[lit_start..i]);
            continue;
        }

        // Hollerith candidate: a digit run followed by H/h, not preceded by
        // an identifier character (so `X1H` stays part of an identifier).
        if bytes[i].is_ascii_digit() {
            let preceded_by_alnum =
                i > 0 && (bytes[i - 1].is_ascii_alphanumeric() || bytes[i - 1] == b'_');
            if !preceded_by_alnum {
                let digit_start = i;
                while i < bytes.len() && bytes[i].is_ascii_digit() {
                    i += 1;
                }
                if i < bytes.len() && matches!(bytes[i], b'H' | b'h') {
                    if let Ok(count) = body[digit_start..i].parse::<usize>() {
                        i += 1; // consume the H
                        // `checked_add` + `get` guard overflow, overrun, and
                        // mid-character slices; on failure fall through and
                        // restore the scanned text unchanged.
                        let content = i
                            .checked_add(count)
                            .and_then(|end| body.get(i..end));
                        if let Some(content) = content {
                            // Replace nH<chars> with '<chars>'.
                            result.push('\'');
                            result.push_str(content);
                            result.push('\'');
                            i += count;
                            continue;
                        }
                    }
                }
                // Not a Hollerith — put the scanned digits (and a consumed
                // H, if any) back.
                result.push_str(&body[digit_start..i]);
                continue;
            }
        }

        // Default: copy one full character (not one byte) so multi-byte
        // UTF-8 sequences survive intact.
        let ch_len = body[i..].chars().next().map_or(1, |c| c.len_utf8());
        result.push_str(&body[i..i + ch_len]);
        i += ch_len;
    }
    result
}
| 275 | |
/// Strip spaces and tabs from body text, preserving content inside string
/// literals (doubled quotes are the escape form and stay verbatim).
///
/// Copies string literals and ordinary characters as whole slices, so
/// multi-byte UTF-8 characters survive intact (a per-byte `as char` copy
/// would mangle them).
fn strip_whitespace_outside_strings(body: &str) -> String {
    let bytes = body.as_bytes();
    let mut result = String::with_capacity(body.len());
    let mut i = 0;
    while i < bytes.len() {
        match bytes[i] {
            // String literal: copy the whole thing verbatim.
            q @ (b'\'' | b'"') => {
                let lit_start = i;
                i += 1;
                while i < bytes.len() {
                    if bytes[i] == q {
                        i += 1;
                        if i < bytes.len() && bytes[i] == q {
                            i += 1; // doubled quote: still inside the literal
                        } else {
                            break; // closing quote consumed
                        }
                    } else {
                        i += 1;
                    }
                }
                result.push_str(&body[lit_start..i]);
            }
            // Whitespace outside a literal is dropped.
            b' ' | b'\t' => i += 1,
            // Copy one full character so UTF-8 sequences stay intact.
            _ => {
                let ch_len = body[i..].chars().next().map_or(1, |c| c.len_utf8());
                result.push_str(&body[i..i + ch_len]);
                i += ch_len;
            }
        }
    }
    result
}
| 309 | |
| 310 | /// Lex a string literal in whitespace-stripped body. |
| 311 | fn lex_fixed_string( |
| 312 | text: &str, |
| 313 | pos: usize, |
| 314 | file_id: u32, |
| 315 | line: u32, |
| 316 | ) -> Result<(Token, usize), LexError> { |
| 317 | let bytes = text.as_bytes(); |
| 318 | let quote = bytes[pos]; |
| 319 | let mut end = pos + 1; |
| 320 | let mut tok_text = String::new(); |
| 321 | tok_text.push(quote as char); |
| 322 | |
| 323 | let mut closed = false; |
| 324 | while end < bytes.len() { |
| 325 | tok_text.push(bytes[end] as char); |
| 326 | if bytes[end] == quote { |
| 327 | end += 1; |
| 328 | if end < bytes.len() && bytes[end] == quote { |
| 329 | tok_text.push(bytes[end] as char); |
| 330 | end += 1; |
| 331 | } else { |
| 332 | closed = true; |
| 333 | break; |
| 334 | } |
| 335 | } else { |
| 336 | end += 1; |
| 337 | } |
| 338 | } |
| 339 | |
| 340 | if !closed { |
| 341 | let col = (pos as u32) + 7; |
| 342 | return Err(LexError { |
| 343 | span: Span { |
| 344 | file_id, |
| 345 | start: Position { line, col }, |
| 346 | end: Position { line, col }, |
| 347 | }, |
| 348 | msg: "unterminated string literal in fixed-form body".into(), |
| 349 | }); |
| 350 | } |
| 351 | |
| 352 | let col = (pos as u32) + 7; |
| 353 | Ok(( |
| 354 | Token { |
| 355 | kind: TokenKind::StringLiteral, |
| 356 | text: tok_text, |
| 357 | span: Span { |
| 358 | file_id, |
| 359 | start: Position { line, col }, |
| 360 | end: Position { |
| 361 | line, |
| 362 | col: col + (end - pos) as u32, |
| 363 | }, |
| 364 | }, |
| 365 | }, |
| 366 | end - pos, |
| 367 | )) |
| 368 | } |
| 369 | |
| 370 | /// Lex a dot-operator (.AND., .EQ., .TRUE., .myop.) in whitespace-stripped body. |
| 371 | fn lex_fixed_dot_op( |
| 372 | text: &str, |
| 373 | pos: usize, |
| 374 | file_id: u32, |
| 375 | line: u32, |
| 376 | ) -> Result<(Token, usize), LexError> { |
| 377 | let bytes = text.as_bytes(); |
| 378 | let mut end = pos + 1; // skip first dot |
| 379 | let mut name = String::new(); |
| 380 | |
| 381 | while end < bytes.len() && (bytes[end].is_ascii_alphabetic() || bytes[end] == b'_') { |
| 382 | name.push(bytes[end] as char); |
| 383 | end += 1; |
| 384 | } |
| 385 | |
| 386 | if end < bytes.len() && bytes[end] == b'.' { |
| 387 | end += 1; // closing dot |
| 388 | } |
| 389 | |
| 390 | let lower = name.to_lowercase(); |
| 391 | let col = (pos as u32) + 7; |
| 392 | let tok_text = format!(".{}.", name); |
| 393 | let span = Span { |
| 394 | file_id, |
| 395 | start: Position { line, col }, |
| 396 | end: Position { |
| 397 | line, |
| 398 | col: col + (end - pos) as u32, |
| 399 | }, |
| 400 | }; |
| 401 | |
| 402 | if lower == "true" || lower == "false" { |
| 403 | // Check for kind suffix. |
| 404 | let mut full_text = tok_text; |
| 405 | if end < bytes.len() && bytes[end] == b'_' { |
| 406 | full_text.push('_'); |
| 407 | end += 1; |
| 408 | while end < bytes.len() && (bytes[end].is_ascii_alphanumeric() || bytes[end] == b'_') { |
| 409 | full_text.push(bytes[end] as char); |
| 410 | end += 1; |
| 411 | } |
| 412 | } |
| 413 | return Ok(( |
| 414 | Token { |
| 415 | kind: TokenKind::LogicalLiteral, |
| 416 | text: full_text, |
| 417 | span, |
| 418 | }, |
| 419 | end - pos, |
| 420 | )); |
| 421 | } |
| 422 | |
| 423 | let kind = if is_known_dot_op(&lower) { |
| 424 | TokenKind::DotOp(lower) |
| 425 | } else { |
| 426 | TokenKind::DefinedOp(name.to_lowercase()) |
| 427 | }; |
| 428 | |
| 429 | Ok(( |
| 430 | Token { |
| 431 | kind, |
| 432 | text: tok_text, |
| 433 | span, |
| 434 | }, |
| 435 | end - pos, |
| 436 | )) |
| 437 | } |
| 438 | |
/// Lex a number (integer or real) in whitespace-stripped body.
///
/// Disambiguation heuristics (order-sensitive):
/// - a `.` after digits joins the number only when it cannot start a
///   dot-operator, so `5.EQ.` lexes as `5` then `.eq.` while `5.` and
///   `5.e3` stay real literals;
/// - `e`/`d` begins an exponent only when followed by a digit or a signed
///   digit, so `10DO` lexes as `10` + identifier run `DO`;
/// - a trailing `_kind` suffix is absorbed into the literal text.
fn lex_fixed_number(text: &str, pos: usize, file_id: u32, line: u32) -> (Token, usize) {
    let bytes = text.as_bytes();
    let mut end = pos;
    let mut is_real = false;
    let mut tok_text = String::new();

    // Leading digits.
    while end < bytes.len() && bytes[end].is_ascii_digit() {
        tok_text.push(bytes[end] as char);
        end += 1;
    }

    // Decimal point — but not if followed by letter (dot-op like .EQ.).
    if end < bytes.len() && bytes[end] == b'.' {
        let after_dot = if end + 1 < bytes.len() {
            bytes[end + 1]
        } else {
            0 // sentinel: treat end-of-text as "nothing follows"
        };
        let dot_is_numeric = after_dot.is_ascii_digit()
            || tok_text.is_empty() // leading dot — caller saw `.<digit>`
            || {
                // Check for exponent: .e5 vs .eq.
                if matches!(after_dot, b'e' | b'E' | b'd' | b'D') {
                    let after_ed = if end + 2 < bytes.len() { bytes[end + 2] } else { 0 };
                    matches!(after_ed, b'0'..=b'9' | b'+' | b'-')
                } else {
                    !after_dot.is_ascii_alphabetic() // 5. followed by op/end
                }
            };

        if dot_is_numeric {
            is_real = true;
            tok_text.push(bytes[end] as char);
            end += 1;
            // Fractional digits.
            while end < bytes.len() && bytes[end].is_ascii_digit() {
                tok_text.push(bytes[end] as char);
                end += 1;
            }
        }
    }

    // Exponent — only consume e/d if followed by digit or +/- then digit.
    // This prevents `10DO` from being lexed as real `10D` + identifier `O`.
    if end < bytes.len() && matches!(bytes[end], b'e' | b'E' | b'd' | b'D') {
        let after_ed = if end + 1 < bytes.len() {
            bytes[end + 1]
        } else {
            0
        };
        let has_exponent_digits = after_ed.is_ascii_digit()
            || (matches!(after_ed, b'+' | b'-')
                && end + 2 < bytes.len()
                && bytes[end + 2].is_ascii_digit());

        if has_exponent_digits {
            is_real = true;
            tok_text.push(bytes[end] as char); // e/E/d/D
            end += 1;
            if end < bytes.len() && matches!(bytes[end], b'+' | b'-') {
                tok_text.push(bytes[end] as char);
                end += 1;
            }
            while end < bytes.len() && bytes[end].is_ascii_digit() {
                tok_text.push(bytes[end] as char);
                end += 1;
            }
        }
    }

    // Kind suffix (`_4`, `_ikind`): absorbed into the literal.
    if end < bytes.len() && bytes[end] == b'_' {
        tok_text.push(bytes[end] as char);
        end += 1;
        while end < bytes.len() && (bytes[end].is_ascii_alphanumeric() || bytes[end] == b'_') {
            tok_text.push(bytes[end] as char);
            end += 1;
        }
    }

    let col = (pos as u32) + 7;
    let kind = if is_real {
        TokenKind::RealLiteral
    } else {
        TokenKind::IntegerLiteral
    };
    (
        Token {
            kind,
            text: tok_text,
            span: Span {
                file_id,
                start: Position { line, col },
                end: Position {
                    line,
                    col: col + (end - pos) as u32,
                },
            },
        },
        end - pos,
    )
}
| 542 | |
| 543 | /// Lex an identifier or keyword in whitespace-stripped fixed-form body. |
| 544 | /// |
| 545 | /// Fixed-form removes spaces from the statement body, so common source like |
| 546 | /// `PROGRAM HELLO` and `INTEGER I, N` reaches us as `PROGRAMHELLO` and |
| 547 | /// `INTEGERI,N`. The parser does not have enough context to recover those |
| 548 | /// boundaries reliably from a single opaque identifier token, so the fixed-form |
| 549 | /// lexer splits a small set of keyword prefixes when we are at a statement |
| 550 | /// boundary or another keyword-following context. |
| 551 | /// |
| 552 | /// The DO/assignment ambiguity still needs special handling before the generic |
| 553 | /// prefix splitter because `DO10I=1,10` is a loop while `DO10I=1.10` is an |
| 554 | /// assignment. |
| 555 | fn lex_fixed_ident_or_keyword( |
| 556 | text: &str, |
| 557 | pos: usize, |
| 558 | file_id: u32, |
| 559 | line: u32, |
| 560 | prior_tokens: &[Token], |
| 561 | ) -> (Token, usize) { |
| 562 | let bytes = text.as_bytes(); |
| 563 | let mut run_end = pos; |
| 564 | while run_end < bytes.len() |
| 565 | && (bytes[run_end].is_ascii_alphanumeric() || bytes[run_end] == b'_') |
| 566 | { |
| 567 | run_end += 1; |
| 568 | } |
| 569 | let run = &text[pos..run_end]; |
| 570 | let run_lower = run.to_lowercase(); |
| 571 | |
| 572 | // DO/assignment ambiguity: if the run starts with "do" followed by digits, |
| 573 | // check if this is a DO loop (has comma after =) or an assignment. |
| 574 | if run_lower.starts_with("do") |
| 575 | && run.len() > 2 |
| 576 | && run.as_bytes()[2].is_ascii_digit() |
| 577 | && is_do_loop_context(text, pos + 2) |
| 578 | { |
| 579 | // IS a DO loop — emit just "DO" (2 chars). Subsequent calls |
| 580 | // will pick up the label (digits) and variable (letters) separately. |
| 581 | let col = (pos as u32) + 7; |
| 582 | return ( |
| 583 | Token { |
| 584 | kind: TokenKind::Identifier, |
| 585 | text: run[..2].to_string(), |
| 586 | span: Span { |
| 587 | file_id, |
| 588 | start: Position { line, col }, |
| 589 | end: Position { line, col: col + 2 }, |
| 590 | }, |
| 591 | }, |
| 592 | 2, |
| 593 | ); |
| 594 | } |
| 595 | |
| 596 | if let Some(prefix_len) = split_fixed_keyword_prefix(text, pos, run, prior_tokens) { |
| 597 | return make_ident_token(&run[..prefix_len], pos, file_id, line); |
| 598 | } |
| 599 | |
| 600 | // Emit the entire alphanumeric run as one identifier. |
| 601 | make_ident_token(run, pos, file_id, line) |
| 602 | } |
| 603 | |
| 604 | fn split_fixed_keyword_prefix( |
| 605 | text: &str, |
| 606 | pos: usize, |
| 607 | run: &str, |
| 608 | prior_tokens: &[Token], |
| 609 | ) -> Option<usize> { |
| 610 | if !allow_fixed_keyword_split(prior_tokens) || run.len() <= 4 { |
| 611 | return None; |
| 612 | } |
| 613 | |
| 614 | let trailing = text.as_bytes().get(pos + run.len()).copied(); |
| 615 | if matches!(trailing, Some(b'=') | Some(b'%')) { |
| 616 | return None; |
| 617 | } |
| 618 | |
| 619 | for prefix_len in (4..run.len()).rev() { |
| 620 | let prefix = &run[..prefix_len]; |
| 621 | let prefix_lower = prefix.to_ascii_lowercase(); |
| 622 | let suffix = &run[prefix_len..]; |
| 623 | let suffix_first = suffix.as_bytes()[0]; |
| 624 | |
| 625 | let is_fixed_keyword = prefix_lower == "endtype" || is_keyword(prefix).is_some(); |
| 626 | if !is_fixed_keyword { |
| 627 | continue; |
| 628 | } |
| 629 | |
| 630 | if suffix_first.is_ascii_digit() && !matches!(prefix_lower.as_str(), "goto" | "call") { |
| 631 | continue; |
| 632 | } |
| 633 | |
| 634 | return Some(prefix_len); |
| 635 | } |
| 636 | |
| 637 | None |
| 638 | } |
| 639 | |
| 640 | fn allow_fixed_keyword_split(prior_tokens: &[Token]) -> bool { |
| 641 | let Some(prev) = prior_tokens.last() else { |
| 642 | return true; |
| 643 | }; |
| 644 | |
| 645 | match prev.kind { |
| 646 | TokenKind::Comma | TokenKind::ColonColon => true, |
| 647 | TokenKind::Identifier => matches!( |
| 648 | prev.text.to_ascii_lowercase().as_str(), |
| 649 | "integer" |
| 650 | | "real" |
| 651 | | "doubleprecision" |
| 652 | | "doublecomplex" |
| 653 | | "complex" |
| 654 | | "character" |
| 655 | | "logical" |
| 656 | | "type" |
| 657 | | "class" |
| 658 | | "implicit" |
| 659 | | "program" |
| 660 | | "module" |
| 661 | | "submodule" |
| 662 | | "subroutine" |
| 663 | | "function" |
| 664 | | "entry" |
| 665 | | "call" |
| 666 | | "pure" |
| 667 | | "impure" |
| 668 | | "elemental" |
| 669 | | "recursive" |
| 670 | | "end" |
| 671 | | "endtype" |
| 672 | ), |
| 673 | _ => false, |
| 674 | } |
| 675 | } |
| 676 | |
| 677 | fn make_ident_token(text: &str, pos: usize, file_id: u32, line: u32) -> (Token, usize) { |
| 678 | let col = (pos as u32) + 7; |
| 679 | ( |
| 680 | Token { |
| 681 | kind: TokenKind::Identifier, |
| 682 | text: text.to_string(), |
| 683 | span: Span { |
| 684 | file_id, |
| 685 | start: Position { line, col }, |
| 686 | end: Position { |
| 687 | line, |
| 688 | col: col + text.len() as u32, |
| 689 | }, |
| 690 | }, |
| 691 | }, |
| 692 | text.len(), |
| 693 | ) |
| 694 | } |
| 695 | |
| 696 | /// Check if the rest of the statement after DO+digits looks like a DO loop. |
| 697 | /// A DO loop has: DO [label] variable = start , end [, step] |
| 698 | /// An assignment has: DO[label][var] = expr (no top-level comma after =). |
| 699 | fn is_do_loop_context(text: &str, after_do: usize) -> bool { |
| 700 | let bytes = text.as_bytes(); |
| 701 | |
| 702 | // Find '=' that is not inside strings or parens. |
| 703 | let eq_pos = find_top_level_char(bytes, after_do, b'='); |
| 704 | let eq_pos = match eq_pos { |
| 705 | Some(p) => p, |
| 706 | None => return false, |
| 707 | }; |
| 708 | |
| 709 | // Make sure '=' is not '==' (comparison). |
| 710 | if eq_pos + 1 < bytes.len() && bytes[eq_pos + 1] == b'=' { |
| 711 | return false; |
| 712 | } |
| 713 | |
| 714 | // Check for a top-level comma after the '='. |
| 715 | find_top_level_char(bytes, eq_pos + 1, b',').is_some() |
| 716 | } |
| 717 | |
/// Find the first occurrence of `target` at the top level: outside every
/// string literal and outside all parentheses. Scanning starts at `start`.
fn find_top_level_char(bytes: &[u8], start: usize, target: u8) -> Option<usize> {
    let mut depth = 0i32;
    let mut i = start;

    while let Some(&b) = bytes.get(i) {
        // Skip over a string literal, honoring doubled-quote escapes.
        if b == b'\'' || b == b'"' {
            i += 1;
            loop {
                match bytes.get(i) {
                    None => break,
                    Some(&c) if c == b => {
                        i += 1;
                        if bytes.get(i) == Some(&b) {
                            i += 1; // doubled quote: still inside
                        } else {
                            break; // closing quote
                        }
                    }
                    Some(_) => i += 1,
                }
            }
            continue;
        }

        match b {
            b'(' => depth += 1,
            b')' => depth -= 1,
            _ if b == target && depth == 0 => return Some(i),
            _ => {}
        }
        i += 1;
    }
    None
}
| 759 | |
| 760 | /// Lex a BOZ literal in fixed-form body. |
| 761 | fn lex_fixed_boz( |
| 762 | text: &str, |
| 763 | pos: usize, |
| 764 | file_id: u32, |
| 765 | line: u32, |
| 766 | ) -> Result<(Token, usize), LexError> { |
| 767 | let bytes = text.as_bytes(); |
| 768 | let mut end = pos; |
| 769 | let mut tok_text = String::new(); |
| 770 | |
| 771 | tok_text.push(bytes[end] as char); // B/O/Z |
| 772 | end += 1; |
| 773 | let quote = bytes[end]; |
| 774 | tok_text.push(quote as char); // opening quote |
| 775 | end += 1; |
| 776 | |
| 777 | while end < bytes.len() && bytes[end] != quote { |
| 778 | tok_text.push(bytes[end] as char); |
| 779 | end += 1; |
| 780 | } |
| 781 | if end >= bytes.len() { |
| 782 | return Err(LexError { |
| 783 | span: Span { |
| 784 | file_id, |
| 785 | start: Position { |
| 786 | line, |
| 787 | col: (pos as u32) + 7, |
| 788 | }, |
| 789 | end: Position { |
| 790 | line, |
| 791 | col: (pos as u32) + 7, |
| 792 | }, |
| 793 | }, |
| 794 | msg: "unterminated BOZ literal".into(), |
| 795 | }); |
| 796 | } |
| 797 | tok_text.push(bytes[end] as char); // closing quote |
| 798 | end += 1; |
| 799 | |
| 800 | let col = (pos as u32) + 7; |
| 801 | Ok(( |
| 802 | Token { |
| 803 | kind: TokenKind::BozLiteral, |
| 804 | text: tok_text, |
| 805 | span: Span { |
| 806 | file_id, |
| 807 | start: Position { line, col }, |
| 808 | end: Position { |
| 809 | line, |
| 810 | col: col + (end - pos) as u32, |
| 811 | }, |
| 812 | }, |
| 813 | }, |
| 814 | end - pos, |
| 815 | )) |
| 816 | } |
| 817 | |
| 818 | /// Lex an operator or punctuation in whitespace-stripped body. |
| 819 | fn lex_fixed_punct( |
| 820 | text: &str, |
| 821 | pos: usize, |
| 822 | file_id: u32, |
| 823 | line: u32, |
| 824 | ) -> Result<(Token, usize), LexError> { |
| 825 | let bytes = text.as_bytes(); |
| 826 | let ch = bytes[pos]; |
| 827 | let next = if pos + 1 < bytes.len() { |
| 828 | bytes[pos + 1] |
| 829 | } else { |
| 830 | 0 |
| 831 | }; |
| 832 | let col = (pos as u32) + 7; |
| 833 | let start = Position { line, col }; |
| 834 | |
| 835 | let (kind, tok_text, consumed) = match ch { |
| 836 | b'+' => (TokenKind::Plus, "+", 1), |
| 837 | b'-' => (TokenKind::Minus, "-", 1), |
| 838 | b'*' if next == b'*' => (TokenKind::Power, "**", 2), |
| 839 | b'*' => (TokenKind::Star, "*", 1), |
| 840 | b'/' if next == b'/' => (TokenKind::Concat, "//", 2), |
| 841 | b'/' if next == b'=' => (TokenKind::Ne, "/=", 2), |
| 842 | b'/' => (TokenKind::Slash, "/", 1), |
| 843 | b'=' if next == b'=' => (TokenKind::Eq, "==", 2), |
| 844 | b'=' if next == b'>' => (TokenKind::Arrow, "=>", 2), |
| 845 | b'=' => (TokenKind::Assign, "=", 1), |
| 846 | b'<' if next == b'=' => (TokenKind::Le, "<=", 2), |
| 847 | b'<' => (TokenKind::Lt, "<", 1), |
| 848 | b'>' if next == b'=' => (TokenKind::Ge, ">=", 2), |
| 849 | b'>' => (TokenKind::Gt, ">", 1), |
| 850 | b'(' => (TokenKind::LParen, "(", 1), |
| 851 | b')' => (TokenKind::RParen, ")", 1), |
| 852 | b'[' => (TokenKind::LBracket, "[", 1), |
| 853 | b']' => (TokenKind::RBracket, "]", 1), |
| 854 | b',' => (TokenKind::Comma, ",", 1), |
| 855 | b':' if next == b':' => (TokenKind::ColonColon, "::", 2), |
| 856 | b':' => (TokenKind::Colon, ":", 1), |
| 857 | b';' => (TokenKind::Semicolon, ";", 1), |
| 858 | b'%' => (TokenKind::Percent, "%", 1), |
| 859 | b'&' => (TokenKind::Ampersand, "&", 1), |
| 860 | _ => { |
| 861 | return Err(LexError { |
| 862 | span: Span { |
| 863 | file_id, |
| 864 | start, |
| 865 | end: start, |
| 866 | }, |
| 867 | msg: unexpected_char_message(text, pos, "unexpected character in fixed-form body"), |
| 868 | }); |
| 869 | } |
| 870 | }; |
| 871 | |
| 872 | Ok(( |
| 873 | Token { |
| 874 | kind, |
| 875 | text: tok_text.into(), |
| 876 | span: Span { |
| 877 | file_id, |
| 878 | start, |
| 879 | end: Position { |
| 880 | line, |
| 881 | col: col + consumed as u32, |
| 882 | }, |
| 883 | }, |
| 884 | }, |
| 885 | consumed, |
| 886 | )) |
| 887 | } |
| 888 | |
| 889 | // ---- Line preprocessing ---- |
| 890 | |
/// One logical line produced by fixed-form preprocessing.
enum FixedLine {
    /// A full-line comment (`C`, `c`, `*`, or `!` in column 1), verbatim.
    Comment {
        text: String,
        span: Span,
    },
    /// A statement: the optional label from columns 1-5 and the body text
    /// of the initial line with all continuation lines joined on.
    Statement {
        label: Option<String>,
        body: String,
        start_line: u32, // line number of the statement's initial line
        file_id: u32,
    },
    /// A blank source line.
    Blank {
        span: Span,
    },
}
| 906 | |
/// Preprocess fixed-form lines: identify comments, extract labels, join
/// continuations, strip columns 73+, handle tab-form.
///
/// Emits one `FixedLine` per logical line. Comment and blank lines that
/// appear between a statement and its continuations are handled inside the
/// inner loop so the continuation chain is not broken; `start_line` always
/// refers to the statement's initial physical line.
fn preprocess_lines(src: &str, file_id: u32) -> Vec<FixedLine> {
    let lines: Vec<&str> = src.lines().collect();
    let mut result = Vec::new();
    let mut i = 0;

    while i < lines.len() {
        let line = lines[i];
        let line_num = (i + 1) as u32;

        // Blank line.
        if line.trim().is_empty() {
            result.push(FixedLine::Blank {
                span: Span {
                    file_id,
                    start: Position {
                        line: line_num,
                        col: 1,
                    },
                    end: Position {
                        line: line_num,
                        col: 1,
                    },
                },
            });
            i += 1;
            continue;
        }

        let first_byte = line.as_bytes().first().copied().unwrap_or(0);

        // Comment line: C, c, *, or ! in column 1.
        if matches!(first_byte, b'C' | b'c' | b'*' | b'!') {
            result.push(FixedLine::Comment {
                text: line.to_string(),
                span: Span {
                    file_id,
                    start: Position {
                        line: line_num,
                        col: 1,
                    },
                    end: Position {
                        line: line_num,
                        col: line.len() as u32,
                    },
                },
            });
            i += 1;
            continue;
        }

        // Extract columns from this line (label cols 1-5, body from col 7).
        let (label, body) = extract_fixed_columns(line);

        // Collect continuation lines into `full_body`.
        let start_line = line_num;
        let mut full_body = body;
        i += 1;

        while i < lines.len() {
            let next = lines[i];

            // Blank lines between continuations: skip them only if the line
            // after the blank is actually a continuation. Otherwise, the blank
            // terminates the statement and should be emitted by the outer loop.
            if next.trim().is_empty() {
                // Peek ahead: is the line after this blank a continuation?
                let lookahead = i + 1;
                if lookahead < lines.len() && is_continuation_line(lines[lookahead]) {
                    i += 1;
                    continue;
                }
                break; // blank line ends the statement
            }

            let next_first = next.as_bytes().first().copied().unwrap_or(0);
            // Comment lines between continuations: skip them.
            if matches!(next_first, b'C' | b'c' | b'*' | b'!') {
                // Emit the comment but don't break the continuation.
                result.push(FixedLine::Comment {
                    text: next.to_string(),
                    span: Span {
                        file_id,
                        start: Position {
                            line: (i + 1) as u32,
                            col: 1,
                        },
                        end: Position {
                            line: (i + 1) as u32,
                            col: next.len() as u32,
                        },
                    },
                });
                i += 1;
                continue;
            }

            // Check column 6 for continuation marker; append its body.
            if is_continuation_line(next) {
                let (_, cont_body) = extract_fixed_columns(next);
                full_body.push_str(&cont_body);
                i += 1;
            } else {
                break;
            }
        }

        // An all-blank label (columns 1-5 empty) becomes `None`.
        result.push(FixedLine::Statement {
            label: if label.trim().is_empty() {
                None
            } else {
                Some(label)
            },
            body: full_body,
            start_line,
            file_id,
        });
    }

    result
}
| 1029 | |
| 1030 | /// Check if a line is a continuation line (non-space, non-zero in column 6). |
| 1031 | fn is_continuation_line(line: &str) -> bool { |
| 1032 | let bytes = line.as_bytes(); |
| 1033 | |
| 1034 | // Tab-form: tab followed by digit 1-9 is continuation. |
| 1035 | if bytes.first() == Some(&b'\t') { |
| 1036 | if let Some(&d) = bytes.get(1) { |
| 1037 | return (b'1'..=b'9').contains(&d); |
| 1038 | } |
| 1039 | } |
| 1040 | |
| 1041 | // Standard: column 6 (0-indexed: byte 5) is non-space, non-zero. |
| 1042 | if bytes.len() >= 6 { |
| 1043 | let col6 = bytes[5]; |
| 1044 | return col6 != b' ' && col6 != b'0' && col6 != b'\t'; |
| 1045 | } |
| 1046 | |
| 1047 | false |
| 1048 | } |
| 1049 | |
| 1050 | /// Extract label (columns 1-5) and body (columns 7-72) from a fixed-form line. |
| 1051 | /// Handles tab-form extension. |
| 1052 | fn extract_fixed_columns(line: &str) -> (String, String) { |
| 1053 | let bytes = line.as_bytes(); |
| 1054 | |
| 1055 | // Tab-form: if first character is a tab, everything after is body (or continuation). |
| 1056 | if bytes.first() == Some(&b'\t') { |
| 1057 | // Tab followed by digit 1-9: continuation (body starts after the digit). |
| 1058 | if let Some(&d) = bytes.get(1) { |
| 1059 | if (b'1'..=b'9').contains(&d) { |
| 1060 | let body = if bytes.len() > 2 { |
| 1061 | String::from_utf8_lossy(&bytes[2..]).to_string() |
| 1062 | } else { |
| 1063 | String::new() |
| 1064 | }; |
| 1065 | return (String::new(), body); |
| 1066 | } |
| 1067 | } |
| 1068 | // Tab followed by anything else: body starts at position after tab. |
| 1069 | let body = if bytes.len() > 1 { |
| 1070 | String::from_utf8_lossy(&bytes[1..]).to_string() |
| 1071 | } else { |
| 1072 | String::new() |
| 1073 | }; |
| 1074 | return (String::new(), body); |
| 1075 | } |
| 1076 | |
| 1077 | // Standard fixed-form: columns 1-5 label, column 6 continuation marker, 7-72 body. |
| 1078 | let label = if bytes.len() >= 5 { |
| 1079 | String::from_utf8_lossy(&bytes[0..5]).to_string() |
| 1080 | } else { |
| 1081 | String::from_utf8_lossy(bytes).to_string() |
| 1082 | }; |
| 1083 | |
| 1084 | let body_start = 6.min(bytes.len()); |
| 1085 | let body_end = 72.min(bytes.len()); // columns 73+ are ignored |
| 1086 | let body = if body_start < bytes.len() { |
| 1087 | String::from_utf8_lossy(&bytes[body_start..body_end]).to_string() |
| 1088 | } else { |
| 1089 | String::new() |
| 1090 | }; |
| 1091 | |
| 1092 | (label, body) |
| 1093 | } |
| 1094 | |
| 1095 | // ---- Hollerith constants ---- |
| 1096 | |
#[cfg(test)]
mod tests {
    use super::*;
    use crate::lexer::TokenKind;

    /// Tokenize fixed-form source with a dummy file id, panicking on lex errors.
    fn fixed_toks(src: &str) -> Vec<Token> {
        tokenize_fixed(src, 0).unwrap()
    }

    /// Token kinds only, with Eof/Newline noise filtered out.
    fn fixed_kinds(src: &str) -> Vec<TokenKind> {
        fixed_toks(src)
            .into_iter()
            .map(|t| t.kind)
            .filter(|k| !matches!(k, TokenKind::Eof | TokenKind::Newline))
            .collect()
    }

    /// Token texts only, with Eof/Newline/Comment filtered out.
    fn fixed_texts(src: &str) -> Vec<String> {
        fixed_toks(src)
            .into_iter()
            .filter(|t| {
                !matches!(
                    t.kind,
                    TokenKind::Eof | TokenKind::Newline | TokenKind::Comment
                )
            })
            .map(|t| t.text)
            .collect()
    }

    // ---- Comment detection ----
    // Any of C/c/*/! in column 1 marks a full-line comment.

    #[test]
    fn comment_c_uppercase() {
        let k = fixed_kinds("C This is a comment\n");
        assert_eq!(k, vec![TokenKind::Comment]);
    }

    #[test]
    fn comment_c_lowercase() {
        let k = fixed_kinds("c This is a comment\n");
        assert_eq!(k, vec![TokenKind::Comment]);
    }

    #[test]
    fn comment_star() {
        let k = fixed_kinds("* This is a comment\n");
        assert_eq!(k, vec![TokenKind::Comment]);
    }

    #[test]
    fn comment_bang() {
        let k = fixed_kinds("! This is a comment\n");
        assert_eq!(k, vec![TokenKind::Comment]);
    }

    // ---- Statement labels ----

    #[test]
    fn statement_with_label() {
        // "   10 CONTINUE" — label 10 in columns 1-5, CONTINUE in 7+
        let texts = fixed_texts("   10 CONTINUE\n");
        assert!(texts.contains(&"10".to_string()), "got: {:?}", texts);
        assert!(texts.contains(&"CONTINUE".to_string()), "got: {:?}", texts);
    }

    #[test]
    fn statement_without_label() {
        // No label means the first token should be the identifier X, not a label number.
        let toks = fixed_toks("      X = 42\n");
        let first_meaningful = toks
            .iter()
            .find(|t| {
                !matches!(
                    t.kind,
                    TokenKind::Newline | TokenKind::Eof | TokenKind::Comment
                )
            })
            .unwrap();
        assert_eq!(first_meaningful.kind, TokenKind::Identifier);
        assert_eq!(first_meaningful.text, "X");
    }

    // ---- Column 73+ ignored ----

    #[test]
    fn columns_past_72_ignored() {
        // Columns 73+ should be stripped. Place code in 7-72, junk in 73+.
        // "      X = 42" is 12 chars; +60 spaces puts JUNK at column 73.
        let line = format!("      X = 42{}\n", " ".repeat(60) + "JUNK");
        // Body should be "X = 42" + spaces, NOT including JUNK.
        let texts = fixed_texts(&line);
        assert!(texts.contains(&"X".to_string()));
        assert!(
            !texts.iter().any(|t| t.contains("JUNK")),
            "got: {:?}",
            texts
        );
    }

    // ---- Continuation lines ----

    #[test]
    fn continuation_in_column_6() {
        // '+' in column 6 of the second line continues the first statement.
        let src = "      X = 1 +\n     + 2\n";
        let kinds = fixed_kinds(src);
        assert!(kinds.contains(&TokenKind::Plus));
        // Should have both integer literals.
        let int_count = kinds
            .iter()
            .filter(|k| **k == TokenKind::IntegerLiteral)
            .count();
        assert_eq!(int_count, 2, "expected 2 integer literals, got {:?}", kinds);
    }

    #[test]
    fn continuation_dollar_sign() {
        // Any non-space, non-zero character in column 6 is continuation.
        let src = "      X = 1 +\n     $ 2\n";
        let kinds = fixed_kinds(src);
        let int_count = kinds
            .iter()
            .filter(|k| **k == TokenKind::IntegerLiteral)
            .count();
        assert_eq!(int_count, 2);
    }

    // ---- Tab-form extension ----

    #[test]
    fn tab_form_statement() {
        // Leading tab replaces the column layout: body starts after the tab.
        let src = "\tX = 42\n";
        let texts = fixed_texts(src);
        assert!(texts.contains(&"X".to_string()));
        assert!(texts.contains(&"42".to_string()));
    }

    #[test]
    fn tab_form_continuation() {
        // Tab followed by digit 1-9 is continuation.
        let src = "\tX = 1 +\n\t1 2\n";
        let kinds = fixed_kinds(src);
        let int_count = kinds
            .iter()
            .filter(|k| **k == TokenKind::IntegerLiteral)
            .count();
        assert_eq!(int_count, 2, "got: {:?}", kinds);
    }

    // ---- Simple programs ----

    #[test]
    fn simple_fixed_form_program() {
        // End-to-end smoke test over a tiny but complete program: comment
        // line, labeled statement, DO loop, and program unit delimiters.
        let src = "\
C Hello World
      PROGRAM HELLO
      INTEGER I
      DO 10 I = 1, 10
      WRITE(*,*) I
   10 CONTINUE
      STOP
      END
";
        let tokens = tokenize_fixed(src, 0).unwrap();
        let ident_count = tokens
            .iter()
            .filter(|t| t.kind == TokenKind::Identifier)
            .count();
        assert!(
            ident_count >= 8,
            "expected 8+ identifiers, got {}",
            ident_count
        );

        // Should have a label "10".
        assert!(tokens
            .iter()
            .any(|t| t.kind == TokenKind::IntegerLiteral && t.text == "10"));
    }

    // ---- Mode detection ----

    #[test]
    fn detect_free_form() {
        // Modern extensions (.f90 and later) select free-form lexing.
        use super::super::detect_source_form;
        assert_eq!(
            detect_source_form("test.f90"),
            super::super::SourceForm::FreeForm
        );
        assert_eq!(
            detect_source_form("test.f95"),
            super::super::SourceForm::FreeForm
        );
        assert_eq!(
            detect_source_form("test.f03"),
            super::super::SourceForm::FreeForm
        );
        assert_eq!(
            detect_source_form("test.f08"),
            super::super::SourceForm::FreeForm
        );
        assert_eq!(
            detect_source_form("test.f18"),
            super::super::SourceForm::FreeForm
        );
    }

    #[test]
    fn detect_fixed_form() {
        // Legacy extensions (.f/.for/.ftn) select fixed-form lexing.
        use super::super::detect_source_form;
        assert_eq!(
            detect_source_form("test.f"),
            super::super::SourceForm::FixedForm
        );
        assert_eq!(
            detect_source_form("test.for"),
            super::super::SourceForm::FixedForm
        );
        assert_eq!(
            detect_source_form("test.ftn"),
            super::super::SourceForm::FixedForm
        );
    }

    // ---- Unified token stream ----

    #[test]
    fn fixed_and_free_produce_same_tokens() {
        // The two lexers must agree on equivalent input: same statements,
        // same TokenKind sequence.
        let free_src = "integer :: x\nx = 42\n";
        let fixed_src = "      integer :: x\n      x = 42\n";

        let free_kinds: Vec<_> = super::super::Lexer::tokenize(free_src, 0)
            .unwrap()
            .into_iter()
            .map(|t| t.kind)
            .filter(|k| !matches!(k, TokenKind::Eof | TokenKind::Newline))
            .collect();

        let fixed_kinds = fixed_kinds(fixed_src);

        assert_eq!(
            free_kinds, fixed_kinds,
            "free-form and fixed-form produced different tokens:\n  free: {:?}\n  fixed: {:?}",
            free_kinds, fixed_kinds
        );
    }

    // ---- Blank lines ----

    #[test]
    fn blank_lines_handled() {
        // A blank line between two statements terminates the first; both
        // identifiers must still come through.
        let src = "      X = 1\n\n      Y = 2\n";
        let kinds = fixed_kinds(src);
        assert!(
            kinds
                .iter()
                .filter(|k| **k == TokenKind::Identifier)
                .count()
                >= 2
        );
    }

    // ---- Hollerith ----

    #[test]
    fn hollerith_protect_converts_to_string() {
        // nH<chars> is rewritten to a quoted string literal.
        assert_eq!(protect_hollerith("3HABC"), "'ABC'");
        assert_eq!(protect_hollerith("6HFOOBAR"), "'FOOBAR'");
    }

    #[test]
    fn hollerith_with_spaces_preserved() {
        // 6H HELLO has a leading space — must be preserved.
        assert_eq!(protect_hollerith("6H HELLO"), "' HELLO'");
    }

    #[test]
    fn hollerith_not_after_letter() {
        // X3HABC — the 3H is preceded by a letter, so it's NOT a Hollerith.
        assert_eq!(protect_hollerith("X3HABC"), "X3HABC");
    }

    #[test]
    fn hollerith_after_operator() {
        // =3HABC — preceded by =, not a letter, so IS a Hollerith.
        assert_eq!(protect_hollerith("=3HABC"), "='ABC'");
    }

    // ---- Real fixed-form files from refs ----
    // These tests are best-effort: they silently pass when the reference
    // checkouts are not present.

    #[test]
    fn tokenize_flang_fixed_form_test() {
        let path = concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/../.refs/llvm/flang/test/Driver/Inputs/fixed-form-test.f"
        );
        if !std::path::Path::new(path).exists() {
            return;
        }
        let src = std::fs::read_to_string(path).unwrap();
        let tokens = tokenize_fixed(&src, 0);
        assert!(tokens.is_ok(), "failed: {:?}", tokens.err());
    }

    #[test]
    fn tokenize_gcc_nested_forall() {
        let path = concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/../.refs/gcc/gcc/testsuite/gfortran.dg/nested_forall_1.f"
        );
        if !std::path::Path::new(path).exists() {
            return;
        }
        let src = std::fs::read_to_string(path).unwrap();
        let tokens = tokenize_fixed(&src, 0);
        assert!(tokens.is_ok(), "failed: {:?}", tokens.err());
        let toks = tokens.unwrap();
        assert!(toks.len() > 50, "expected 50+ tokens, got {}", toks.len());
    }

    // ======================================================================
    // Whitespace insensitivity tests — the core challenge of fixed-form
    // ======================================================================

    #[test]
    fn whitespace_stripped_goto() {
        // GO TO 100 collapses to GOTO100 in fixed-form source.
        let texts = fixed_texts("      GOTO100\n");
        assert_eq!(texts, vec!["GOTO", "100"], "got: {:?}", texts);
    }

    #[test]
    fn whitespace_stripped_integer_decl() {
        // INTEGER I collapses to INTEGERI and must still parse as a declaration.
        let texts = fixed_texts("      INTEGERI\n");
        assert_eq!(texts, vec!["INTEGER", "I"], "got: {:?}", texts);
    }

    #[test]
    fn whitespace_stripped_doubleprecision() {
        // DOUBLE PRECISION X collapses to DOUBLEPRECISIONX.
        let texts = fixed_texts("      DOUBLEPRECISIONX\n");
        assert_eq!(texts, vec!["DOUBLEPRECISION", "X"], "got: {:?}", texts);
    }

    #[test]
    fn whitespace_stripped_program_name() {
        // PROGRAM keyword must be split off the program name.
        let texts = fixed_texts("      PROGRAMHELLO\n");
        assert_eq!(texts, vec!["PROGRAM", "HELLO"], "got: {:?}", texts);
    }

    #[test]
    fn whitespace_stripped_typed_function() {
        // Two keywords plus a name plus an argument list, all run together.
        let texts = fixed_texts("      INTEGERFUNCTIONF(X)\n");
        assert_eq!(texts, vec!["INTEGER", "FUNCTION", "F", "(", "X", ")"]);
    }

    #[test]
    fn index_not_broken() {
        // INDEX must NOT be split into IN+DEX — this was the showstopper bug.
        let _kinds = fixed_kinds("      X=INDEX(A,'B')\n");
        let texts = fixed_texts("      X=INDEX(A,'B')\n");
        assert!(
            texts.contains(&"INDEX".to_string()),
            "INDEX was incorrectly split, got: {:?}",
            texts
        );
    }

    #[test]
    fn include_not_broken() {
        // INCLUDE must not become IN+CLUDE.
        let texts = fixed_texts("      INCLUDEVAR=1\n");
        assert_eq!(texts[0], "INCLUDEVAR", "got: {:?}", texts);
    }

    #[test]
    fn if_ident_not_broken() {
        // IFLAG must not become IF+LAG.
        let texts = fixed_texts("      IFLAG=1\n");
        assert_eq!(texts[0], "IFLAG", "got: {:?}", texts);
    }

    #[test]
    fn whitespace_stripped_assignment() {
        // X=42 → identifier, =, integer
        let kinds = fixed_kinds("      X=42\n");
        assert_eq!(
            kinds,
            vec![
                TokenKind::Identifier,
                TokenKind::Assign,
                TokenKind::IntegerLiteral,
            ]
        );
    }

    #[test]
    fn whitespace_stripped_expression() {
        // A+B*C → identifier, +, identifier, *, identifier
        let kinds = fixed_kinds("      A+B*C\n");
        assert_eq!(
            kinds,
            vec![
                TokenKind::Identifier,
                TokenKind::Plus,
                TokenKind::Identifier,
                TokenKind::Star,
                TokenKind::Identifier,
            ]
        );
    }

    #[test]
    fn whitespace_stripped_with_parens() {
        // X=REAL(I) → identifier, =, identifier, (, identifier, )
        let kinds = fixed_kinds("      X=REAL(I)\n");
        assert_eq!(
            kinds,
            vec![
                TokenKind::Identifier,
                TokenKind::Assign,
                TokenKind::Identifier,
                TokenKind::LParen,
                TokenKind::Identifier,
                TokenKind::RParen,
            ]
        );
    }

    #[test]
    fn whitespace_stripped_dot_op() {
        // A.AND.B → identifier, .and., identifier
        let kinds = fixed_kinds("      A.AND.B\n");
        assert_eq!(
            kinds,
            vec![
                TokenKind::Identifier,
                TokenKind::DotOp("and".into()),
                TokenKind::Identifier,
            ]
        );
    }

    #[test]
    fn whitespace_stripped_real_literal() {
        // X=1.0D0 → identifier, =, real (the dot belongs to the literal,
        // not a dot-operator).
        let kinds = fixed_kinds("      X=1.0D0\n");
        assert_eq!(
            kinds,
            vec![
                TokenKind::Identifier,
                TokenKind::Assign,
                TokenKind::RealLiteral,
            ]
        );
    }

    #[test]
    fn whitespace_stripped_comparison() {
        // I.EQ.1 → the .EQ. must lex as a dot-operator even with no spaces.
        let kinds = fixed_kinds("      IF(I.EQ.1)STOP\n");
        assert!(
            kinds.contains(&TokenKind::DotOp("eq".into())),
            "got: {:?}",
            kinds
        );
    }

    #[test]
    fn whitespace_stripped_string_preserved() {
        // Whitespace INSIDE strings must be preserved.
        let kinds = fixed_kinds("      X='HELLO WORLD'\n");
        assert!(kinds.contains(&TokenKind::StringLiteral));
        let texts = fixed_texts("      X='HELLO WORLD'\n");
        assert!(
            texts.iter().any(|t| t.contains("HELLO WORLD")),
            "got: {:?}",
            texts
        );
    }

    // ---- Continuation over blank lines ----

    #[test]
    fn continuation_over_blank_line() {
        // A blank line followed by a continuation line does not end the
        // statement.
        let src = "      X = 1 +\n\n     + 2\n";
        let kinds = fixed_kinds(src);
        let int_count = kinds
            .iter()
            .filter(|k| **k == TokenKind::IntegerLiteral)
            .count();
        assert_eq!(
            int_count, 2,
            "blank line should not break continuation, got: {:?}",
            kinds
        );
    }

    // ---- DO/assignment ambiguity ----
    // Classic fixed-form ambiguity: "DO10I=1,10" is a loop, "DO10I=1.10"
    // is an assignment to the variable DO10I. The comma disambiguates.

    #[test]
    fn do_loop_with_comma() {
        // DO10I=1,10 → DO loop: DO + 10 + I + = + 1 + , + 10
        let kinds = fixed_kinds("      DO10I=1,10\n");
        assert!(
            kinds.contains(&TokenKind::Comma),
            "DO loop must have comma, got: {:?}",
            kinds
        );
        let texts = fixed_texts("      DO10I=1,10\n");
        assert_eq!(
            texts[0], "DO",
            "first token should be DO keyword, got: {:?}",
            texts
        );
    }

    #[test]
    fn do_assignment_no_comma() {
        // DO10I=1.10 → assignment: DO10I + = + 1.10 (no comma → not a loop)
        let kinds = fixed_kinds("      DO10I=1.10\n");
        assert!(
            !kinds.contains(&TokenKind::Comma),
            "assignment should have no comma, got: {:?}",
            kinds
        );
        let texts = fixed_texts("      DO10I=1.10\n");
        assert_eq!(
            texts[0], "DO10I",
            "should be single identifier, got: {:?}",
            texts
        );
    }

    #[test]
    fn do_assignment_no_comma_integer() {
        // DO10I=1 → assignment (no comma)
        let kinds = fixed_kinds("      DO10I=1\n");
        assert!(!kinds.contains(&TokenKind::Comma));
        let texts = fixed_texts("      DO10I=1\n");
        assert_eq!(texts[0], "DO10I");
    }

    // ---- BOZ in fixed-form ----

    #[test]
    fn boz_in_fixed_form() {
        // Binary BOZ literal: B'1010'.
        let kinds = fixed_kinds("      X=B'1010'\n");
        assert!(kinds.contains(&TokenKind::BozLiteral), "got: {:?}", kinds);
    }

    #[test]
    fn boz_hex_in_fixed_form() {
        // Hexadecimal BOZ literal: Z'FF'.
        let kinds = fixed_kinds("      X=Z'FF'\n");
        assert!(kinds.contains(&TokenKind::BozLiteral), "got: {:?}", kinds);
    }

    // ---- Hollerith integration ----

    #[test]
    fn hollerith_in_source() {
        // 3HABC in a statement should produce a string literal.
        let kinds = fixed_kinds("      X=3HABC\n");
        assert!(
            kinds.contains(&TokenKind::StringLiteral),
            "got: {:?}",
            kinds
        );
        let texts = fixed_texts("      X=3HABC\n");
        assert!(
            texts.iter().any(|t| t.contains("ABC")),
            "Hollerith content missing, got: {:?}",
            texts
        );
    }

    #[test]
    fn hollerith_with_spaces_in_source() {
        // 6H HELLO preserves the space.
        let texts = fixed_texts("      X=6H HELLO\n");
        assert!(
            texts.iter().any(|t| t.contains(" HELLO")),
            "space lost, got: {:?}",
            texts
        );
    }

    #[test]
    fn hollerith_zero_length() {
        // 0H should produce empty string literal.
        assert_eq!(protect_hollerith("=0H+"), "=''+");
    }

    // ---- String in fixed-form ----

    #[test]
    fn string_literal_in_fixed_form() {
        // Doubled quote inside a string is the F77 escape for a quote.
        let kinds = fixed_kinds("      X = 'IT''S'\n");
        assert!(kinds.contains(&TokenKind::StringLiteral));
        let texts = fixed_texts("      X = 'IT''S'\n");
        assert!(
            texts.iter().any(|t| t.contains("IT''S")),
            "got: {:?}",
            texts
        );
    }

    #[test]
    fn unterminated_string_error() {
        let result = tokenize_fixed("      X = 'UNTERMINATED\n", 0);
        assert!(result.is_err(), "should error on unterminated string");
    }

    #[test]
    fn doublecomplex_keyword() {
        // Squashed keyword form needed by whitespace-insensitive scanning.
        use crate::lexer::is_keyword;
        assert!(is_keyword("doublecomplex").is_some());
        assert!(is_keyword("DOUBLECOMPLEX").is_some());
    }

    #[test]
    fn continue_keyword() {
        // Keyword lookup must be case-insensitive.
        use crate::lexer::is_keyword;
        assert!(is_keyword("continue").is_some());
        assert!(is_keyword("CONTINUE").is_some());
    }
}
| 1724 |