Rust · 52366 bytes Raw Blame History
1 //! Fixed-form (F77) Fortran lexer.
2 //!
3 //! Two-pass approach:
4 //! 1. Preprocess lines: identify comments, extract labels, join continuations,
5 //! strip columns 73+, handle tab-form extension.
6 //! 2. Tokenize each logical statement body, handling whitespace insensitivity
7 //! and Hollerith constants.
8 //!
9 //! Produces the same Token types as the free-form lexer.
10
11 use super::{is_keyword, is_known_dot_op, LexError, Position, Span, Token, TokenKind};
12
13 /// Tokenize fixed-form Fortran source.
14 pub fn tokenize_fixed(src: &str, file_id: u32) -> Result<Vec<Token>, LexError> {
15 let statements = preprocess_lines(src, file_id);
16 let mut tokens = Vec::new();
17
18 for stmt in &statements {
19 match stmt {
20 FixedLine::Comment { text, span } => {
21 tokens.push(Token {
22 kind: TokenKind::Comment,
23 text: text.clone(),
24 span: *span,
25 });
26 tokens.push(Token {
27 kind: TokenKind::Newline,
28 text: "\n".into(),
29 span: *span,
30 });
31 }
32 FixedLine::Statement {
33 label,
34 body,
35 start_line,
36 file_id: fid,
37 } => {
38 // Emit label as integer literal if present.
39 if let Some(label_text) = label {
40 let label_trimmed = label_text.trim();
41 if !label_trimmed.is_empty() {
42 tokens.push(Token {
43 kind: TokenKind::IntegerLiteral,
44 text: label_trimmed.to_string(),
45 span: Span {
46 file_id: *fid,
47 start: Position {
48 line: *start_line,
49 col: 1,
50 },
51 end: Position {
52 line: *start_line,
53 col: 6,
54 },
55 },
56 });
57 }
58 }
59
60 // Tokenize the body with the whitespace-insensitive scanner.
61 let body_tokens = tokenize_body(body, *fid, *start_line)?;
62 tokens.extend(body_tokens);
63
64 tokens.push(Token {
65 kind: TokenKind::Newline,
66 text: "\n".into(),
67 span: Span {
68 file_id: *fid,
69 start: Position {
70 line: *start_line,
71 col: 1,
72 },
73 end: Position {
74 line: *start_line,
75 col: 1,
76 },
77 },
78 });
79 }
80 FixedLine::Blank { span } => {
81 tokens.push(Token {
82 kind: TokenKind::Newline,
83 text: "\n".into(),
84 span: *span,
85 });
86 }
87 }
88 }
89
90 tokens.push(Token {
91 kind: TokenKind::Eof,
92 text: String::new(),
93 span: Span {
94 file_id,
95 start: Position {
96 line: src.lines().count() as u32 + 1,
97 col: 1,
98 },
99 end: Position {
100 line: src.lines().count() as u32 + 1,
101 col: 1,
102 },
103 },
104 });
105
106 Ok(tokens)
107 }
108
/// Render a one-line diagnostic for the byte at `pos` in `text`.
/// An out-of-range position is shown as a literal '?'.
fn unexpected_char_message(text: &str, pos: usize, context: &str) -> String {
    let shown = match text.as_bytes().get(pos) {
        Some(&b) => b as char,
        None => '?',
    };
    format!("{context}: '{shown}'")
}
113
114 // ---- Whitespace-insensitive body tokenizer ----
115
/// Tokenize a fixed-form statement body with whitespace insensitivity.
///
/// Three-phase approach:
/// 1. Protect Hollerith constants (nH...) by converting to string literals before stripping
/// 2. Strip all whitespace outside string literals
/// 3. Tokenize with keyword-splitting: longest keyword prefix match at letter runs
///
/// NOTE(review): reported columns are offsets into the *stripped* body plus 7,
/// so they only match real source columns when the body had no interior
/// spaces before the token — approximate positions for diagnostics.
fn tokenize_body(body: &str, file_id: u32, line: u32) -> Result<Vec<Token>, LexError> {
    // Phase 1: Convert Hollerith constants to string literals (preserves their spaces).
    let hollerith_protected = protect_hollerith(body);
    // Phase 2: Strip whitespace outside string literals.
    let stripped = strip_whitespace_outside_strings(&hollerith_protected);
    let bytes = stripped.as_bytes();
    let mut tokens = Vec::new();
    let mut pos = 0;

    // Phase 3: dispatch on the leading byte; each helper returns the token
    // and how many bytes of `stripped` it consumed.
    while pos < bytes.len() {
        let col = (pos as u32) + 7;
        let start = Position { line, col };
        let ch = bytes[pos];

        // Comment (! to end). Consumes the rest of the body, so we can stop.
        if ch == b'!' {
            tokens.push(Token {
                kind: TokenKind::Comment,
                text: stripped[pos..].to_string(),
                span: Span {
                    file_id,
                    start,
                    end: Position {
                        line,
                        col: col + (bytes.len() - pos) as u32,
                    },
                },
            });
            break;
        }

        // String literal.
        if ch == b'\'' || ch == b'"' {
            let (tok, consumed) = lex_fixed_string(&stripped, pos, file_id, line)?;
            tokens.push(tok);
            pos += consumed;
            continue;
        }

        // Dot-operator or real starting with dot: `.5` is a number,
        // `.EQ.` / `.TRUE.` / `.myop.` go to the dot-op lexer.
        if ch == b'.' {
            if pos + 1 < bytes.len() && bytes[pos + 1].is_ascii_digit() {
                let (tok, consumed) = lex_fixed_number(&stripped, pos, file_id, line);
                tokens.push(tok);
                pos += consumed;
            } else {
                let (tok, consumed) = lex_fixed_dot_op(&stripped, pos, file_id, line)?;
                tokens.push(tok);
                pos += consumed;
            }
            continue;
        }

        // Number (integer or real).
        if ch.is_ascii_digit() {
            let (tok, consumed) = lex_fixed_number(&stripped, pos, file_id, line);
            tokens.push(tok);
            pos += consumed;
            continue;
        }

        // BOZ literal: B/O/Z followed by quote (checked here so the letter
        // is not swallowed by the identifier path below).
        if matches!(ch, b'B' | b'b' | b'O' | b'o' | b'Z' | b'z')
            && pos + 1 < bytes.len()
            && matches!(bytes[pos + 1], b'\'' | b'"')
        {
            let (tok, consumed) = lex_fixed_boz(&stripped, pos, file_id, line)?;
            tokens.push(tok);
            pos += consumed;
            continue;
        }

        // Letter — keyword or identifier with fixed-form prefix splitting.
        // Needs the tokens emitted so far to decide whether splitting is legal.
        if ch.is_ascii_alphabetic() || ch == b'_' {
            let (tok, consumed) =
                lex_fixed_ident_or_keyword(&stripped, pos, file_id, line, &tokens);
            tokens.push(tok);
            pos += consumed;
            continue;
        }

        // Operators and punctuation (errors on anything unrecognized).
        let (tok, consumed) = lex_fixed_punct(&stripped, pos, file_id, line)?;
        tokens.push(tok);
        pos += consumed;
    }

    Ok(tokens)
}
211
/// Convert Hollerith constants (nH...) to quoted string literals BEFORE whitespace stripping.
/// This preserves spaces inside Hollerith content: `6H HELLO` → `' HELLO'`.
///
/// Single quotes inside the Hollerith payload are doubled (`3HA'B` →
/// `'A''B'`) so the synthetic literal stays well-formed for the later
/// string-aware phases, which treat a doubled quote as an escape. Without
/// the doubling, an embedded quote produced an unbalanced literal and
/// corrupted the rest of the statement.
fn protect_hollerith(body: &str) -> String {
    let bytes = body.as_bytes();
    let mut result = String::with_capacity(body.len());
    let mut i = 0;

    while i < bytes.len() {
        // Inside a string literal: copy verbatim (doubled-quote escapes included),
        // so digits+H inside a string are never mistaken for Hollerith.
        if bytes[i] == b'\'' || bytes[i] == b'"' {
            let quote = bytes[i];
            result.push(bytes[i] as char);
            i += 1;
            while i < bytes.len() {
                result.push(bytes[i] as char);
                if bytes[i] == quote {
                    i += 1;
                    if i < bytes.len() && bytes[i] == quote {
                        result.push(bytes[i] as char);
                        i += 1;
                    } else {
                        break;
                    }
                } else {
                    i += 1;
                }
            }
            continue;
        }

        // Check for Hollerith: digits followed by H, not preceded by a letter/digit
        // (so an identifier tail like `X1H` is left alone).
        if bytes[i].is_ascii_digit() {
            let preceded_by_alnum =
                i > 0 && (bytes[i - 1].is_ascii_alphanumeric() || bytes[i - 1] == b'_');
            if !preceded_by_alnum {
                let digit_start = i;
                while i < bytes.len() && bytes[i].is_ascii_digit() {
                    i += 1;
                }
                if i < bytes.len() && (bytes[i] == b'H' || bytes[i] == b'h') {
                    if let Ok(count) = body[digit_start..i].parse::<usize>() {
                        i += 1; // skip H
                        if i + count <= bytes.len() {
                            // Replace nH... with '...', doubling embedded
                            // single quotes so the literal parses correctly.
                            result.push('\'');
                            for &b in &bytes[i..i + count] {
                                result.push(b as char);
                                if b == b'\'' {
                                    result.push('\'');
                                }
                            }
                            result.push('\'');
                            i += count;
                            continue;
                        }
                    }
                }
                // Not Hollerith — put the digits back.
                result.push_str(&body[digit_start..i]);
                continue;
            }
        }

        result.push(bytes[i] as char);
        i += 1;
    }
    result
}
275
/// Strip whitespace from body text, preserving content inside string literals.
fn strip_whitespace_outside_strings(body: &str) -> String {
    let src = body.as_bytes();
    let mut out = String::with_capacity(body.len());
    let mut idx = 0;

    while idx < src.len() {
        let b = src[idx];
        match b {
            // A quoted literal: copy verbatim, honoring doubled-quote escapes.
            b'\'' | b'"' => {
                out.push(b as char);
                idx += 1;
                while idx < src.len() {
                    let c = src[idx];
                    out.push(c as char);
                    idx += 1;
                    if c == b {
                        if idx < src.len() && src[idx] == b {
                            // Doubled quote: escaped quote, stay in the literal.
                            out.push(src[idx] as char);
                            idx += 1;
                        } else {
                            break;
                        }
                    }
                }
            }
            // Blanks and tabs outside strings are discarded entirely.
            b' ' | b'\t' => idx += 1,
            _ => {
                out.push(b as char);
                idx += 1;
            }
        }
    }
    out
}
309
310 /// Lex a string literal in whitespace-stripped body.
311 fn lex_fixed_string(
312 text: &str,
313 pos: usize,
314 file_id: u32,
315 line: u32,
316 ) -> Result<(Token, usize), LexError> {
317 let bytes = text.as_bytes();
318 let quote = bytes[pos];
319 let mut end = pos + 1;
320 let mut tok_text = String::new();
321 tok_text.push(quote as char);
322
323 let mut closed = false;
324 while end < bytes.len() {
325 tok_text.push(bytes[end] as char);
326 if bytes[end] == quote {
327 end += 1;
328 if end < bytes.len() && bytes[end] == quote {
329 tok_text.push(bytes[end] as char);
330 end += 1;
331 } else {
332 closed = true;
333 break;
334 }
335 } else {
336 end += 1;
337 }
338 }
339
340 if !closed {
341 let col = (pos as u32) + 7;
342 return Err(LexError {
343 span: Span {
344 file_id,
345 start: Position { line, col },
346 end: Position { line, col },
347 },
348 msg: "unterminated string literal in fixed-form body".into(),
349 });
350 }
351
352 let col = (pos as u32) + 7;
353 Ok((
354 Token {
355 kind: TokenKind::StringLiteral,
356 text: tok_text,
357 span: Span {
358 file_id,
359 start: Position { line, col },
360 end: Position {
361 line,
362 col: col + (end - pos) as u32,
363 },
364 },
365 },
366 end - pos,
367 ))
368 }
369
370 /// Lex a dot-operator (.AND., .EQ., .TRUE., .myop.) in whitespace-stripped body.
371 fn lex_fixed_dot_op(
372 text: &str,
373 pos: usize,
374 file_id: u32,
375 line: u32,
376 ) -> Result<(Token, usize), LexError> {
377 let bytes = text.as_bytes();
378 let mut end = pos + 1; // skip first dot
379 let mut name = String::new();
380
381 while end < bytes.len() && (bytes[end].is_ascii_alphabetic() || bytes[end] == b'_') {
382 name.push(bytes[end] as char);
383 end += 1;
384 }
385
386 if end < bytes.len() && bytes[end] == b'.' {
387 end += 1; // closing dot
388 }
389
390 let lower = name.to_lowercase();
391 let col = (pos as u32) + 7;
392 let tok_text = format!(".{}.", name);
393 let span = Span {
394 file_id,
395 start: Position { line, col },
396 end: Position {
397 line,
398 col: col + (end - pos) as u32,
399 },
400 };
401
402 if lower == "true" || lower == "false" {
403 // Check for kind suffix.
404 let mut full_text = tok_text;
405 if end < bytes.len() && bytes[end] == b'_' {
406 full_text.push('_');
407 end += 1;
408 while end < bytes.len() && (bytes[end].is_ascii_alphanumeric() || bytes[end] == b'_') {
409 full_text.push(bytes[end] as char);
410 end += 1;
411 }
412 }
413 return Ok((
414 Token {
415 kind: TokenKind::LogicalLiteral,
416 text: full_text,
417 span,
418 },
419 end - pos,
420 ));
421 }
422
423 let kind = if is_known_dot_op(&lower) {
424 TokenKind::DotOp(lower)
425 } else {
426 TokenKind::DefinedOp(name.to_lowercase())
427 };
428
429 Ok((
430 Token {
431 kind,
432 text: tok_text,
433 span,
434 },
435 end - pos,
436 ))
437 }
438
/// Lex a number (integer or real) in whitespace-stripped body.
///
/// Grammar handled: `digits [. digits] [e|E|d|D [+|-] digits] [_kind]`,
/// and the leading-dot form (`.5`) when the caller positions `pos` at the
/// dot. Returns an `IntegerLiteral` or `RealLiteral` token plus the number
/// of bytes consumed. Infallible: the caller guarantees `pos` is at a digit
/// or at a dot followed by a digit.
fn lex_fixed_number(text: &str, pos: usize, file_id: u32, line: u32) -> (Token, usize) {
    let bytes = text.as_bytes();
    let mut end = pos;
    let mut is_real = false;
    let mut tok_text = String::new();

    // Leading digits.
    while end < bytes.len() && bytes[end].is_ascii_digit() {
        tok_text.push(bytes[end] as char);
        end += 1;
    }

    // Decimal point — but not if followed by letter (dot-op like .EQ.).
    if end < bytes.len() && bytes[end] == b'.' {
        let after_dot = if end + 1 < bytes.len() {
            bytes[end + 1]
        } else {
            0
        };
        // The dot belongs to the number when it is followed by a digit,
        // when it is the leading dot of `.5`, or when it introduces an
        // exponent / terminates the literal — but NOT when a letter run
        // follows (that's `5.EQ.x` style, where the dot starts an operator).
        let dot_is_numeric = after_dot.is_ascii_digit()
            || tok_text.is_empty() // leading dot
            || {
                // Check for exponent: .e5 vs .eq.
                if matches!(after_dot, b'e' | b'E' | b'd' | b'D') {
                    let after_ed = if end + 2 < bytes.len() { bytes[end + 2] } else { 0 };
                    matches!(after_ed, b'0'..=b'9' | b'+' | b'-')
                } else {
                    !after_dot.is_ascii_alphabetic() // 5. followed by op/end
                }
            };

        if dot_is_numeric {
            is_real = true;
            tok_text.push(bytes[end] as char);
            end += 1;
            while end < bytes.len() && bytes[end].is_ascii_digit() {
                tok_text.push(bytes[end] as char);
                end += 1;
            }
        }
    }

    // Exponent — only consume e/d if followed by digit or +/- then digit.
    // This prevents `10DO` from being lexed as real `10D` + identifier `O`.
    if end < bytes.len() && matches!(bytes[end], b'e' | b'E' | b'd' | b'D') {
        let after_ed = if end + 1 < bytes.len() {
            bytes[end + 1]
        } else {
            0
        };
        let has_exponent_digits = after_ed.is_ascii_digit()
            || (matches!(after_ed, b'+' | b'-')
                && end + 2 < bytes.len()
                && bytes[end + 2].is_ascii_digit());

        if has_exponent_digits {
            is_real = true;
            tok_text.push(bytes[end] as char);
            end += 1;
            if end < bytes.len() && matches!(bytes[end], b'+' | b'-') {
                tok_text.push(bytes[end] as char);
                end += 1;
            }
            while end < bytes.len() && bytes[end].is_ascii_digit() {
                tok_text.push(bytes[end] as char);
                end += 1;
            }
        }
    }

    // Kind suffix (e.g. `1_int64`): underscore plus alphanumeric run.
    if end < bytes.len() && bytes[end] == b'_' {
        tok_text.push(bytes[end] as char);
        end += 1;
        while end < bytes.len() && (bytes[end].is_ascii_alphanumeric() || bytes[end] == b'_') {
            tok_text.push(bytes[end] as char);
            end += 1;
        }
    }

    let col = (pos as u32) + 7;
    let kind = if is_real {
        TokenKind::RealLiteral
    } else {
        TokenKind::IntegerLiteral
    };
    (
        Token {
            kind,
            text: tok_text,
            span: Span {
                file_id,
                start: Position { line, col },
                end: Position {
                    line,
                    col: col + (end - pos) as u32,
                },
            },
        },
        end - pos,
    )
}
542
/// Lex an identifier or keyword in whitespace-stripped fixed-form body.
///
/// Fixed-form removes spaces from the statement body, so common source like
/// `PROGRAM HELLO` and `INTEGER I, N` reaches us as `PROGRAMHELLO` and
/// `INTEGERI,N`. The parser does not have enough context to recover those
/// boundaries reliably from a single opaque identifier token, so the fixed-form
/// lexer splits a small set of keyword prefixes when we are at a statement
/// boundary or another keyword-following context.
///
/// The DO/assignment ambiguity still needs special handling before the generic
/// prefix splitter because `DO10I=1,10` is a loop while `DO10I=1.10` is an
/// assignment.
fn lex_fixed_ident_or_keyword(
    text: &str,
    pos: usize,
    file_id: u32,
    line: u32,
    prior_tokens: &[Token],
) -> (Token, usize) {
    // Gather the maximal alphanumeric/underscore run starting at `pos`.
    let bytes = text.as_bytes();
    let mut run_end = pos;
    while run_end < bytes.len()
        && (bytes[run_end].is_ascii_alphanumeric() || bytes[run_end] == b'_')
    {
        run_end += 1;
    }
    let run = &text[pos..run_end];
    let run_lower = run.to_lowercase();

    // DO/assignment ambiguity: if the run starts with "do" followed by digits,
    // check if this is a DO loop (has comma after =) or an assignment.
    if run_lower.starts_with("do")
        && run.len() > 2
        && run.as_bytes()[2].is_ascii_digit()
        && is_do_loop_context(text, pos + 2)
    {
        // IS a DO loop — emit just "DO" (2 chars). Subsequent calls
        // will pick up the label (digits) and variable (letters) separately.
        let col = (pos as u32) + 7;
        return (
            Token {
                kind: TokenKind::Identifier,
                text: run[..2].to_string(),
                span: Span {
                    file_id,
                    start: Position { line, col },
                    end: Position { line, col: col + 2 },
                },
            },
            2,
        );
    }

    // Generic case: try to peel a known keyword prefix off the run
    // (e.g. "INTEGERI" → "INTEGER" now, "I" on the next call).
    if let Some(prefix_len) = split_fixed_keyword_prefix(text, pos, run, prior_tokens) {
        return make_ident_token(&run[..prefix_len], pos, file_id, line);
    }

    // Emit the entire alphanumeric run as one identifier.
    make_ident_token(run, pos, file_id, line)
}
603
/// Try to split a known keyword prefix off an alphanumeric run.
///
/// Returns the length of the prefix to emit as its own token, or `None` to
/// keep the run whole. Only runs longer than 4 characters are considered
/// (the shortest splittable keywords are 4 chars), and only when the prior
/// token allows a keyword here (see [`allow_fixed_keyword_split`]).
fn split_fixed_keyword_prefix(
    text: &str,
    pos: usize,
    run: &str,
    prior_tokens: &[Token],
) -> Option<usize> {
    if !allow_fixed_keyword_split(prior_tokens) || run.len() <= 4 {
        return None;
    }

    // A run immediately followed by '=' or '%' is an assignment target or a
    // derived-type component access — never split a keyword out of it.
    let trailing = text.as_bytes().get(pos + run.len()).copied();
    if matches!(trailing, Some(b'=') | Some(b'%')) {
        return None;
    }

    // Longest-prefix-first: try the biggest keyword prefix and shrink.
    for prefix_len in (4..run.len()).rev() {
        let prefix = &run[..prefix_len];
        let prefix_lower = prefix.to_ascii_lowercase();
        let suffix = &run[prefix_len..];
        // Non-empty by construction: prefix_len < run.len().
        let suffix_first = suffix.as_bytes()[0];

        // "endtype" is fixed-form-only spelling, so it is special-cased
        // alongside the shared keyword table.
        let is_fixed_keyword = prefix_lower == "endtype" || is_keyword(prefix).is_some();
        if !is_fixed_keyword {
            continue;
        }

        // A digit right after the keyword usually means the digits belong to
        // the name (e.g. `REAL8X`); only GOTO/CALL may be followed by a
        // statement label number.
        if suffix_first.is_ascii_digit() && !matches!(prefix_lower.as_str(), "goto" | "call") {
            continue;
        }

        return Some(prefix_len);
    }

    None
}
639
640 fn allow_fixed_keyword_split(prior_tokens: &[Token]) -> bool {
641 let Some(prev) = prior_tokens.last() else {
642 return true;
643 };
644
645 match prev.kind {
646 TokenKind::Comma | TokenKind::ColonColon => true,
647 TokenKind::Identifier => matches!(
648 prev.text.to_ascii_lowercase().as_str(),
649 "integer"
650 | "real"
651 | "doubleprecision"
652 | "doublecomplex"
653 | "complex"
654 | "character"
655 | "logical"
656 | "type"
657 | "class"
658 | "implicit"
659 | "program"
660 | "module"
661 | "submodule"
662 | "subroutine"
663 | "function"
664 | "entry"
665 | "call"
666 | "pure"
667 | "impure"
668 | "elemental"
669 | "recursive"
670 | "end"
671 | "endtype"
672 ),
673 _ => false,
674 }
675 }
676
677 fn make_ident_token(text: &str, pos: usize, file_id: u32, line: u32) -> (Token, usize) {
678 let col = (pos as u32) + 7;
679 (
680 Token {
681 kind: TokenKind::Identifier,
682 text: text.to_string(),
683 span: Span {
684 file_id,
685 start: Position { line, col },
686 end: Position {
687 line,
688 col: col + text.len() as u32,
689 },
690 },
691 },
692 text.len(),
693 )
694 }
695
696 /// Check if the rest of the statement after DO+digits looks like a DO loop.
697 /// A DO loop has: DO [label] variable = start , end [, step]
698 /// An assignment has: DO[label][var] = expr (no top-level comma after =).
699 fn is_do_loop_context(text: &str, after_do: usize) -> bool {
700 let bytes = text.as_bytes();
701
702 // Find '=' that is not inside strings or parens.
703 let eq_pos = find_top_level_char(bytes, after_do, b'=');
704 let eq_pos = match eq_pos {
705 Some(p) => p,
706 None => return false,
707 };
708
709 // Make sure '=' is not '==' (comparison).
710 if eq_pos + 1 < bytes.len() && bytes[eq_pos + 1] == b'=' {
711 return false;
712 }
713
714 // Check for a top-level comma after the '='.
715 find_top_level_char(bytes, eq_pos + 1, b',').is_some()
716 }
717
/// Find the first occurrence of `target` byte at the top level
/// (not inside parentheses or string literals).
fn find_top_level_char(bytes: &[u8], start: usize, target: u8) -> Option<usize> {
    let mut depth = 0i32;
    let mut i = start;

    while let Some(&b) = bytes.get(i) {
        // String literals are skipped wholesale, honoring ''/"" escapes.
        if b == b'\'' || b == b'"' {
            i += 1;
            loop {
                match bytes.get(i) {
                    None => break,
                    Some(&c) if c == b => {
                        i += 1;
                        if bytes.get(i) == Some(&b) {
                            i += 1; // doubled quote escape; stay in the string
                        } else {
                            break;
                        }
                    }
                    Some(_) => i += 1,
                }
            }
            continue;
        }

        // Parens adjust nesting; the target only counts at depth zero.
        if b == b'(' {
            depth += 1;
        } else if b == b')' {
            depth -= 1;
        } else if b == target && depth == 0 {
            return Some(i);
        }
        i += 1;
    }
    None
}
759
760 /// Lex a BOZ literal in fixed-form body.
761 fn lex_fixed_boz(
762 text: &str,
763 pos: usize,
764 file_id: u32,
765 line: u32,
766 ) -> Result<(Token, usize), LexError> {
767 let bytes = text.as_bytes();
768 let mut end = pos;
769 let mut tok_text = String::new();
770
771 tok_text.push(bytes[end] as char); // B/O/Z
772 end += 1;
773 let quote = bytes[end];
774 tok_text.push(quote as char); // opening quote
775 end += 1;
776
777 while end < bytes.len() && bytes[end] != quote {
778 tok_text.push(bytes[end] as char);
779 end += 1;
780 }
781 if end >= bytes.len() {
782 return Err(LexError {
783 span: Span {
784 file_id,
785 start: Position {
786 line,
787 col: (pos as u32) + 7,
788 },
789 end: Position {
790 line,
791 col: (pos as u32) + 7,
792 },
793 },
794 msg: "unterminated BOZ literal".into(),
795 });
796 }
797 tok_text.push(bytes[end] as char); // closing quote
798 end += 1;
799
800 let col = (pos as u32) + 7;
801 Ok((
802 Token {
803 kind: TokenKind::BozLiteral,
804 text: tok_text,
805 span: Span {
806 file_id,
807 start: Position { line, col },
808 end: Position {
809 line,
810 col: col + (end - pos) as u32,
811 },
812 },
813 },
814 end - pos,
815 ))
816 }
817
/// Lex an operator or punctuation in whitespace-stripped body.
///
/// Two-character operators are matched before their one-character prefixes
/// (`**` before `*`, `==`/`=>` before `=`, etc.). Returns the token and the
/// number of bytes consumed, or an error for an unrecognized byte.
fn lex_fixed_punct(
    text: &str,
    pos: usize,
    file_id: u32,
    line: u32,
) -> Result<(Token, usize), LexError> {
    let bytes = text.as_bytes();
    let ch = bytes[pos];
    // Lookahead byte for two-char operators; 0 (NUL) at end of body so the
    // guards below simply fail to match.
    let next = if pos + 1 < bytes.len() {
        bytes[pos + 1]
    } else {
        0
    };
    let col = (pos as u32) + 7;
    let start = Position { line, col };

    let (kind, tok_text, consumed) = match ch {
        b'+' => (TokenKind::Plus, "+", 1),
        b'-' => (TokenKind::Minus, "-", 1),
        b'*' if next == b'*' => (TokenKind::Power, "**", 2),
        b'*' => (TokenKind::Star, "*", 1),
        b'/' if next == b'/' => (TokenKind::Concat, "//", 2),
        b'/' if next == b'=' => (TokenKind::Ne, "/=", 2),
        b'/' => (TokenKind::Slash, "/", 1),
        b'=' if next == b'=' => (TokenKind::Eq, "==", 2),
        b'=' if next == b'>' => (TokenKind::Arrow, "=>", 2),
        b'=' => (TokenKind::Assign, "=", 1),
        b'<' if next == b'=' => (TokenKind::Le, "<=", 2),
        b'<' => (TokenKind::Lt, "<", 1),
        b'>' if next == b'=' => (TokenKind::Ge, ">=", 2),
        b'>' => (TokenKind::Gt, ">", 1),
        b'(' => (TokenKind::LParen, "(", 1),
        b')' => (TokenKind::RParen, ")", 1),
        b'[' => (TokenKind::LBracket, "[", 1),
        b']' => (TokenKind::RBracket, "]", 1),
        b',' => (TokenKind::Comma, ",", 1),
        b':' if next == b':' => (TokenKind::ColonColon, "::", 2),
        b':' => (TokenKind::Colon, ":", 1),
        b';' => (TokenKind::Semicolon, ";", 1),
        b'%' => (TokenKind::Percent, "%", 1),
        b'&' => (TokenKind::Ampersand, "&", 1),
        _ => {
            return Err(LexError {
                span: Span {
                    file_id,
                    start,
                    end: start,
                },
                msg: unexpected_char_message(text, pos, "unexpected character in fixed-form body"),
            });
        }
    };

    Ok((
        Token {
            kind,
            text: tok_text.into(),
            span: Span {
                file_id,
                start,
                end: Position {
                    line,
                    col: col + consumed as u32,
                },
            },
        },
        consumed,
    ))
}
888
889 // ---- Line preprocessing ----
890
/// One preprocessed logical line of fixed-form source.
enum FixedLine {
    /// A comment line (C/c/*/! in column 1), kept verbatim.
    Comment {
        text: String,
        span: Span,
    },
    /// A statement with all continuation lines already joined into one body.
    Statement {
        /// Raw label field (columns 1-5), present only when non-blank.
        label: Option<String>,
        /// Statement text from columns 7-72 of the initial line plus all
        /// continuation lines, concatenated.
        body: String,
        /// 1-based source line where the statement's initial line sits.
        start_line: u32,
        file_id: u32,
    },
    /// A blank line (terminates any preceding statement).
    Blank {
        span: Span,
    },
}
906
/// Preprocess fixed-form lines: identify comments, extract labels, join
/// continuations, strip columns 73+, handle tab-form.
///
/// Returns one [`FixedLine`] per logical line: comments and blanks pass
/// through individually; a statement absorbs all of its continuation lines
/// (and any comments interleaved between them are emitted separately).
fn preprocess_lines(src: &str, file_id: u32) -> Vec<FixedLine> {
    let lines: Vec<&str> = src.lines().collect();
    let mut result = Vec::new();
    let mut i = 0;

    while i < lines.len() {
        let line = lines[i];
        let line_num = (i + 1) as u32;

        // Blank line.
        if line.trim().is_empty() {
            result.push(FixedLine::Blank {
                span: Span {
                    file_id,
                    start: Position {
                        line: line_num,
                        col: 1,
                    },
                    end: Position {
                        line: line_num,
                        col: 1,
                    },
                },
            });
            i += 1;
            continue;
        }

        let first_byte = line.as_bytes().first().copied().unwrap_or(0);

        // Comment line: C, c, *, or ! in column 1.
        if matches!(first_byte, b'C' | b'c' | b'*' | b'!') {
            result.push(FixedLine::Comment {
                text: line.to_string(),
                span: Span {
                    file_id,
                    start: Position {
                        line: line_num,
                        col: 1,
                    },
                    end: Position {
                        line: line_num,
                        col: line.len() as u32,
                    },
                },
            });
            i += 1;
            continue;
        }

        // Extract columns from this line (label field + body field).
        let (label, body) = extract_fixed_columns(line);

        // Collect continuation lines, appending their bodies to this one.
        let start_line = line_num;
        let mut full_body = body;
        i += 1;

        while i < lines.len() {
            let next = lines[i];

            // Blank lines between continuations: skip them only if the line
            // after the blank is actually a continuation. Otherwise, the blank
            // terminates the statement and should be emitted by the outer loop.
            if next.trim().is_empty() {
                // Peek ahead: is the line after this blank a continuation?
                let lookahead = i + 1;
                if lookahead < lines.len() && is_continuation_line(lines[lookahead]) {
                    i += 1;
                    continue;
                }
                break; // blank line ends the statement
            }

            let next_first = next.as_bytes().first().copied().unwrap_or(0);
            // Comment lines between continuations: skip them.
            if matches!(next_first, b'C' | b'c' | b'*' | b'!') {
                // Emit the comment but don't break the continuation.
                result.push(FixedLine::Comment {
                    text: next.to_string(),
                    span: Span {
                        file_id,
                        start: Position {
                            line: (i + 1) as u32,
                            col: 1,
                        },
                        end: Position {
                            line: (i + 1) as u32,
                            col: next.len() as u32,
                        },
                    },
                });
                i += 1;
                continue;
            }

            // Check column 6 for continuation marker.
            if is_continuation_line(next) {
                let (_, cont_body) = extract_fixed_columns(next);
                full_body.push_str(&cont_body);
                i += 1;
            } else {
                break;
            }
        }

        result.push(FixedLine::Statement {
            // A label of all blanks counts as no label.
            label: if label.trim().is_empty() {
                None
            } else {
                Some(label)
            },
            body: full_body,
            start_line,
            file_id,
        });
    }

    result
}
1029
/// Check if a line is a continuation line (non-space, non-zero in column 6).
fn is_continuation_line(line: &str) -> bool {
    let bytes = line.as_bytes();

    // Tab-form extension: `<TAB><digit 1-9>` marks a continuation; a tab
    // followed by anything else is an initial line regardless of column 6.
    if bytes.first() == Some(&b'\t') {
        return matches!(bytes.get(1), Some(d) if (b'1'..=b'9').contains(d));
    }

    // Standard fixed-form: any character other than blank, '0', or tab in
    // column 6 (byte index 5) marks a continuation.
    match bytes.get(5) {
        Some(&c) => c != b' ' && c != b'0' && c != b'\t',
        None => false,
    }
}
1049
/// Extract label (columns 1-5) and body (columns 7-72) from a fixed-form line.
/// Handles tab-form extension.
fn extract_fixed_columns(line: &str) -> (String, String) {
    let bytes = line.as_bytes();

    // Tab-form: a leading tab replaces the fixed column layout entirely,
    // and tab-form lines never carry a label.
    if bytes.first() == Some(&b'\t') {
        // `<TAB><digit 1-9>` is a continuation: body begins after the digit.
        let body_from = match bytes.get(1) {
            Some(d) if (b'1'..=b'9').contains(d) => 2,
            _ => 1,
        };
        let body = bytes
            .get(body_from..)
            .map(|rest| String::from_utf8_lossy(rest).to_string())
            .unwrap_or_default();
        return (String::new(), body);
    }

    // Standard fixed-form: columns 1-5 label, column 6 continuation marker,
    // columns 7-72 body.
    let label_end = bytes.len().min(5);
    let label = String::from_utf8_lossy(&bytes[..label_end]).to_string();

    let body_start = 6.min(bytes.len());
    let body_end = 72.min(bytes.len()); // columns 73+ are ignored
    let body = if body_start < bytes.len() {
        String::from_utf8_lossy(&bytes[body_start..body_end]).to_string()
    } else {
        String::new()
    };

    (label, body)
}
1094
1095 // ---- Hollerith constants ----
1096
1097 #[cfg(test)]
1098 mod tests {
1099 use super::*;
1100 use crate::lexer::TokenKind;
1101
1102 fn fixed_toks(src: &str) -> Vec<Token> {
1103 tokenize_fixed(src, 0).unwrap()
1104 }
1105
1106 fn fixed_kinds(src: &str) -> Vec<TokenKind> {
1107 fixed_toks(src)
1108 .into_iter()
1109 .map(|t| t.kind)
1110 .filter(|k| !matches!(k, TokenKind::Eof | TokenKind::Newline))
1111 .collect()
1112 }
1113
1114 fn fixed_texts(src: &str) -> Vec<String> {
1115 fixed_toks(src)
1116 .into_iter()
1117 .filter(|t| {
1118 !matches!(
1119 t.kind,
1120 TokenKind::Eof | TokenKind::Newline | TokenKind::Comment
1121 )
1122 })
1123 .map(|t| t.text)
1124 .collect()
1125 }
1126
1127 // ---- Comment detection ----
1128
1129 #[test]
1130 fn comment_c_uppercase() {
1131 let k = fixed_kinds("C This is a comment\n");
1132 assert_eq!(k, vec![TokenKind::Comment]);
1133 }
1134
1135 #[test]
1136 fn comment_c_lowercase() {
1137 let k = fixed_kinds("c This is a comment\n");
1138 assert_eq!(k, vec![TokenKind::Comment]);
1139 }
1140
1141 #[test]
1142 fn comment_star() {
1143 let k = fixed_kinds("* This is a comment\n");
1144 assert_eq!(k, vec![TokenKind::Comment]);
1145 }
1146
1147 #[test]
1148 fn comment_bang() {
1149 let k = fixed_kinds("! This is a comment\n");
1150 assert_eq!(k, vec![TokenKind::Comment]);
1151 }
1152
1153 // ---- Statement labels ----
1154
1155 #[test]
1156 fn statement_with_label() {
1157 // " 10 CONTINUE" — label 10 in columns 1-5, CONTINUE in 7+
1158 let texts = fixed_texts(" 10 CONTINUE\n");
1159 assert!(texts.contains(&"10".to_string()), "got: {:?}", texts);
1160 assert!(texts.contains(&"CONTINUE".to_string()), "got: {:?}", texts);
1161 }
1162
1163 #[test]
1164 fn statement_without_label() {
1165 // No label means the first token should be the identifier X, not a label number.
1166 let toks = fixed_toks(" X = 42\n");
1167 let first_meaningful = toks
1168 .iter()
1169 .find(|t| {
1170 !matches!(
1171 t.kind,
1172 TokenKind::Newline | TokenKind::Eof | TokenKind::Comment
1173 )
1174 })
1175 .unwrap();
1176 assert_eq!(first_meaningful.kind, TokenKind::Identifier);
1177 assert_eq!(first_meaningful.text, "X");
1178 }
1179
1180 // ---- Column 73+ ignored ----
1181
1182 #[test]
1183 fn columns_past_72_ignored() {
1184 // Columns 73+ should be stripped. Place code in 7-72, junk in 73+.
1185 let line = format!(" X = 42{}\n", " ".repeat(60) + "JUNK");
1186 // Body should be "X = 42" + spaces, NOT including JUNK.
1187 let texts = fixed_texts(&line);
1188 assert!(texts.contains(&"X".to_string()));
1189 assert!(
1190 !texts.iter().any(|t| t.contains("JUNK")),
1191 "got: {:?}",
1192 texts
1193 );
1194 }
1195
1196 // ---- Continuation lines ----
1197
1198 #[test]
1199 fn continuation_in_column_6() {
1200 let src = " X = 1 +\n + 2\n";
1201 let kinds = fixed_kinds(src);
1202 assert!(kinds.contains(&TokenKind::Plus));
1203 // Should have both integer literals.
1204 let int_count = kinds
1205 .iter()
1206 .filter(|k| **k == TokenKind::IntegerLiteral)
1207 .count();
1208 assert_eq!(int_count, 2, "expected 2 integer literals, got {:?}", kinds);
1209 }
1210
1211 #[test]
1212 fn continuation_dollar_sign() {
1213 // Any non-space, non-zero character in column 6 is continuation.
1214 let src = " X = 1 +\n $ 2\n";
1215 let kinds = fixed_kinds(src);
1216 let int_count = kinds
1217 .iter()
1218 .filter(|k| **k == TokenKind::IntegerLiteral)
1219 .count();
1220 assert_eq!(int_count, 2);
1221 }
1222
1223 // ---- Tab-form extension ----
1224
1225 #[test]
1226 fn tab_form_statement() {
1227 let src = "\tX = 42\n";
1228 let texts = fixed_texts(src);
1229 assert!(texts.contains(&"X".to_string()));
1230 assert!(texts.contains(&"42".to_string()));
1231 }
1232
1233 #[test]
1234 fn tab_form_continuation() {
1235 // Tab followed by digit 1-9 is continuation.
1236 let src = "\tX = 1 +\n\t1 2\n";
1237 let kinds = fixed_kinds(src);
1238 let int_count = kinds
1239 .iter()
1240 .filter(|k| **k == TokenKind::IntegerLiteral)
1241 .count();
1242 assert_eq!(int_count, 2, "got: {:?}", kinds);
1243 }
1244
1245 // ---- Simple programs ----
1246
1247 #[test]
1248 fn simple_fixed_form_program() {
1249 let src = "\
1250 C Hello World
1251 PROGRAM HELLO
1252 INTEGER I
1253 DO 10 I = 1, 10
1254 WRITE(*,*) I
1255 10 CONTINUE
1256 STOP
1257 END
1258 ";
1259 let tokens = tokenize_fixed(src, 0).unwrap();
1260 let ident_count = tokens
1261 .iter()
1262 .filter(|t| t.kind == TokenKind::Identifier)
1263 .count();
1264 assert!(
1265 ident_count >= 8,
1266 "expected 8+ identifiers, got {}",
1267 ident_count
1268 );
1269
1270 // Should have a label "10".
1271 assert!(tokens
1272 .iter()
1273 .any(|t| t.kind == TokenKind::IntegerLiteral && t.text == "10"));
1274 }
1275
1276 // ---- Mode detection ----
1277
1278 #[test]
1279 fn detect_free_form() {
1280 use super::super::detect_source_form;
1281 assert_eq!(
1282 detect_source_form("test.f90"),
1283 super::super::SourceForm::FreeForm
1284 );
1285 assert_eq!(
1286 detect_source_form("test.f95"),
1287 super::super::SourceForm::FreeForm
1288 );
1289 assert_eq!(
1290 detect_source_form("test.f03"),
1291 super::super::SourceForm::FreeForm
1292 );
1293 assert_eq!(
1294 detect_source_form("test.f08"),
1295 super::super::SourceForm::FreeForm
1296 );
1297 assert_eq!(
1298 detect_source_form("test.f18"),
1299 super::super::SourceForm::FreeForm
1300 );
1301 }
1302
1303 #[test]
1304 fn detect_fixed_form() {
1305 use super::super::detect_source_form;
1306 assert_eq!(
1307 detect_source_form("test.f"),
1308 super::super::SourceForm::FixedForm
1309 );
1310 assert_eq!(
1311 detect_source_form("test.for"),
1312 super::super::SourceForm::FixedForm
1313 );
1314 assert_eq!(
1315 detect_source_form("test.ftn"),
1316 super::super::SourceForm::FixedForm
1317 );
1318 }
1319
1320 // ---- Unified token stream ----
1321
1322 #[test]
1323 fn fixed_and_free_produce_same_tokens() {
1324 let free_src = "integer :: x\nx = 42\n";
1325 let fixed_src = " integer :: x\n x = 42\n";
1326
1327 let free_kinds: Vec<_> = super::super::Lexer::tokenize(free_src, 0)
1328 .unwrap()
1329 .into_iter()
1330 .map(|t| t.kind)
1331 .filter(|k| !matches!(k, TokenKind::Eof | TokenKind::Newline))
1332 .collect();
1333
1334 let fixed_kinds = fixed_kinds(fixed_src);
1335
1336 assert_eq!(
1337 free_kinds, fixed_kinds,
1338 "free-form and fixed-form produced different tokens:\n free: {:?}\n fixed: {:?}",
1339 free_kinds, fixed_kinds
1340 );
1341 }
1342
1343 // ---- Blank lines ----
1344
1345 #[test]
1346 fn blank_lines_handled() {
1347 let src = " X = 1\n\n Y = 2\n";
1348 let kinds = fixed_kinds(src);
1349 assert!(
1350 kinds
1351 .iter()
1352 .filter(|k| **k == TokenKind::Identifier)
1353 .count()
1354 >= 2
1355 );
1356 }
1357
1358 // ---- Hollerith ----
1359
1360 #[test]
1361 fn hollerith_protect_converts_to_string() {
1362 assert_eq!(protect_hollerith("3HABC"), "'ABC'");
1363 assert_eq!(protect_hollerith("6HFOOBAR"), "'FOOBAR'");
1364 }
1365
1366 #[test]
1367 fn hollerith_with_spaces_preserved() {
1368 // 6H HELLO has a leading space — must be preserved.
1369 assert_eq!(protect_hollerith("6H HELLO"), "' HELLO'");
1370 }
1371
1372 #[test]
1373 fn hollerith_not_after_letter() {
1374 // X3HABC — the 3H is preceded by a letter, so it's NOT a Hollerith.
1375 assert_eq!(protect_hollerith("X3HABC"), "X3HABC");
1376 }
1377
1378 #[test]
1379 fn hollerith_after_operator() {
1380 // =3HABC — preceded by =, not a letter, so IS a Hollerith.
1381 assert_eq!(protect_hollerith("=3HABC"), "='ABC'");
1382 }
1383
1384 // ---- Real fixed-form files from refs ----
1385
1386 #[test]
1387 fn tokenize_flang_fixed_form_test() {
1388 let path = concat!(
1389 env!("CARGO_MANIFEST_DIR"),
1390 "/../.refs/llvm/flang/test/Driver/Inputs/fixed-form-test.f"
1391 );
1392 if !std::path::Path::new(path).exists() {
1393 return;
1394 }
1395 let src = std::fs::read_to_string(path).unwrap();
1396 let tokens = tokenize_fixed(&src, 0);
1397 assert!(tokens.is_ok(), "failed: {:?}", tokens.err());
1398 }
1399
1400 #[test]
1401 fn tokenize_gcc_nested_forall() {
1402 let path = concat!(
1403 env!("CARGO_MANIFEST_DIR"),
1404 "/../.refs/gcc/gcc/testsuite/gfortran.dg/nested_forall_1.f"
1405 );
1406 if !std::path::Path::new(path).exists() {
1407 return;
1408 }
1409 let src = std::fs::read_to_string(path).unwrap();
1410 let tokens = tokenize_fixed(&src, 0);
1411 assert!(tokens.is_ok(), "failed: {:?}", tokens.err());
1412 let toks = tokens.unwrap();
1413 assert!(toks.len() > 50, "expected 50+ tokens, got {}", toks.len());
1414 }
1415
1416 // ======================================================================
1417 // Whitespace insensitivity tests — the core challenge of fixed-form
1418 // ======================================================================
1419
1420 #[test]
1421 fn whitespace_stripped_goto() {
1422 // GO TO 100 collapses to GOTO100 in fixed-form source.
1423 let texts = fixed_texts(" GOTO100\n");
1424 assert_eq!(texts, vec!["GOTO", "100"], "got: {:?}", texts);
1425 }
1426
1427 #[test]
1428 fn whitespace_stripped_integer_decl() {
1429 // INTEGER I collapses to INTEGERI and must still parse as a declaration.
1430 let texts = fixed_texts(" INTEGERI\n");
1431 assert_eq!(texts, vec!["INTEGER", "I"], "got: {:?}", texts);
1432 }
1433
1434 #[test]
1435 fn whitespace_stripped_doubleprecision() {
1436 // DOUBLE PRECISION X collapses to DOUBLEPRECISIONX.
1437 let texts = fixed_texts(" DOUBLEPRECISIONX\n");
1438 assert_eq!(texts, vec!["DOUBLEPRECISION", "X"], "got: {:?}", texts);
1439 }
1440
1441 #[test]
1442 fn whitespace_stripped_program_name() {
1443 let texts = fixed_texts(" PROGRAMHELLO\n");
1444 assert_eq!(texts, vec!["PROGRAM", "HELLO"], "got: {:?}", texts);
1445 }
1446
1447 #[test]
1448 fn whitespace_stripped_typed_function() {
1449 let texts = fixed_texts(" INTEGERFUNCTIONF(X)\n");
1450 assert_eq!(texts, vec!["INTEGER", "FUNCTION", "F", "(", "X", ")"]);
1451 }
1452
1453 #[test]
1454 fn index_not_broken() {
1455 // INDEX must NOT be split into IN+DEX — this was the showstopper bug.
1456 let _kinds = fixed_kinds(" X=INDEX(A,'B')\n");
1457 let texts = fixed_texts(" X=INDEX(A,'B')\n");
1458 assert!(
1459 texts.contains(&"INDEX".to_string()),
1460 "INDEX was incorrectly split, got: {:?}",
1461 texts
1462 );
1463 }
1464
1465 #[test]
1466 fn include_not_broken() {
1467 // INCLUDE must not become IN+CLUDE.
1468 let texts = fixed_texts(" INCLUDEVAR=1\n");
1469 assert_eq!(texts[0], "INCLUDEVAR", "got: {:?}", texts);
1470 }
1471
1472 #[test]
1473 fn if_ident_not_broken() {
1474 // IFLAG must not become IF+LAG.
1475 let texts = fixed_texts(" IFLAG=1\n");
1476 assert_eq!(texts[0], "IFLAG", "got: {:?}", texts);
1477 }
1478
1479 #[test]
1480 fn whitespace_stripped_assignment() {
1481 // X=42 → identifier, =, integer
1482 let kinds = fixed_kinds(" X=42\n");
1483 assert_eq!(
1484 kinds,
1485 vec![
1486 TokenKind::Identifier,
1487 TokenKind::Assign,
1488 TokenKind::IntegerLiteral,
1489 ]
1490 );
1491 }
1492
1493 #[test]
1494 fn whitespace_stripped_expression() {
1495 // A+B*C → identifier, +, identifier, *, identifier
1496 let kinds = fixed_kinds(" A+B*C\n");
1497 assert_eq!(
1498 kinds,
1499 vec![
1500 TokenKind::Identifier,
1501 TokenKind::Plus,
1502 TokenKind::Identifier,
1503 TokenKind::Star,
1504 TokenKind::Identifier,
1505 ]
1506 );
1507 }
1508
1509 #[test]
1510 fn whitespace_stripped_with_parens() {
1511 // X=REAL(I) → identifier, =, identifier, (, identifier, )
1512 let kinds = fixed_kinds(" X=REAL(I)\n");
1513 assert_eq!(
1514 kinds,
1515 vec![
1516 TokenKind::Identifier,
1517 TokenKind::Assign,
1518 TokenKind::Identifier,
1519 TokenKind::LParen,
1520 TokenKind::Identifier,
1521 TokenKind::RParen,
1522 ]
1523 );
1524 }
1525
1526 #[test]
1527 fn whitespace_stripped_dot_op() {
1528 // A.AND.B → identifier, .and., identifier
1529 let kinds = fixed_kinds(" A.AND.B\n");
1530 assert_eq!(
1531 kinds,
1532 vec![
1533 TokenKind::Identifier,
1534 TokenKind::DotOp("and".into()),
1535 TokenKind::Identifier,
1536 ]
1537 );
1538 }
1539
1540 #[test]
1541 fn whitespace_stripped_real_literal() {
1542 // X=1.0D0 → identifier, =, real
1543 let kinds = fixed_kinds(" X=1.0D0\n");
1544 assert_eq!(
1545 kinds,
1546 vec![
1547 TokenKind::Identifier,
1548 TokenKind::Assign,
1549 TokenKind::RealLiteral,
1550 ]
1551 );
1552 }
1553
1554 #[test]
1555 fn whitespace_stripped_comparison() {
1556 // 1.EQ.2 → integer, .eq., integer
1557 let kinds = fixed_kinds(" IF(I.EQ.1)STOP\n");
1558 assert!(
1559 kinds.contains(&TokenKind::DotOp("eq".into())),
1560 "got: {:?}",
1561 kinds
1562 );
1563 }
1564
1565 #[test]
1566 fn whitespace_stripped_string_preserved() {
1567 // Whitespace INSIDE strings must be preserved.
1568 let kinds = fixed_kinds(" X='HELLO WORLD'\n");
1569 assert!(kinds.contains(&TokenKind::StringLiteral));
1570 let texts = fixed_texts(" X='HELLO WORLD'\n");
1571 assert!(
1572 texts.iter().any(|t| t.contains("HELLO WORLD")),
1573 "got: {:?}",
1574 texts
1575 );
1576 }
1577
1578 // ---- Continuation over blank lines ----
1579
1580 #[test]
1581 fn continuation_over_blank_line() {
1582 let src = " X = 1 +\n\n + 2\n";
1583 let kinds = fixed_kinds(src);
1584 let int_count = kinds
1585 .iter()
1586 .filter(|k| **k == TokenKind::IntegerLiteral)
1587 .count();
1588 assert_eq!(
1589 int_count, 2,
1590 "blank line should not break continuation, got: {:?}",
1591 kinds
1592 );
1593 }
1594
1595 // ---- DO/assignment ambiguity ----
1596
1597 #[test]
1598 fn do_loop_with_comma() {
1599 // DO10I=1,10 → DO loop: DO + 10 + I + = + 1 + , + 10
1600 let kinds = fixed_kinds(" DO10I=1,10\n");
1601 assert!(
1602 kinds.contains(&TokenKind::Comma),
1603 "DO loop must have comma, got: {:?}",
1604 kinds
1605 );
1606 let texts = fixed_texts(" DO10I=1,10\n");
1607 assert_eq!(
1608 texts[0], "DO",
1609 "first token should be DO keyword, got: {:?}",
1610 texts
1611 );
1612 }
1613
1614 #[test]
1615 fn do_assignment_no_comma() {
1616 // DO10I=1.10 → assignment: DO10I + = + 1.10 (no comma → not a loop)
1617 let kinds = fixed_kinds(" DO10I=1.10\n");
1618 assert!(
1619 !kinds.contains(&TokenKind::Comma),
1620 "assignment should have no comma, got: {:?}",
1621 kinds
1622 );
1623 let texts = fixed_texts(" DO10I=1.10\n");
1624 assert_eq!(
1625 texts[0], "DO10I",
1626 "should be single identifier, got: {:?}",
1627 texts
1628 );
1629 }
1630
1631 #[test]
1632 fn do_assignment_no_comma_integer() {
1633 // DO10I=1 → assignment (no comma)
1634 let kinds = fixed_kinds(" DO10I=1\n");
1635 assert!(!kinds.contains(&TokenKind::Comma));
1636 let texts = fixed_texts(" DO10I=1\n");
1637 assert_eq!(texts[0], "DO10I");
1638 }
1639
1640 // ---- BOZ in fixed-form ----
1641
1642 #[test]
1643 fn boz_in_fixed_form() {
1644 let kinds = fixed_kinds(" X=B'1010'\n");
1645 assert!(kinds.contains(&TokenKind::BozLiteral), "got: {:?}", kinds);
1646 }
1647
1648 #[test]
1649 fn boz_hex_in_fixed_form() {
1650 let kinds = fixed_kinds(" X=Z'FF'\n");
1651 assert!(kinds.contains(&TokenKind::BozLiteral), "got: {:?}", kinds);
1652 }
1653
1654 // ---- Hollerith integration ----
1655
1656 #[test]
1657 fn hollerith_in_source() {
1658 // 3HABC in a statement should produce a string literal.
1659 let kinds = fixed_kinds(" X=3HABC\n");
1660 assert!(
1661 kinds.contains(&TokenKind::StringLiteral),
1662 "got: {:?}",
1663 kinds
1664 );
1665 let texts = fixed_texts(" X=3HABC\n");
1666 assert!(
1667 texts.iter().any(|t| t.contains("ABC")),
1668 "Hollerith content missing, got: {:?}",
1669 texts
1670 );
1671 }
1672
1673 #[test]
1674 fn hollerith_with_spaces_in_source() {
1675 // 6H HELLO preserves the space.
1676 let texts = fixed_texts(" X=6H HELLO\n");
1677 assert!(
1678 texts.iter().any(|t| t.contains(" HELLO")),
1679 "space lost, got: {:?}",
1680 texts
1681 );
1682 }
1683
1684 #[test]
1685 fn hollerith_zero_length() {
1686 // 0H should produce empty string literal.
1687 assert_eq!(protect_hollerith("=0H+"), "=''+");
1688 }
1689
1690 // ---- String in fixed-form ----
1691
1692 #[test]
1693 fn string_literal_in_fixed_form() {
1694 let kinds = fixed_kinds(" X = 'IT''S'\n");
1695 assert!(kinds.contains(&TokenKind::StringLiteral));
1696 let texts = fixed_texts(" X = 'IT''S'\n");
1697 assert!(
1698 texts.iter().any(|t| t.contains("IT''S")),
1699 "got: {:?}",
1700 texts
1701 );
1702 }
1703
1704 #[test]
1705 fn unterminated_string_error() {
1706 let result = tokenize_fixed(" X = 'UNTERMINATED\n", 0);
1707 assert!(result.is_err(), "should error on unterminated string");
1708 }
1709
1710 #[test]
1711 fn doublecomplex_keyword() {
1712 use crate::lexer::is_keyword;
1713 assert!(is_keyword("doublecomplex").is_some());
1714 assert!(is_keyword("DOUBLECOMPLEX").is_some());
1715 }
1716
1717 #[test]
1718 fn continue_keyword() {
1719 use crate::lexer::is_keyword;
1720 assert!(is_keyword("continue").is_some());
1721 assert!(is_keyword("CONTINUE").is_some());
1722 }
1723 }
1724