Rust · 23202 bytes Raw Blame History
1 //! Core syntax highlighting engine
2
3 #![allow(dead_code)]
4
5 use super::languages::{Language, LanguageDef};
6 use crossterm::style::Color;
7
/// Token types for syntax highlighting.
///
/// Each variant maps to a terminal color via `TokenType::color`; keywords
/// and function names additionally render bold via `TokenType::bold`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenType {
    /// Unstyled text (terminal default color).
    Plain,
    /// Language keywords from the language's keyword table.
    Keyword,
    /// String literals, including multiline/triple-quoted strings.
    String,
    /// Numeric literals (decimal, float, hex/octal/binary).
    Number,
    /// Line comments and block comments.
    Comment,
    /// Operators from the language's operator table.
    Operator,
    /// Known type names from the language's type table.
    Type,
    /// Identifiers immediately followed by `(` (likely call sites).
    Function,
    /// Preprocessor directives (`#...` at line start, when the language enables them).
    Preprocessor,
    /// Attributes / decorators (Rust `#[...]` / `#![...]`, Python `@name`).
    Attribute,
    /// Single punctuation characters from the language's punctuation set.
    Punctuation,
}
23
24 impl TokenType {
25 /// Get the foreground color for this token type
26 pub fn color(&self) -> Color {
27 match self {
28 TokenType::Plain => Color::Reset,
29 TokenType::Keyword => Color::Blue,
30 TokenType::String => Color::Green,
31 TokenType::Number => Color::Magenta,
32 TokenType::Comment => Color::DarkGrey,
33 TokenType::Operator => Color::Yellow,
34 TokenType::Type => Color::Cyan,
35 TokenType::Function => Color::Cyan,
36 TokenType::Preprocessor => Color::Magenta,
37 TokenType::Attribute => Color::Yellow,
38 TokenType::Punctuation => Color::DarkGrey,
39 }
40 }
41
42 /// Whether this token type should be bold
43 pub fn bold(&self) -> bool {
44 matches!(self, TokenType::Keyword | TokenType::Function)
45 }
46 }
47
/// A token in a line of text.
///
/// Columns are CHARACTER indices into the line (not byte offsets), matching
/// the `Vec<char>` view built by `Highlighter::tokenize_line`. Spans of the
/// line not covered by any token received no highlighting (the tokenizer
/// does not emit `Plain` tokens).
#[derive(Debug, Clone)]
pub struct Token {
    /// Token type
    pub token_type: TokenType,
    /// Start column (character index, not byte)
    pub start: usize,
    /// End column (exclusive, character index)
    pub end: usize,
}
58
/// State for multiline constructs (comments, strings).
///
/// Threaded from one line to the next so a construct opened on an earlier
/// line keeps highlighting subsequent lines until its terminator is found.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct HighlightState {
    /// Currently inside a block comment (the language's block-comment delimiters).
    pub in_block_comment: bool,
    /// Currently inside a multiline string; holds the closing delimiter
    /// (e.g. a triple quote) needed to find where it ends.
    pub in_multiline_string: Option<String>,
}
67
/// Syntax highlighter for a specific language.
///
/// Tokenizes one line at a time; multiline constructs are handled by
/// threading a `HighlightState` between consecutive lines, with a per-line
/// cache of end-of-line states so rendering can start from any line in O(1).
#[derive(Debug)]
pub struct Highlighter {
    /// Current language definition (`None` disables highlighting)
    language: Option<LanguageDef>,
    /// State for multiline constructs
    // NOTE(review): only ever written (in `new`/`invalidate_cache`) and never
    // read — `tokenize_line` takes the state as a parameter instead. Looks
    // vestigial; confirm before removing.
    state: HighlightState,
    /// Cached state at the END of each line (state_cache[i] = state after processing line i)
    /// This allows O(1) lookup of the starting state for any line
    state_cache: Vec<HighlightState>,
    /// Line index from which cache is invalid (everything from this line onward needs recalc)
    cache_valid_until: usize,
}
81
impl Default for Highlighter {
    /// Equivalent to `Highlighter::new`: no language set, empty cache.
    fn default() -> Self {
        Self::new()
    }
}
87
88 impl Highlighter {
89 /// Create a new highlighter with no language
90 pub fn new() -> Self {
91 Self {
92 language: None,
93 state: HighlightState::default(),
94 state_cache: Vec::new(),
95 cache_valid_until: 0,
96 }
97 }
98
    /// Detect and set language based on filename (detection is delegated to
    /// `Language::detect`). Clears the language when nothing matches, and
    /// invalidates the whole state cache either way.
    pub fn detect_language(&mut self, filename: &str) {
        self.language = Language::detect(filename).map(|l| l.definition());
        self.invalidate_cache(0);
    }
104
    /// Set language explicitly, invalidating the whole state cache.
    pub fn set_language(&mut self, lang: Language) {
        self.language = Some(lang.definition());
        self.invalidate_cache(0);
    }
110
    /// Clear language (disable highlighting) and invalidate the whole cache.
    pub fn clear_language(&mut self) {
        self.language = None;
        self.invalidate_cache(0);
    }
116
    /// Check if highlighting is enabled (i.e. a language is currently set).
    pub fn is_enabled(&self) -> bool {
        self.language.is_some()
    }
121
122 /// Get current language name
123 pub fn language_name(&self) -> Option<&str> {
124 self.language.as_ref().map(|l| l.name)
125 }
126
127 /// Get the line comment prefix for the current language (e.g., "//", "#", "--")
128 pub fn line_comment(&self) -> Option<&'static str> {
129 self.language.as_ref().and_then(|l| l.line_comment)
130 }
131
    /// Reset multiline state (call when buffer changes significantly).
    /// Equivalent to invalidating the cache from line 0.
    pub fn reset_state(&mut self) {
        self.invalidate_cache(0);
    }
136
    /// Invalidate the highlight state cache from a specific line onward.
    /// Call this when the buffer content changes at or after line `from_line`.
    ///
    /// Uses `min` so repeated invalidations can only shrink the valid range,
    /// never extend it past an earlier invalidation point.
    pub fn invalidate_cache(&mut self, from_line: usize) {
        self.cache_valid_until = self.cache_valid_until.min(from_line);
        // NOTE(review): resets the internal `state` even for a partial
        // invalidation. `tokenize_line` never reads `self.state` (it takes
        // the state as a parameter), so this appears defensive — confirm
        // before relying on or removing it.
        self.state = HighlightState::default();
    }
143
    /// Get the starting highlight state for a given line by looking up the cache.
    /// Returns the state after processing (line_idx - 1), or default state for line 0.
    pub fn get_state_for_line(&self, line_idx: usize) -> HighlightState {
        if line_idx == 0 {
            // The first line always starts from a clean state.
            HighlightState::default()
        } else if line_idx <= self.cache_valid_until && line_idx <= self.state_cache.len() {
            // Cached entries for lines < cache_valid_until are trustworthy,
            // and `line_idx <= len` guarantees `line_idx - 1` is in bounds.
            self.state_cache[line_idx - 1].clone()
        } else {
            // Cache miss - caller needs to rebuild from last valid point
            // (see `cache_valid_from`); the default returned here is only
            // correct once the caller re-tokenizes forward from that point.
            HighlightState::default()
        }
    }
156
157 /// Update the state cache after tokenizing a line.
158 /// Call this after tokenize_line() with the resulting state.
159 pub fn update_cache(&mut self, line_idx: usize, state: &HighlightState) {
160 // Ensure cache is large enough
161 if line_idx >= self.state_cache.len() {
162 self.state_cache.resize(line_idx + 1, HighlightState::default());
163 }
164 self.state_cache[line_idx] = state.clone();
165 // Update valid range if this extends it
166 if line_idx >= self.cache_valid_until {
167 self.cache_valid_until = line_idx + 1;
168 }
169 }
170
    /// Get the line number from which the cache is valid.
    ///
    /// Despite the name, this returns `cache_valid_until`: lines
    /// `0..cache_valid_until` have trustworthy cached states, and callers
    /// should re-tokenize forward starting at this index.
    pub fn cache_valid_from(&self) -> usize {
        self.cache_valid_until
    }
175
    /// Tokenize a single line, returning tokens and updated state
    /// The state should be passed from the previous line for correct multiline handling
    ///
    /// `state` is mutated in place: on return it describes the state at the
    /// END of this line, ready to be passed to the next line (and recorded
    /// via `update_cache`). Returns an empty token list when no language is
    /// set. Branch order matters: continuations first, then comments,
    /// strings, numbers, preprocessor, attributes, identifiers, operators,
    /// punctuation.
    pub fn tokenize_line(&self, line: &str, state: &mut HighlightState) -> Vec<Token> {
        let lang = match &self.language {
            Some(l) => l,
            None => return vec![],
        };

        let mut tokens = Vec::new();
        // Work on chars so token columns are character indices, not bytes.
        let chars: Vec<char> = line.chars().collect();
        let mut i = 0;

        while i < chars.len() {
            // Handle continuing multiline comment
            if state.in_block_comment {
                if let Some((end_start, end_len)) = self.find_block_comment_end(lang, &chars, i) {
                    // Terminator found on this line: close out the comment.
                    tokens.push(Token {
                        token_type: TokenType::Comment,
                        start: i,
                        end: end_start + end_len,
                    });
                    i = end_start + end_len;
                    state.in_block_comment = false;
                    continue;
                } else {
                    // Rest of line is comment
                    tokens.push(Token {
                        token_type: TokenType::Comment,
                        start: i,
                        end: chars.len(),
                    });
                    break;
                }
            }

            // Handle continuing multiline string
            if let Some(delim) = state.in_multiline_string.as_ref() {
                if let Some(end_pos) = self.find_string_end(&chars, i, delim) {
                    // Closing delimiter found: the string ends here.
                    tokens.push(Token {
                        token_type: TokenType::String,
                        start: i,
                        end: end_pos,
                    });
                    i = end_pos;
                    state.in_multiline_string = None;
                    continue;
                } else {
                    // Rest of line is string
                    tokens.push(Token {
                        token_type: TokenType::String,
                        start: i,
                        end: chars.len(),
                    });
                    break;
                }
            }

            // Skip whitespace
            if chars[i].is_whitespace() {
                i += 1;
                continue;
            }

            // Check for line comment
            if let Some(ref comment) = lang.line_comment {
                if self.matches_at(&chars, i, comment) {
                    // Line comments run to end of line.
                    tokens.push(Token {
                        token_type: TokenType::Comment,
                        start: i,
                        end: chars.len(),
                    });
                    break;
                }
            }

            // Check for block comment start (only when the language defines
            // both an opening and a closing delimiter).
            if let (Some(ref start), Some(_)) = (&lang.block_comment_start, &lang.block_comment_end) {
                if self.matches_at(&chars, i, start) {
                    let comment_start = i;
                    i += start.chars().count();

                    if let Some((end_start, end_len)) = self.find_block_comment_end(lang, &chars, i) {
                        // Opens and closes on the same line.
                        tokens.push(Token {
                            token_type: TokenType::Comment,
                            start: comment_start,
                            end: end_start + end_len,
                        });
                        i = end_start + end_len;
                    } else {
                        // Multiline comment continues
                        tokens.push(Token {
                            token_type: TokenType::Comment,
                            start: comment_start,
                            end: chars.len(),
                        });
                        state.in_block_comment = true;
                        break;
                    }
                    continue;
                }
            }

            // Check for strings
            if let Some((token, new_i, multiline_delim)) = self.try_parse_string(lang, &chars, i) {
                tokens.push(token);
                i = new_i;
                if let Some(delim) = multiline_delim {
                    // Triple-quoted string spills onto the next line.
                    state.in_multiline_string = Some(delim);
                    break;
                }
                continue;
            }

            // Check for numbers
            if let Some((token, new_i)) = self.try_parse_number(&chars, i) {
                tokens.push(token);
                i = new_i;
                continue;
            }

            // Check for preprocessor directives: '#' as the first
            // non-whitespace character claims the rest of the line.
            if lang.has_preprocessor && chars[i] == '#' && self.is_line_start(&chars, i) {
                tokens.push(Token {
                    token_type: TokenType::Preprocessor,
                    start: i,
                    end: chars.len(),
                });
                break;
            }

            // Check for attributes (Rust #[], Python @)
            if let Some((token, new_i)) = self.try_parse_attribute(lang, &chars, i) {
                tokens.push(token);
                i = new_i;
                continue;
            }

            // Check for identifiers (keywords, types, functions)
            if chars[i].is_alphabetic() || chars[i] == '_' {
                let start = i;
                while i < chars.len() && (chars[i].is_alphanumeric() || chars[i] == '_') {
                    i += 1;
                }
                let word: String = chars[start..i].iter().collect();

                // Classify: keyword table first, then type table, then a
                // trailing '(' marks a function; anything else stays plain.
                let token_type = if lang.keywords.contains(&word.as_str()) {
                    TokenType::Keyword
                } else if lang.types.contains(&word.as_str()) {
                    TokenType::Type
                } else if i < chars.len() && chars[i] == '(' {
                    TokenType::Function
                } else {
                    TokenType::Plain
                };

                // Plain identifiers produce no token (rendered unstyled).
                if token_type != TokenType::Plain {
                    tokens.push(Token {
                        token_type,
                        start,
                        end: i,
                    });
                }
                continue;
            }

            // Check for operators
            if let Some((token, new_i)) = self.try_parse_operator(lang, &chars, i) {
                tokens.push(token);
                i = new_i;
                continue;
            }

            // Check for punctuation
            if lang.punctuation.contains(&chars[i]) {
                tokens.push(Token {
                    token_type: TokenType::Punctuation,
                    start: i,
                    end: i + 1,
                });
                i += 1;
                continue;
            }

            // Skip unknown character
            i += 1;
        }

        tokens
    }
365
366 fn matches_at(&self, chars: &[char], pos: usize, pattern: &str) -> bool {
367 let pattern_chars: Vec<char> = pattern.chars().collect();
368 if pos + pattern_chars.len() > chars.len() {
369 return false;
370 }
371 for (i, &pc) in pattern_chars.iter().enumerate() {
372 if chars[pos + i] != pc {
373 return false;
374 }
375 }
376 true
377 }
378
379 fn find_block_comment_end(&self, lang: &LanguageDef, chars: &[char], start: usize) -> Option<(usize, usize)> {
380 let end_pattern = lang.block_comment_end.as_ref()?;
381 let end_chars: Vec<char> = end_pattern.chars().collect();
382
383 for i in start..chars.len() {
384 if self.matches_at(chars, i, end_pattern) {
385 return Some((i, end_chars.len()));
386 }
387 }
388 None
389 }
390
    /// Try to parse a string literal starting at `start`.
    ///
    /// Returns `(token, index_past_string, multiline_delimiter)`:
    /// the third element is `Some(delim)` only when a triple-quoted string
    /// opened here and did not close on this line, in which case the caller
    /// must record `delim` in the highlight state. Returns `None` when
    /// `chars[start]` is not one of the language's string delimiters.
    /// Backslash escapes the following character in all scan loops below.
    fn try_parse_string(&self, lang: &LanguageDef, chars: &[char], start: usize) -> Option<(Token, usize, Option<String>)> {
        let c = chars[start];

        // Check for string delimiters
        if !lang.string_delimiters.contains(&c) {
            return None;
        }

        // Check for triple-quoted strings (Python, etc.)
        if lang.multiline_strings {
            // The delimiter is the opening quote character repeated 3 times.
            let triple: String = std::iter::repeat(c).take(3).collect();
            if self.matches_at(chars, start, &triple) {
                let delim_len = 3;
                let mut i = start + delim_len;

                while i < chars.len() {
                    if self.matches_at(chars, i, &triple) {
                        // Closed on the same line.
                        return Some((
                            Token {
                                token_type: TokenType::String,
                                start,
                                end: i + delim_len,
                            },
                            i + delim_len,
                            None,
                        ));
                    }
                    // Skip escaped characters so `\"""` doesn't close early.
                    if chars[i] == '\\' && i + 1 < chars.len() {
                        i += 2;
                    } else {
                        i += 1;
                    }
                }

                // String continues on next line
                return Some((
                    Token {
                        token_type: TokenType::String,
                        start,
                        end: chars.len(),
                    },
                    chars.len(),
                    Some(triple),
                ));
            }
        }

        // Regular string
        let mut i = start + 1;
        while i < chars.len() {
            if chars[i] == c {
                // Found the matching closing quote.
                return Some((
                    Token {
                        token_type: TokenType::String,
                        start,
                        end: i + 1,
                    },
                    i + 1,
                    None,
                ));
            }
            // Skip escaped characters (e.g. \" inside a "..." string).
            if chars[i] == '\\' && i + 1 < chars.len() {
                i += 2;
            } else {
                i += 1;
            }
        }

        // Unterminated string - highlight to end of line
        // (single-line strings do NOT set the multiline state).
        Some((
            Token {
                token_type: TokenType::String,
                start,
                end: chars.len(),
            },
            chars.len(),
            None,
        ))
    }
470
471 fn find_string_end(&self, chars: &[char], start: usize, delim: &str) -> Option<usize> {
472 let mut i = start;
473 while i < chars.len() {
474 if self.matches_at(chars, i, delim) {
475 return Some(i + delim.chars().count());
476 }
477 if chars[i] == '\\' && i + 1 < chars.len() {
478 i += 2;
479 } else {
480 i += 1;
481 }
482 }
483 None
484 }
485
486 fn try_parse_number(&self, chars: &[char], start: usize) -> Option<(Token, usize)> {
487 let c = chars[start];
488
489 // Must start with digit, or . followed by digit
490 if !c.is_ascii_digit() {
491 if c == '.' && start + 1 < chars.len() && chars[start + 1].is_ascii_digit() {
492 // .5 style float
493 } else {
494 return None;
495 }
496 }
497
498 let mut i = start;
499 let mut has_dot = c == '.';
500 let mut has_exp = false;
501
502 // Handle hex, octal, binary
503 if c == '0' && i + 1 < chars.len() {
504 match chars[i + 1] {
505 'x' | 'X' => {
506 i += 2;
507 while i < chars.len() && (chars[i].is_ascii_hexdigit() || chars[i] == '_') {
508 i += 1;
509 }
510 return Some((Token { token_type: TokenType::Number, start, end: i }, i));
511 }
512 'o' | 'O' => {
513 i += 2;
514 while i < chars.len() && (chars[i].is_digit(8) || chars[i] == '_') {
515 i += 1;
516 }
517 return Some((Token { token_type: TokenType::Number, start, end: i }, i));
518 }
519 'b' | 'B' => {
520 i += 2;
521 while i < chars.len() && (chars[i] == '0' || chars[i] == '1' || chars[i] == '_') {
522 i += 1;
523 }
524 return Some((Token { token_type: TokenType::Number, start, end: i }, i));
525 }
526 _ => {}
527 }
528 }
529
530 // Decimal number (possibly float)
531 while i < chars.len() {
532 let ch = chars[i];
533 if ch.is_ascii_digit() || ch == '_' {
534 i += 1;
535 } else if ch == '.' && !has_dot && !has_exp {
536 // Check it's not a method call like 5.to_string()
537 if i + 1 < chars.len() && chars[i + 1].is_ascii_digit() {
538 has_dot = true;
539 i += 1;
540 } else if i + 1 >= chars.len() {
541 has_dot = true;
542 i += 1;
543 } else {
544 break;
545 }
546 } else if (ch == 'e' || ch == 'E') && !has_exp {
547 has_exp = true;
548 i += 1;
549 if i < chars.len() && (chars[i] == '+' || chars[i] == '-') {
550 i += 1;
551 }
552 } else {
553 break;
554 }
555 }
556
557 // Handle type suffixes (f32, i64, etc.)
558 if i < chars.len() && chars[i].is_alphabetic() {
559 let suffix_start = i;
560 while i < chars.len() && (chars[i].is_alphanumeric() || chars[i] == '_') {
561 i += 1;
562 }
563 // Common numeric suffixes
564 let suffix: String = chars[suffix_start..i].iter().collect();
565 let valid_suffixes = ["f32", "f64", "i8", "i16", "i32", "i64", "i128", "isize",
566 "u8", "u16", "u32", "u64", "u128", "usize", "f", "d", "l", "L"];
567 if !valid_suffixes.contains(&suffix.as_str()) {
568 i = suffix_start; // Not a valid suffix, rollback
569 }
570 }
571
572 if i > start {
573 Some((Token { token_type: TokenType::Number, start, end: i }, i))
574 } else {
575 None
576 }
577 }
578
579 fn try_parse_operator(&self, lang: &LanguageDef, chars: &[char], start: usize) -> Option<(Token, usize)> {
580 // Try longer operators first
581 for &op in &lang.operators {
582 if self.matches_at(chars, start, op) {
583 let len = op.chars().count();
584 return Some((
585 Token {
586 token_type: TokenType::Operator,
587 start,
588 end: start + len,
589 },
590 start + len,
591 ));
592 }
593 }
594 None
595 }
596
597 fn try_parse_attribute(&self, lang: &LanguageDef, chars: &[char], start: usize) -> Option<(Token, usize)> {
598 // Rust attributes: #[...] or #![...]
599 if lang.name == "Rust" && chars[start] == '#' {
600 let mut i = start + 1;
601 if i < chars.len() && chars[i] == '!' {
602 i += 1;
603 }
604 if i < chars.len() && chars[i] == '[' {
605 let attr_start = start;
606 let mut bracket_depth = 1;
607 i += 1;
608 while i < chars.len() && bracket_depth > 0 {
609 match chars[i] {
610 '[' => bracket_depth += 1,
611 ']' => bracket_depth -= 1,
612 _ => {}
613 }
614 i += 1;
615 }
616 return Some((
617 Token {
618 token_type: TokenType::Attribute,
619 start: attr_start,
620 end: i,
621 },
622 i,
623 ));
624 }
625 }
626
627 // Python decorators: @name
628 if lang.name == "Python" && chars[start] == '@' {
629 let mut i = start + 1;
630 while i < chars.len() && (chars[i].is_alphanumeric() || chars[i] == '_' || chars[i] == '.') {
631 i += 1;
632 }
633 if i > start + 1 {
634 return Some((
635 Token {
636 token_type: TokenType::Attribute,
637 start,
638 end: i,
639 },
640 i,
641 ));
642 }
643 }
644
645 None
646 }
647
648 fn is_line_start(&self, chars: &[char], pos: usize) -> bool {
649 for i in 0..pos {
650 if !chars[i].is_whitespace() {
651 return false;
652 }
653 }
654 true
655 }
656 }
657
#[cfg(test)]
mod tests {
    use super::*;

    /// `let` should come out as a keyword and `42` as a number.
    #[test]
    fn test_rust_keywords() {
        let mut hl = Highlighter::new();
        hl.set_language(Language::Rust);
        let mut state = HighlightState::default();

        let tokens = hl.tokenize_line("let x = 42;", &mut state);
        assert!(tokens.iter().any(|t| t.token_type == TokenType::Keyword)); // let
        assert!(tokens.iter().any(|t| t.token_type == TokenType::Number)); // 42
    }

    /// A quoted literal should produce a String token.
    #[test]
    fn test_string_parsing() {
        let mut hl = Highlighter::new();
        hl.set_language(Language::Rust);
        let mut state = HighlightState::default();

        let tokens = hl.tokenize_line(r#"let s = "hello";"#, &mut state);
        assert!(tokens.iter().any(|t| t.token_type == TokenType::String));
    }

    /// A line comment swallows the entire line as a single Comment token.
    #[test]
    fn test_comment_parsing() {
        let mut hl = Highlighter::new();
        hl.set_language(Language::Rust);
        let mut state = HighlightState::default();

        let tokens = hl.tokenize_line("// this is a comment", &mut state);
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].token_type, TokenType::Comment);
    }
}
694