Rust · 23202 bytes Raw Blame History
1 //! Core syntax highlighting engine
2
3 #![allow(dead_code)]
4
5 use super::languages::{Language, LanguageDef};
6 use crossterm::style::Color;
7
/// Token types for syntax highlighting.
///
/// Each variant maps to a terminal color via `TokenType::color`; keywords
/// and function names additionally render bold via `TokenType::bold`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenType {
    /// Unstyled text (terminal default color).
    Plain,
    /// Language keywords from the language's keyword table.
    Keyword,
    /// String literals, including multiline/triple-quoted strings.
    String,
    /// Numeric literals (decimal, float, hex/octal/binary).
    Number,
    /// Line comments and block comments.
    Comment,
    /// Operators from the language's operator table.
    Operator,
    /// Known type names from the language's type table.
    Type,
    /// Identifiers immediately followed by `(` (likely call sites).
    Function,
    /// Preprocessor directives (`#...` at line start, when the language enables them).
    Preprocessor,
    /// Attributes / decorators (Rust `#[...]` / `#![...]`, Python `@name`).
    Attribute,
    /// Single punctuation characters from the language's punctuation set.
    Punctuation,
}
23
24 impl TokenType {
25 /// Get the foreground color for this token type
26 pub fn color(&self) -> Color {
27 match self {
28 TokenType::Plain => Color::Reset,
29 TokenType::Keyword => Color::Blue,
30 TokenType::String => Color::Green,
31 TokenType::Number => Color::Magenta,
32 TokenType::Comment => Color::DarkGrey,
33 TokenType::Operator => Color::Yellow,
34 TokenType::Type => Color::Cyan,
35 TokenType::Function => Color::Cyan,
36 TokenType::Preprocessor => Color::Magenta,
37 TokenType::Attribute => Color::Yellow,
38 TokenType::Punctuation => Color::DarkGrey,
39 }
40 }
41
42 /// Whether this token type should be bold
43 pub fn bold(&self) -> bool {
44 matches!(self, TokenType::Keyword | TokenType::Function)
45 }
46 }
47
/// A token in a line of text.
///
/// Columns are CHARACTER indices into the line (not byte offsets), matching
/// the `Vec<char>` view built by `Highlighter::tokenize_line`. Spans of the
/// line not covered by any token received no highlighting (the tokenizer
/// does not emit `Plain` tokens).
#[derive(Debug, Clone)]
pub struct Token {
    /// Token type
    pub token_type: TokenType,
    /// Start column (character index, not byte)
    pub start: usize,
    /// End column (exclusive, character index)
    pub end: usize,
}
58
/// State for multiline constructs (comments, strings).
///
/// Threaded from one line to the next so a construct opened on an earlier
/// line keeps highlighting subsequent lines until its terminator is found.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct HighlightState {
    /// Currently inside a block comment (the language's block-comment delimiters).
    pub in_block_comment: bool,
    /// Currently inside a multiline string; holds the closing delimiter
    /// (e.g. a triple quote) needed to find where it ends.
    pub in_multiline_string: Option<String>,
}
67
/// Syntax highlighter for a specific language.
///
/// Tokenizes one line at a time; multiline constructs are handled by
/// threading a `HighlightState` between consecutive lines, with a per-line
/// cache of end-of-line states so rendering can start from any line in O(1).
#[derive(Debug)]
pub struct Highlighter {
    /// Current language definition (`None` disables highlighting)
    language: Option<LanguageDef>,
    /// State for multiline constructs
    // NOTE(review): only ever written (in `new`/`invalidate_cache`) and never
    // read — `tokenize_line` takes the state as a parameter instead. Looks
    // vestigial; confirm before removing.
    state: HighlightState,
    /// Cached state at the END of each line (state_cache[i] = state after processing line i)
    /// This allows O(1) lookup of the starting state for any line
    state_cache: Vec<HighlightState>,
    /// Line index from which cache is invalid (everything from this line onward needs recalc)
    cache_valid_until: usize,
}
81
impl Default for Highlighter {
    /// Equivalent to `Highlighter::new`: no language set, empty cache.
    fn default() -> Self {
        Self::new()
    }
}
87
88 impl Highlighter {
89 /// Create a new highlighter with no language
90 pub fn new() -> Self {
91 Self {
92 language: None,
93 state: HighlightState::default(),
94 state_cache: Vec::new(),
95 cache_valid_until: 0,
96 }
97 }
98
    /// Detect and set language based on filename (detection is delegated to
    /// `Language::detect`). Clears the language when nothing matches, and
    /// invalidates the whole state cache either way.
    pub fn detect_language(&mut self, filename: &str) {
        self.language = Language::detect(filename).map(|l| l.definition());
        self.invalidate_cache(0);
    }
104
    /// Set language explicitly, invalidating the whole state cache.
    pub fn set_language(&mut self, lang: Language) {
        self.language = Some(lang.definition());
        self.invalidate_cache(0);
    }
110
    /// Clear language (disable highlighting) and invalidate the whole cache.
    pub fn clear_language(&mut self) {
        self.language = None;
        self.invalidate_cache(0);
    }
116
    /// Check if highlighting is enabled (i.e. a language is currently set).
    pub fn is_enabled(&self) -> bool {
        self.language.is_some()
    }
121
122 /// Get current language name
123 pub fn language_name(&self) -> Option<&str> {
124 self.language.as_ref().map(|l| l.name)
125 }
126
127 /// Get the line comment prefix for the current language (e.g., "//", "#", "--")
128 pub fn line_comment(&self) -> Option<&'static str> {
129 self.language.as_ref().and_then(|l| l.line_comment)
130 }
131
    /// Reset multiline state (call when buffer changes significantly).
    /// Equivalent to invalidating the cache from line 0.
    pub fn reset_state(&mut self) {
        self.invalidate_cache(0);
    }
136
    /// Invalidate the highlight state cache from a specific line onward.
    /// Call this when the buffer content changes at or after line `from_line`.
    ///
    /// Uses `min` so repeated invalidations can only shrink the valid range,
    /// never extend it past an earlier invalidation point.
    pub fn invalidate_cache(&mut self, from_line: usize) {
        self.cache_valid_until = self.cache_valid_until.min(from_line);
        // NOTE(review): resets the internal `state` even for a partial
        // invalidation. `tokenize_line` never reads `self.state` (it takes
        // the state as a parameter), so this appears defensive — confirm
        // before relying on or removing it.
        self.state = HighlightState::default();
    }
143
    /// Get the starting highlight state for a given line by looking up the cache.
    /// Returns the state after processing (line_idx - 1), or default state for line 0.
    pub fn get_state_for_line(&self, line_idx: usize) -> HighlightState {
        if line_idx == 0 {
            // The first line always starts from a clean state.
            HighlightState::default()
        } else if line_idx <= self.cache_valid_until && line_idx <= self.state_cache.len() {
            // Cached entries for lines < cache_valid_until are trustworthy,
            // and `line_idx <= len` guarantees `line_idx - 1` is in bounds.
            self.state_cache[line_idx - 1].clone()
        } else {
            // Cache miss - caller needs to rebuild from last valid point
            // (see `cache_valid_from`); the default returned here is only
            // correct once the caller re-tokenizes forward from that point.
            HighlightState::default()
        }
    }
156
157 /// Update the state cache after tokenizing a line.
158 /// Call this after tokenize_line() with the resulting state.
159 pub fn update_cache(&mut self, line_idx: usize, state: &HighlightState) {
160 // Ensure cache is large enough
161 if line_idx >= self.state_cache.len() {
162 self.state_cache.resize(line_idx + 1, HighlightState::default());
163 }
164 self.state_cache[line_idx] = state.clone();
165 // Update valid range if this extends it
166 if line_idx >= self.cache_valid_until {
167 self.cache_valid_until = line_idx + 1;
168 }
169 }
170
    /// Get the line number from which the cache is valid.
    ///
    /// Despite the name, this returns `cache_valid_until`: lines
    /// `0..cache_valid_until` have trustworthy cached states, and callers
    /// should re-tokenize forward starting at this index.
    pub fn cache_valid_from(&self) -> usize {
        self.cache_valid_until
    }
175
    /// Tokenize a single line, returning tokens and updated state
    /// The state should be passed from the previous line for correct multiline handling
    ///
    /// `state` is mutated in place: on return it describes the state at the
    /// END of this line, ready to be passed to the next line (and recorded
    /// via `update_cache`). Returns an empty token list when no language is
    /// set. Branch order matters: continuations first, then comments,
    /// strings, numbers, preprocessor, attributes, identifiers, operators,
    /// punctuation.
    pub fn tokenize_line(&self, line: &str, state: &mut HighlightState) -> Vec<Token> {
        let lang = match &self.language {
            Some(l) => l,
            None => return vec![],
        };

        let mut tokens = Vec::new();
        // Work on chars so token columns are character indices, not bytes.
        let chars: Vec<char> = line.chars().collect();
        let mut i = 0;

        while i < chars.len() {
            // Handle continuing multiline comment
            if state.in_block_comment {
                if let Some((end_start, end_len)) = self.find_block_comment_end(lang, &chars, i) {
                    // Terminator found on this line: close out the comment.
                    tokens.push(Token {
                        token_type: TokenType::Comment,
                        start: i,
                        end: end_start + end_len,
                    });
                    i = end_start + end_len;
                    state.in_block_comment = false;
                    continue;
                } else {
                    // Rest of line is comment
                    tokens.push(Token {
                        token_type: TokenType::Comment,
                        start: i,
                        end: chars.len(),
                    });
                    break;
                }
            }

            // Handle continuing multiline string
            if let Some(delim) = state.in_multiline_string.as_ref() {
                if let Some(end_pos) = self.find_string_end(&chars, i, delim) {
                    // Closing delimiter found: the string ends here.
                    tokens.push(Token {
                        token_type: TokenType::String,
                        start: i,
                        end: end_pos,
                    });
                    i = end_pos;
                    state.in_multiline_string = None;
                    continue;
                } else {
                    // Rest of line is string
                    tokens.push(Token {
                        token_type: TokenType::String,
                        start: i,
                        end: chars.len(),
                    });
                    break;
                }
            }

            // Skip whitespace
            if chars[i].is_whitespace() {
                i += 1;
                continue;
            }

            // Check for line comment
            if let Some(ref comment) = lang.line_comment {
                if self.matches_at(&chars, i, comment) {
                    // Line comments run to end of line.
                    tokens.push(Token {
                        token_type: TokenType::Comment,
                        start: i,
                        end: chars.len(),
                    });
                    break;
                }
            }

            // Check for block comment start (only when the language defines
            // both an opening and a closing delimiter).
            if let (Some(ref start), Some(_)) = (&lang.block_comment_start, &lang.block_comment_end) {
                if self.matches_at(&chars, i, start) {
                    let comment_start = i;
                    i += start.chars().count();

                    if let Some((end_start, end_len)) = self.find_block_comment_end(lang, &chars, i) {
                        // Opens and closes on the same line.
                        tokens.push(Token {
                            token_type: TokenType::Comment,
                            start: comment_start,
                            end: end_start + end_len,
                        });
                        i = end_start + end_len;
                    } else {
                        // Multiline comment continues
                        tokens.push(Token {
                            token_type: TokenType::Comment,
                            start: comment_start,
                            end: chars.len(),
                        });
                        state.in_block_comment = true;
                        break;
                    }
                    continue;
                }
            }

            // Check for strings
            if let Some((token, new_i, multiline_delim)) = self.try_parse_string(lang, &chars, i) {
                tokens.push(token);
                i = new_i;
                if let Some(delim) = multiline_delim {
                    // Triple-quoted string spills onto the next line.
                    state.in_multiline_string = Some(delim);
                    break;
                }
                continue;
            }

            // Check for numbers
            if let Some((token, new_i)) = self.try_parse_number(&chars, i) {
                tokens.push(token);
                i = new_i;
                continue;
            }

            // Check for preprocessor directives: '#' as the first
            // non-whitespace character claims the rest of the line.
            if lang.has_preprocessor && chars[i] == '#' && self.is_line_start(&chars, i) {
                tokens.push(Token {
                    token_type: TokenType::Preprocessor,
                    start: i,
                    end: chars.len(),
                });
                break;
            }

            // Check for attributes (Rust #[], Python @)
            if let Some((token, new_i)) = self.try_parse_attribute(lang, &chars, i) {
                tokens.push(token);
                i = new_i;
                continue;
            }

            // Check for identifiers (keywords, types, functions)
            if chars[i].is_alphabetic() || chars[i] == '_' {
                let start = i;
                while i < chars.len() && (chars[i].is_alphanumeric() || chars[i] == '_') {
                    i += 1;
                }
                let word: String = chars[start..i].iter().collect();

                // Classify: keyword table first, then type table, then a
                // trailing '(' marks a function; anything else stays plain.
                let token_type = if lang.keywords.contains(&word.as_str()) {
                    TokenType::Keyword
                } else if lang.types.contains(&word.as_str()) {
                    TokenType::Type
                } else if i < chars.len() && chars[i] == '(' {
                    TokenType::Function
                } else {
                    TokenType::Plain
                };

                // Plain identifiers produce no token (rendered unstyled).
                if token_type != TokenType::Plain {
                    tokens.push(Token {
                        token_type,
                        start,
                        end: i,
                    });
                }
                continue;
            }

            // Check for operators
            if let Some((token, new_i)) = self.try_parse_operator(lang, &chars, i) {
                tokens.push(token);
                i = new_i;
                continue;
            }

            // Check for punctuation
            if lang.punctuation.contains(&chars[i]) {
                tokens.push(Token {
                    token_type: TokenType::Punctuation,
                    start: i,
                    end: i + 1,
                });
                i += 1;
                continue;
            }

            // Skip unknown character
            i += 1;
        }

        tokens
    }
365
366 fn matches_at(&self, chars: &[char], pos: usize, pattern: &str) -> bool {
367 let pattern_chars: Vec<char> = pattern.chars().collect();
368 if pos + pattern_chars.len() > chars.len() {
369 return false;
370 }
371 for (i, &pc) in pattern_chars.iter().enumerate() {
372 if chars[pos + i] != pc {
373 return false;
374 }
375 }
376 true
377 }
378
379 fn find_block_comment_end(&self, lang: &LanguageDef, chars: &[char], start: usize) -> Option<(usize, usize)> {
380 let end_pattern = lang.block_comment_end.as_ref()?;
381 let end_chars: Vec<char> = end_pattern.chars().collect();
382
383 for i in start..chars.len() {
384 if self.matches_at(chars, i, end_pattern) {
385 return Some((i, end_chars.len()));
386 }
387 }
388 None
389 }
390
    /// Try to parse a string literal starting at `start`.
    ///
    /// Returns `(token, index_past_string, multiline_delimiter)`:
    /// the third element is `Some(delim)` only when a triple-quoted string
    /// opened here and did not close on this line, in which case the caller
    /// must record `delim` in the highlight state. Returns `None` when
    /// `chars[start]` is not one of the language's string delimiters.
    /// Backslash escapes the following character in all scan loops below.
    fn try_parse_string(&self, lang: &LanguageDef, chars: &[char], start: usize) -> Option<(Token, usize, Option<String>)> {
        let c = chars[start];

        // Check for string delimiters
        if !lang.string_delimiters.contains(&c) {
            return None;
        }

        // Check for triple-quoted strings (Python, etc.)
        if lang.multiline_strings {
            // The delimiter is the opening quote character repeated 3 times.
            let triple: String = std::iter::repeat(c).take(3).collect();
            if self.matches_at(chars, start, &triple) {
                let delim_len = 3;
                let mut i = start + delim_len;

                while i < chars.len() {
                    if self.matches_at(chars, i, &triple) {
                        // Closed on the same line.
                        return Some((
                            Token {
                                token_type: TokenType::String,
                                start,
                                end: i + delim_len,
                            },
                            i + delim_len,
                            None,
                        ));
                    }
                    // Skip escaped characters so `\"""` doesn't close early.
                    if chars[i] == '\\' && i + 1 < chars.len() {
                        i += 2;
                    } else {
                        i += 1;
                    }
                }

                // String continues on next line
                return Some((
                    Token {
                        token_type: TokenType::String,
                        start,
                        end: chars.len(),
                    },
                    chars.len(),
                    Some(triple),
                ));
            }
        }

        // Regular string
        let mut i = start + 1;
        while i < chars.len() {
            if chars[i] == c {
                // Found the matching closing quote.
                return Some((
                    Token {
                        token_type: TokenType::String,
                        start,
                        end: i + 1,
                    },
                    i + 1,
                    None,
                ));
            }
            // Skip escaped characters (e.g. \" inside a "..." string).
            if chars[i] == '\\' && i + 1 < chars.len() {
                i += 2;
            } else {
                i += 1;
            }
        }

        // Unterminated string - highlight to end of line
        // (single-line strings do NOT set the multiline state).
        Some((
            Token {
                token_type: TokenType::String,
                start,
                end: chars.len(),
            },
            chars.len(),
            None,
        ))
    }
470
471 fn find_string_end(&self, chars: &[char], start: usize, delim: &str) -> Option<usize> {
472 let mut i = start;
473 while i < chars.len() {
474 if self.matches_at(chars, i, delim) {
475 return Some(i + delim.chars().count());
476 }
477 if chars[i] == '\\' && i + 1 < chars.len() {
478 i += 2;
479 } else {
480 i += 1;
481 }
482 }
483 None
484 }
485
486 fn try_parse_number(&self, chars: &[char], start: usize) -> Option<(Token, usize)> {
487 let c = chars[start];
488
489 // Must start with digit, or . followed by digit
490 if !c.is_ascii_digit() {
491 if c == '.' && start + 1 < chars.len() && chars[start + 1].is_ascii_digit() {
492 // .5 style float
493 } else {
494 return None;
495 }
496 }
497
498 let mut i = start;
499 let mut has_dot = c == '.';
500 let mut has_exp = false;
501
502 // Handle hex, octal, binary
503 if c == '0' && i + 1 < chars.len() {
504 match chars[i + 1] {
505 'x' | 'X' => {
506 i += 2;
507 while i < chars.len() && (chars[i].is_ascii_hexdigit() || chars[i] == '_') {
508 i += 1;
509 }
510 return Some((Token { token_type: TokenType::Number, start, end: i }, i));
511 }
512 'o' | 'O' => {
513 i += 2;
514 while i < chars.len() && (chars[i].is_digit(8) || chars[i] == '_') {
515 i += 1;
516 }
517 return Some((Token { token_type: TokenType::Number, start, end: i }, i));
518 }
519 'b' | 'B' => {
520 i += 2;
521 while i < chars.len() && (chars[i] == '0' || chars[i] == '1' || chars[i] == '_') {
522 i += 1;
523 }
524 return Some((Token { token_type: TokenType::Number, start, end: i }, i));
525 }
526 _ => {}
527 }
528 }
529
530 // Decimal number (possibly float)
531 while i < chars.len() {
532 let ch = chars[i];
533 if ch.is_ascii_digit() || ch == '_' {
534 i += 1;
535 } else if ch == '.' && !has_dot && !has_exp {
536 // Check it's not a method call like 5.to_string()
537 if i + 1 < chars.len() && chars[i + 1].is_ascii_digit() {
538 has_dot = true;
539 i += 1;
540 } else if i + 1 >= chars.len() {
541 has_dot = true;
542 i += 1;
543 } else {
544 break;
545 }
546 } else if (ch == 'e' || ch == 'E') && !has_exp {
547 has_exp = true;
548 i += 1;
549 if i < chars.len() && (chars[i] == '+' || chars[i] == '-') {
550 i += 1;
551 }
552 } else {
553 break;
554 }
555 }
556
557 // Handle type suffixes (f32, i64, etc.)
558 if i < chars.len() && chars[i].is_alphabetic() {
559 let suffix_start = i;
560 while i < chars.len() && (chars[i].is_alphanumeric() || chars[i] == '_') {
561 i += 1;
562 }
563 // Common numeric suffixes
564 let suffix: String = chars[suffix_start..i].iter().collect();
565 let valid_suffixes = ["f32", "f64", "i8", "i16", "i32", "i64", "i128", "isize",
566 "u8", "u16", "u32", "u64", "u128", "usize", "f", "d", "l", "L"];
567 if !valid_suffixes.contains(&suffix.as_str()) {
568 i = suffix_start; // Not a valid suffix, rollback
569 }
570 }
571
572 if i > start {
573 Some((Token { token_type: TokenType::Number, start, end: i }, i))
574 } else {
575 None
576 }
577 }
578
579 fn try_parse_operator(&self, lang: &LanguageDef, chars: &[char], start: usize) -> Option<(Token, usize)> {
580 // Try longer operators first
581 for &op in &lang.operators {
582 if self.matches_at(chars, start, op) {
583 let len = op.chars().count();
584 return Some((
585 Token {
586 token_type: TokenType::Operator,
587 start,
588 end: start + len,
589 },
590 start + len,
591 ));
592 }
593 }
594 None
595 }
596
597 fn try_parse_attribute(&self, lang: &LanguageDef, chars: &[char], start: usize) -> Option<(Token, usize)> {
598 // Rust attributes: #[...] or #![...]
599 if lang.name == "Rust" && chars[start] == '#' {
600 let mut i = start + 1;
601 if i < chars.len() && chars[i] == '!' {
602 i += 1;
603 }
604 if i < chars.len() && chars[i] == '[' {
605 let attr_start = start;
606 let mut bracket_depth = 1;
607 i += 1;
608 while i < chars.len() && bracket_depth > 0 {
609 match chars[i] {
610 '[' => bracket_depth += 1,
611 ']' => bracket_depth -= 1,
612 _ => {}
613 }
614 i += 1;
615 }
616 return Some((
617 Token {
618 token_type: TokenType::Attribute,
619 start: attr_start,
620 end: i,
621 },
622 i,
623 ));
624 }
625 }
626
627 // Python decorators: @name
628 if lang.name == "Python" && chars[start] == '@' {
629 let mut i = start + 1;
630 while i < chars.len() && (chars[i].is_alphanumeric() || chars[i] == '_' || chars[i] == '.') {
631 i += 1;
632 }
633 if i > start + 1 {
634 return Some((
635 Token {
636 token_type: TokenType::Attribute,
637 start,
638 end: i,
639 },
640 i,
641 ));
642 }
643 }
644
645 None
646 }
647
648 fn is_line_start(&self, chars: &[char], pos: usize) -> bool {
649 for i in 0..pos {
650 if !chars[i].is_whitespace() {
651 return false;
652 }
653 }
654 true
655 }
656 }
657
#[cfg(test)]
mod tests {
    use super::*;

    /// `let` should come out as a keyword and `42` as a number.
    #[test]
    fn test_rust_keywords() {
        let mut hl = Highlighter::new();
        hl.set_language(Language::Rust);
        let mut state = HighlightState::default();

        let tokens = hl.tokenize_line("let x = 42;", &mut state);
        assert!(tokens.iter().any(|t| t.token_type == TokenType::Keyword)); // let
        assert!(tokens.iter().any(|t| t.token_type == TokenType::Number)); // 42
    }

    /// A quoted literal should produce a String token.
    #[test]
    fn test_string_parsing() {
        let mut hl = Highlighter::new();
        hl.set_language(Language::Rust);
        let mut state = HighlightState::default();

        let tokens = hl.tokenize_line(r#"let s = "hello";"#, &mut state);
        assert!(tokens.iter().any(|t| t.token_type == TokenType::String));
    }

    /// A line comment swallows the entire line as a single Comment token.
    #[test]
    fn test_comment_parsing() {
        let mut hl = Highlighter::new();
        hl.set_language(Language::Rust);
        let mut state = HighlightState::default();

        let tokens = hl.tokenize_line("// this is a comment", &mut state);
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].token_type, TokenType::Comment);
    }
}
694