Rust · 17529 bytes Raw Blame History
1 //! Cheap metadata extraction for the TOC sidebar.
2 //!
3 //! A session file can be 50+ MB for long agent runs. We never parse the
4 //! whole thing for summaries. Instead:
5 //!
//! 1. Stream the file once, line by line, classifying each line with
//!    cheap prefix/substring checks (no JSON parse) for an approximate
//!    `messageCount` and prompt count.
//! 2. During that same pass, parse at most `HEAD_SCAN_LINES` lines from
//!    the start, shallowly, to extract `startedAt`, `cwd`, `gitBranch`,
//!    `version`, `slug`, `customTitle`, the first assistant's `model`,
//!    and a fallback title from the first non-meta user message.
//! 3. Reverse-scan the last `TAIL_SCAN_BYTES` bytes for the final
//!    complete JSON object carrying a `timestamp` field, used as
//!    `lastActivityAt`. Falls back to the last timestamp seen in the
//!    head, then to the file's mtime.
15
16 use std::fs::File;
17 use std::io::{BufRead, BufReader, Read, Seek, SeekFrom};
18 use std::path::Path;
19
20 use chrono::{DateTime, Utc};
21 use serde_json::Value;
22
23 use crate::core::error::CoreResult;
24 use crate::core::reader::{is_timeline_event, is_user_prompt, NON_TIMELINE_KINDS};
25 use crate::core::schema::{RawEvent, SessionSummary};
26 use crate::core::title::sanitize_title;
27
/// How many lines from the top of the file we're willing to parse for
/// metadata before giving up. Empirically the first ~20 lines of a real
/// Claude Code session contain customTitle / cwd / first user message.
///
/// The limit is compared against a line's position in the file, so blank
/// and unparseable lines count toward it as well.
const HEAD_SCAN_LINES: usize = 100;
32
/// How many bytes from the end of the file to tail-scan for the
/// latest timestamp (64 KiB). Files shorter than this are read in full.
const TAIL_SCAN_BYTES: u64 = 64 * 1024;
36
37 pub fn summarize(session_path: &Path, project_id: &str) -> CoreResult<SessionSummary> {
38 let id = session_path
39 .file_stem()
40 .and_then(|s| s.to_str())
41 .map(str::to_owned)
42 .unwrap_or_default();
43
44 let file = File::open(session_path)?;
45 let file_meta = file.metadata()?;
46
47 // Single-pass scan:
48 // * For each line, cheaply classify it as timeline / prompt
49 // via prefix + substring checks so that counting 171 MB of
50 // JSONL doesn't cost a full serde parse per line.
51 // * For the first HEAD_SCAN_LINES (while head info is still
52 // unsaturated) also do a full RawEvent parse to pick up
53 // title / cwd / model / version metadata.
54 let mut head = HeadInfo::default();
55 let mut user_title_fallback: Option<String> = None;
56 let mut message_count: u32 = 0;
57 let mut prompt_count: u32 = 0;
58
59 {
60 let reader = BufReader::new(&file);
61 for (i, line) in reader.lines().enumerate() {
62 let line = match line {
63 Ok(l) => l,
64 Err(_) => continue, // tolerate partial last line
65 };
66 if line.is_empty() {
67 continue;
68 }
69
70 // Head path: full parse, feeds both counters (via the
71 // shared helpers) and head metadata.
72 let head_parse = i < HEAD_SCAN_LINES && !head.is_saturated();
73 if head_parse {
74 if let Ok(ev) = serde_json::from_str::<RawEvent>(&line) {
75 head.absorb(&ev);
76 if user_title_fallback.is_none() {
77 if let Some(t) = extract_user_title(&ev) {
78 user_title_fallback = Some(t);
79 }
80 }
81 if is_timeline_event(&ev) {
82 message_count += 1;
83 }
84 if is_user_prompt(&ev) {
85 prompt_count += 1;
86 }
87 continue;
88 }
89 // Fall through to fast path if JSON parse fails.
90 }
91
92 // Fast path: skip full parse. Claude Code's jsonl
93 // serializer emits `type` first and field order is
94 // stable, so these byte-level checks match what the
95 // full helpers would return without allocating.
96 if is_sidechain_line(&line) {
97 continue;
98 }
99 let kind = parse_kind_prefix(&line);
100 match kind {
101 Some(k) if NON_TIMELINE_KINDS.contains(&k) => continue,
102 Some(k) => {
103 message_count += 1;
104 if k == "user" && !has_tool_result(&line) {
105 prompt_count += 1;
106 }
107 }
108 None => {
109 // Unrecognized shape — count as timeline so we
110 // don't silently lose events on schema drift.
111 message_count += 1;
112 }
113 }
114 }
115 }
116
117 // Tail scan for lastActivityAt.
118 let tail_last_ts = scan_tail_for_last_timestamp(session_path)?;
119
120 let last_activity_at = tail_last_ts
121 .or(head.last_seen_timestamp)
122 .or_else(|| {
123 file_meta
124 .modified()
125 .ok()
126 .map(|st| DateTime::<Utc>::from(st))
127 });
128
129 let title = head
130 .custom_title
131 .clone()
132 .or_else(|| head.ai_title.clone())
133 .or(user_title_fallback)
134 .or(head.slug.clone())
135 .unwrap_or_else(|| "(untitled)".to_string());
136
137 Ok(SessionSummary {
138 id,
139 project_id: project_id.to_string(),
140 title,
141 started_at: head.started_at,
142 last_activity_at,
143 model: head.model,
144 message_count,
145 prompt_count,
146 git_branch: head.git_branch,
147 version: head.version,
148 slug: head.slug,
149 cwd: head.cwd,
150 custom_title: head.custom_title,
151 entrypoint: head.entrypoint,
152 source: crate::core::schema::SessionSource::Disk,
153 })
154 }
155
/// Return the value of the leading `type` field of a JSONL line.
///
/// Only lines that begin with exactly `{"type":"` are recognized; the
/// result is everything between that prefix and the next `"`. Any other
/// shape (different first key, missing closing quote, empty line) yields
/// `None`. The returned slice borrows from `line`; nothing is allocated.
fn parse_kind_prefix(line: &str) -> Option<&str> {
    const TYPE_PREFIX: &str = "{\"type\":\"";
    line.strip_prefix(TYPE_PREFIX)
        .and_then(|after_prefix| after_prefix.split_once('"'))
        .map(|(kind, _remainder)| kind)
}
164
/// Report whether the raw line contains the exact text
/// `"isSidechain":true`.
///
/// This is a plain substring test, so it does not depend on where the
/// key sits inside the object. It only matches the compact form with no
/// whitespace after the colon.
fn is_sidechain_line(line: &str) -> bool {
    const SIDECHAIN_MARKER: &str = "\"isSidechain\":true";
    line.contains(SIDECHAIN_MARKER)
}
171
/// Report whether the raw line contains the exact text
/// `"type":"tool_result"` anywhere.
///
/// The caller uses this on `user` lines to tell a tool return apart from
/// a human prompt. Only the compact form (no whitespace after the colon)
/// matches.
fn has_tool_result(line: &str) -> bool {
    const TOOL_RESULT_MARKER: &str = "\"type\":\"tool_result\"";
    line.contains(TOOL_RESULT_MARKER)
}
177
/// Scratchpad for everything we're learning from the file's head.
///
/// Filled incrementally by `HeadInfo::absorb`, one parsed event at a
/// time. Every field except `last_seen_timestamp` keeps the first value
/// observed and ignores later ones.
#[derive(Debug, Default)]
struct HeadInfo {
    /// Timestamp of the first absorbed event that carried one.
    started_at: Option<DateTime<Utc>>,
    /// Timestamp of the most recently absorbed event that carried one;
    /// overwritten on every such event.
    last_seen_timestamp: Option<DateTime<Utc>>,
    /// First `cwd` value seen on an absorbed event.
    cwd: Option<String>,
    /// First `git_branch` value seen on an absorbed event.
    git_branch: Option<String>,
    /// First `version` value seen on an absorbed event.
    version: Option<String>,
    /// First `slug` value seen on an absorbed event.
    slug: Option<String>,
    /// First `custom_title` value seen on an absorbed event.
    custom_title: Option<String>,
    /// First `ai_title` value seen on an absorbed event.
    ai_title: Option<String>,
    /// `model` string taken from the message of the first `assistant`
    /// event that has one.
    model: Option<String>,
    /// First `entrypoint` value seen on an absorbed event.
    entrypoint: Option<String>,
}
192
193 impl HeadInfo {
194 /// Have we collected every field the summary needs? If so the head
195 /// loop can short-circuit further parsing.
196 fn is_saturated(&self) -> bool {
197 self.started_at.is_some()
198 && self.cwd.is_some()
199 && self.git_branch.is_some()
200 && self.version.is_some()
201 && self.slug.is_some()
202 && (self.custom_title.is_some() || self.ai_title.is_some())
203 && self.model.is_some()
204 && self.entrypoint.is_some()
205 }
206
207 fn absorb(&mut self, ev: &RawEvent) {
208 if let Some(ts) = ev.timestamp {
209 if self.started_at.is_none() {
210 self.started_at = Some(ts);
211 }
212 self.last_seen_timestamp = Some(ts);
213 }
214 if self.cwd.is_none() {
215 if let Some(c) = &ev.cwd {
216 self.cwd = Some(c.clone());
217 }
218 }
219 if self.git_branch.is_none() {
220 if let Some(b) = &ev.git_branch {
221 self.git_branch = Some(b.clone());
222 }
223 }
224 if self.version.is_none() {
225 if let Some(v) = &ev.version {
226 self.version = Some(v.clone());
227 }
228 }
229 if self.slug.is_none() {
230 if let Some(s) = &ev.slug {
231 self.slug = Some(s.clone());
232 }
233 }
234 if self.custom_title.is_none() {
235 if let Some(t) = &ev.custom_title {
236 self.custom_title = Some(t.clone());
237 }
238 }
239 if self.ai_title.is_none() {
240 if let Some(t) = &ev.ai_title {
241 self.ai_title = Some(t.clone());
242 }
243 }
244 if self.entrypoint.is_none() {
245 if let Some(e) = &ev.entrypoint {
246 self.entrypoint = Some(e.clone());
247 }
248 }
249 if self.model.is_none() && ev.kind == "assistant" {
250 if let Some(msg) = &ev.message {
251 if let Some(m) = msg.get("model").and_then(Value::as_str) {
252 self.model = Some(m.to_string());
253 }
254 }
255 }
256 }
257 }
258
259 /// Pull a short human-readable title from the first non-meta user event,
260 /// running it through the [`sanitize_title`] pipeline to strip IDE
261 /// wrappers, code blocks, and other noise. Returns `None` if the event
262 /// is not a user message or nothing usable survives sanitization.
263 fn extract_user_title(ev: &RawEvent) -> Option<String> {
264 if ev.kind != "user" {
265 return None;
266 }
267 if ev.is_meta.unwrap_or(false) {
268 return None;
269 }
270 let msg = ev.message.as_ref()?;
271 let text = match msg.get("content") {
272 Some(Value::String(s)) => s.clone(),
273 Some(Value::Array(blocks)) => blocks
274 .iter()
275 .filter_map(|b| b.get("text").and_then(Value::as_str))
276 .collect::<Vec<_>>()
277 .join(" "),
278 _ => return None,
279 };
280 sanitize_title(&text)
281 }
282
283 /// Read the last `TAIL_SCAN_BYTES` of the file, find the last complete
284 /// `{...}\n` line, parse it, and return its `timestamp` if any.
285 fn scan_tail_for_last_timestamp(path: &Path) -> CoreResult<Option<DateTime<Utc>>> {
286 let mut file = File::open(path)?;
287 let len = file.metadata()?.len();
288 if len == 0 {
289 return Ok(None);
290 }
291 let read_len = len.min(TAIL_SCAN_BYTES);
292 let start = len - read_len;
293 file.seek(SeekFrom::Start(start))?;
294
295 let mut buf = vec![0u8; read_len as usize];
296 file.read_exact(&mut buf)?;
297
298 // Walk backward through complete lines (newline-terminated or EOF-terminated).
299 for line in buf.rsplit(|b| *b == b'\n') {
300 if line.is_empty() {
301 continue;
302 }
303 let text = match std::str::from_utf8(line) {
304 Ok(s) => s,
305 Err(_) => continue,
306 };
307 if let Ok(ev) = serde_json::from_str::<RawEvent>(text) {
308 if let Some(ts) = ev.timestamp {
309 return Ok(Some(ts));
310 }
311 }
312 }
313 Ok(None)
314 }
315
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::tempdir;

    /// Project id handed to `summarize` by every test.
    const PROJECT_ID: &str = "-Users-me-repo";

    /// A small but realistic session head: one metadata line, one
    /// attachment, one user prompt, and two assistant replies.
    const FIXTURE_LINES: &[&str] = &[
        r#"{"type":"permission-mode","permissionMode":"default","sessionId":"abc"}"#,
        r#"{"type":"attachment","uuid":"u1","timestamp":"2026-04-11T00:55:32.249Z","cwd":"/Users/me/repo","sessionId":"abc","version":"2.1.101","gitBranch":"main","attachment":{"type":"hook_success"}}"#,
        r#"{"type":"user","uuid":"u2","timestamp":"2026-04-11T00:55:35.000Z","cwd":"/Users/me/repo","sessionId":"abc","version":"2.1.101","gitBranch":"main","message":{"role":"user","content":"plan the thread browser feature"}}"#,
        r#"{"type":"assistant","uuid":"u3","timestamp":"2026-04-11T00:55:40.000Z","cwd":"/Users/me/repo","sessionId":"abc","version":"2.1.101","gitBranch":"main","message":{"model":"claude-opus-4-6","content":[{"type":"text","text":"Let's plan."}]}}"#,
        r#"{"type":"assistant","uuid":"u4","timestamp":"2026-04-11T01:05:00.000Z","cwd":"/Users/me/repo","sessionId":"abc","version":"2.1.101","gitBranch":"main","message":{"model":"claude-opus-4-6","content":[{"type":"text","text":"Done."}]}}"#,
    ];

    /// Write each entry of `lines` to `path`, newline-terminated.
    fn write_fixture(path: &Path, lines: &[&str]) {
        let mut out = File::create(path).unwrap();
        for entry in lines {
            out.write_all(entry.as_bytes()).unwrap();
            out.write_all(b"\n").unwrap();
        }
    }

    /// Write `lines` to a fresh `abc.jsonl` in a temp dir and summarize it.
    fn summarize_lines(lines: &[&str]) -> SessionSummary {
        let dir = tempdir().unwrap();
        let session_file = dir.path().join("abc.jsonl");
        write_fixture(&session_file, lines);
        summarize(&session_file, PROJECT_ID).unwrap()
    }

    #[test]
    fn extracts_summary_from_realistic_head() {
        let summary = summarize_lines(FIXTURE_LINES);

        assert_eq!(summary.id, "abc");
        assert_eq!(summary.project_id, "-Users-me-repo");
        assert_eq!(summary.title, "plan the thread browser feature");
        assert_eq!(summary.model.as_deref(), Some("claude-opus-4-6"));
        assert_eq!(summary.git_branch.as_deref(), Some("main"));
        assert_eq!(summary.version.as_deref(), Some("2.1.101"));
        // permission-mode is metadata → not counted in message_count.
        // attachment + user + 2 assistants = 4 timeline events.
        assert_eq!(summary.message_count, 4);
        assert_eq!(summary.prompt_count, 1);
        assert!(summary.started_at.is_some());
        assert!(summary.last_activity_at.is_some());
        assert_ne!(summary.started_at, summary.last_activity_at);
    }

    #[test]
    fn survives_truncated_last_line() {
        let dir = tempdir().unwrap();
        let session_file = dir.path().join("abc.jsonl");

        // Good lines first, then a partial JSON line with no trailing newline.
        let mut contents = FIXTURE_LINES.join("\n");
        contents.push('\n');
        contents.push_str(r#"{"type":"assistant","timestamp":"2026-04-11T"#);
        std::fs::write(&session_file, contents).unwrap();

        let summary = summarize(&session_file, PROJECT_ID).unwrap();
        assert_eq!(summary.title, "plan the thread browser feature");
        // Should still have a valid last activity from the preceding good line.
        assert!(summary.last_activity_at.is_some());
    }

    #[test]
    fn ignores_meta_user_events_for_title() {
        let summary = summarize_lines(&[
            r#"{"type":"permission-mode","permissionMode":"default","sessionId":"abc"}"#,
            r#"{"type":"user","uuid":"u1","timestamp":"2026-04-11T00:55:00.000Z","cwd":"/Users/me/repo","sessionId":"abc","version":"2.1.101","gitBranch":"main","isMeta":true,"message":{"role":"user","content":"<local-command-caveat>noise</local-command-caveat>"}}"#,
            r#"{"type":"user","uuid":"u2","timestamp":"2026-04-11T00:55:30.000Z","cwd":"/Users/me/repo","sessionId":"abc","version":"2.1.101","gitBranch":"main","message":{"role":"user","content":"actual first question"}}"#,
        ]);

        assert_eq!(summary.title, "actual first question");
    }

    #[test]
    fn prefers_custom_title_over_user_message() {
        let summary = summarize_lines(&[
            r#"{"type":"custom-title","customTitle":"The Real Title","sessionId":"abc"}"#,
            r#"{"type":"user","uuid":"u1","timestamp":"2026-04-11T00:55:00.000Z","cwd":"/Users/me/repo","sessionId":"abc","version":"2.1.101","gitBranch":"main","message":{"role":"user","content":"something else"}}"#,
        ]);

        assert_eq!(summary.title, "The Real Title");
    }

    #[test]
    fn empty_file_returns_fallback_title() {
        let dir = tempdir().unwrap();
        let session_file = dir.path().join("empty.jsonl");
        File::create(&session_file).unwrap();

        let summary = summarize(&session_file, PROJECT_ID).unwrap();
        assert_eq!(summary.title, "(untitled)");
        assert_eq!(summary.message_count, 0);
        assert_eq!(summary.prompt_count, 0);
        // last_activity falls back to mtime.
        assert!(summary.last_activity_at.is_some());
    }

    #[test]
    fn prompt_count_excludes_tool_result_returns() {
        let summary = summarize_lines(&[
            // Real human prompt.
            r#"{"type":"user","uuid":"u1","timestamp":"2026-04-11T00:55:00.000Z","cwd":"/Users/me/repo","sessionId":"abc","message":{"role":"user","content":"run the tests"}}"#,
            // Assistant's tool_use.
            r#"{"type":"assistant","uuid":"u2","timestamp":"2026-04-11T00:55:10.000Z","cwd":"/Users/me/repo","sessionId":"abc","message":{"model":"claude-opus-4-6","content":[{"type":"tool_use","id":"t1","name":"Bash","input":{"command":"cargo test"}}]}}"#,
            // Tool_result posing as a user event — NOT a prompt.
            r#"{"type":"user","uuid":"u3","timestamp":"2026-04-11T00:55:20.000Z","cwd":"/Users/me/repo","sessionId":"abc","message":{"role":"user","content":[{"type":"tool_result","tool_use_id":"t1","content":"ok","is_error":false}]}}"#,
            // Another real human prompt.
            r#"{"type":"user","uuid":"u4","timestamp":"2026-04-11T00:55:30.000Z","cwd":"/Users/me/repo","sessionId":"abc","message":{"role":"user","content":"now commit"}}"#,
        ]);

        // All 4 events render in the viewer timeline.
        assert_eq!(summary.message_count, 4);
        // Only the two real human prompts count, not the tool_result.
        assert_eq!(summary.prompt_count, 2);
    }
}
450