Rust · 13657 bytes Raw Blame History
1 //! Local, dependency-free title sanitizer.
2 //!
3 //! Claude Code's first user message is almost never a clean title.
4 //! It's routinely wrapped in IDE context blocks, prefixed with slash
5 //! commands, carries pasted stack traces, or is an entirely-code
6 //! block with zero prose. This module turns a raw user message into
7 //! a concise title via a series of cleanup passes — no LLM call, no
8 //! regex crate, no external state.
9 //!
10 //! The pipeline:
11 //!
12 //! 1. **Strip well-known noise blocks** — XML-ish wrappers like
13 //! `<ide_opened_file>`, `<local-command-caveat>`,
14 //! `<system-reminder>`, `<task-notification>`,
15 //! `<observed_from_primary_session>`, and the `<command-*>` family
16 //! that Claude Code's slash-command plumbing emits.
17 //! 2. **Strip fenced code blocks** — triple-backtick `` ``` `` …
18 //! `` ``` ``. Everything between a fence open and its close is
19 //! dropped entirely.
20 //! 3. **Strip bare XML tags** — any remaining `<...>` fragments are
21 //! removed, keeping their inner text. This catches one-off tags
22 //! the allowlist didn't cover.
23 //! 4. **Strip inline backtick code** — the backticks are removed and
24 //! the content is kept (a code reference inside a sentence still
25 //! gives useful title information).
26 //! 5. **Collapse whitespace** and trim.
//! 6. **Handle slash commands and tiny titles** — a cleaned title
//!    that starts with `/` (e.g. `/init`) is returned as-is so the
//!    caller can decorate it; any other title shorter than four
//!    characters is rejected as not useful.
30 //! 7. **Pick a sentence** — take up to the first sentence-ending
31 //! punctuation within a generous window, then truncate at a word
32 //! boundary.
33 //!
34 //! If nothing survives the pipeline, returns `None` and the caller
35 //! falls through to the next title source in the resolution order.
36
/// Hard cap on title length, counted in `char`s rather than bytes.
/// Longer titles are truncated at a word boundary with an ellipsis.
/// The ellipsis is appended after the cut, so a truncated title can
/// be up to `MAX_TITLE_CHARS + 1` chars long.
const MAX_TITLE_CHARS: usize = 100;
40
/// When looking for a sentence boundary, `pick_first_sentence` copies
/// up to `SENTENCE_SCAN_WINDOW` chars past `MAX_TITLE_CHARS` into its
/// scan window. The intent is that very long opening sentences keep
/// their punctuation-based shortening instead of always hitting the
/// hard truncate.
///
/// NOTE(review): as written, a terminator is only accepted when it
/// lies at most `MAX_TITLE_CHARS` chars into the text, so this slack
/// never produces a longer sentence. Its visible effect is that the
/// no-terminator fallback can exceed the cap, which is then cut by
/// `truncate_at_word_boundary`.
const SENTENCE_SCAN_WINDOW: usize = 40;
46
/// Any block whose opening tag matches one of these names (without
/// the leading `<`) is removed entirely — both the tags and their
/// contents. The list is intentionally explicit so a wayward future
/// tag doesn't silently erase user text.
///
/// `sanitize_title` hands each entry to `strip_xml_block`, one at a
/// time, in the order listed here.
const NOISE_BLOCK_TAGS: &[&str] = &[
    "ide_opened_file",
    "local-command-caveat",
    "local-command-stdout",
    "local-command-stderr",
    "system-reminder",
    "task-notification",
    "observed_from_primary_session",
    "requested_at",
    "ai_commands",
    "file-contents",
    "file-content",
    "attachment",
    // Note: <command-name>, <command-args>, <command-message> are
    // intentionally *not* in this list — their content is usually the
    // most informative text in the whole message (e.g. `/init`), and
    // `strip_bare_tags` handles the tags themselves in pass 3.
];
69
70 pub fn sanitize_title(raw: &str) -> Option<String> {
71 let mut s = raw.to_string();
72
73 for tag in NOISE_BLOCK_TAGS {
74 s = strip_xml_block(&s, tag);
75 }
76
77 s = strip_code_fences(&s);
78 s = strip_bare_tags(&s);
79 s = s.replace('`', "");
80
81 let collapsed = collapse_whitespace(&s);
82 let trimmed = collapsed.trim();
83
84 if trimmed.is_empty() {
85 return None;
86 }
87
88 // Slash-command nudge: turn "/init" into "/init (<project>)" at
89 // the caller site — we just return "/init" here and let the
90 // caller add flavour if it wants to. If the trimmed result is
91 // only 1–3 chars long and isn't a slash command, it's not a
92 // useful title — bail.
93 if !trimmed.starts_with('/') && trimmed.chars().count() < 4 {
94 return None;
95 }
96
97 let picked = pick_first_sentence(trimmed);
98 let truncated = truncate_at_word_boundary(&picked);
99 Some(truncated)
100 }
101
/// Removes every `<tag ...>...</tag>` block from `input`, including
/// the tags themselves and everything between them.
///
/// * `input` — text to clean.
/// * `tag` — tag name without the angle brackets, e.g. `"attachment"`.
///
/// Matching rules:
/// * The tag name must be followed by `>`, whitespace or `/`, so
///   `<foo` does not match inside `<foobar>`.
/// * A self-closing tag (`<tag/>`, `<tag attr="x" />`) has no contents;
///   only the tag itself is removed and the text after it is kept.
/// * An opening tag with no matching `</tag>` is treated as
///   unrecoverable noise: everything from that tag to the end of the
///   input is dropped.
///
/// Returns the cleaned text; text outside matched blocks is copied
/// through unchanged.
fn strip_xml_block(input: &str, tag: &str) -> String {
    let open_prefix = format!("<{tag}");
    let close = format!("</{tag}>");
    let mut out = String::with_capacity(input.len());
    let mut rest = input;

    while let Some(open_idx) = rest.find(&open_prefix) {
        out.push_str(&rest[..open_idx]);
        let after_open = &rest[open_idx..];
        let after_name = &after_open[open_prefix.len()..];

        // The char right after `<tag` decides whether this is really
        // our tag or just a longer name sharing the same prefix.
        let is_real_tag = matches!(
            after_name.chars().next(),
            Some('>' | ' ' | '\t' | '\n' | '\r' | '/')
        );
        if !is_real_tag {
            // False match: emit the `<` and resume scanning after it.
            out.push('<');
            rest = &after_open[1..];
            continue;
        }

        // Self-closing form: the opening tag ends in `/>`, so there is
        // no closing tag to look for. Skip just the tag; otherwise the
        // search below would fail and discard the rest of the input.
        if let Some(tag_end) = after_name.find('>') {
            if after_name[..tag_end].ends_with('/') {
                rest = &after_name[tag_end + 1..];
                continue;
            }
        }

        match after_open.find(&close) {
            Some(close_idx) => rest = &after_open[close_idx + close.len()..],
            // Unclosed block — drop everything from here on.
            None => return out,
        }
    }

    out.push_str(rest);
    out
}
146
/// Drops triple-backtick fenced code blocks from `input`.
///
/// The input is processed line by line. Any line whose first
/// non-whitespace characters are three backticks is a fence marker: it
/// toggles the "inside a fence" state and is itself discarded. Lines
/// inside a fence are discarded; every other line is copied to the
/// output followed by `\n`. A fence that is never closed therefore
/// removes everything after it.
fn strip_code_fences(input: &str) -> String {
    let mut kept = String::with_capacity(input.len());
    let mut inside_fence = false;

    for line in input.lines() {
        let is_fence_marker = line.trim_start().starts_with("```");
        match (is_fence_marker, inside_fence) {
            // Opening or closing marker: flip state, emit nothing.
            (true, _) => inside_fence = !inside_fence,
            // Code inside a fence is dropped.
            (false, true) => {}
            // Ordinary prose line.
            (false, false) => {
                kept.push_str(line);
                kept.push('\n');
            }
        }
    }

    kept
}
162
/// Removes `<...>` tag fragments from `input` while keeping the text
/// between them.
///
/// A `<` only starts a tag when it is immediately followed by an ASCII
/// letter, `/`, `!` or `?` *and* a closing `>` exists later in the
/// input. Any other `<` — for example the comparison in `5 < 10`, or a
/// tag-like fragment that is never closed — is kept as literal text,
/// so ordinary prose containing `<` is no longer swallowed.
///
/// Returns the input with all recognised tags removed.
fn strip_bare_tags(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    let mut rest = input;

    while let Some(lt_idx) = rest.find('<') {
        out.push_str(&rest[..lt_idx]);
        let after_lt = &rest[lt_idx + 1..];

        let looks_like_tag = matches!(
            after_lt.chars().next(),
            Some(c) if c.is_ascii_alphabetic() || matches!(c, '/' | '!' | '?')
        );
        if !looks_like_tag {
            // Literal `<` in prose: keep it and carry on after it.
            out.push('<');
            rest = after_lt;
            continue;
        }

        match after_lt.find('>') {
            // Complete tag: skip it, keep whatever follows.
            Some(gt_idx) => rest = &after_lt[gt_idx + 1..],
            // No `>` anywhere ahead, so nothing later can be a tag
            // either; the remainder (including this `<`) is plain text.
            None => {
                rest = &rest[lt_idx..];
                break;
            }
        }
    }

    out.push_str(rest);
    out
}
176
/// Replaces every run of whitespace in `input` with a single space.
///
/// Leading whitespace is dropped entirely. A trailing run still
/// collapses to one space, so the result may end with a space.
fn collapse_whitespace(input: &str) -> String {
    let mut collapsed = String::with_capacity(input.len());
    // Starts as `true` so whitespace at the very beginning emits nothing.
    let mut prev_was_whitespace = true;

    for ch in input.chars() {
        let is_ws = ch.is_whitespace();
        if !is_ws {
            collapsed.push(ch);
        } else if !prev_was_whitespace {
            // First whitespace char of a run stands in for the whole run.
            collapsed.push(' ');
        }
        prev_was_whitespace = is_ws;
    }

    collapsed
}
193
194 /// Scan the input for a sentence boundary (`. `, `? `, `! `) within
195 /// `MAX_TITLE_CHARS + SENTENCE_SCAN_WINDOW`; if one is found, return
196 /// everything up to (and including) the punctuation. Otherwise
197 /// return the whole haystack up to the hard cap.
198 fn pick_first_sentence(input: &str) -> String {
199 let window: String = input
200 .chars()
201 .take(MAX_TITLE_CHARS + SENTENCE_SCAN_WINDOW)
202 .collect();
203
204 // Find the earliest sentence terminator (any of `.`, `?`, `!`
205 // followed by whitespace) within the char cap.
206 let mut earliest: Option<usize> = None;
207 for terminator in [". ", "? ", "! "] {
208 if let Some(idx) = window.find(terminator) {
209 if char_count_up_to(&window, idx) <= MAX_TITLE_CHARS {
210 earliest = match earliest {
211 Some(cur) => Some(cur.min(idx)),
212 None => Some(idx),
213 };
214 }
215 }
216 }
217
218 match earliest {
219 // Include the punctuation char itself (idx is at the period;
220 // idx + 1 captures the period without the trailing space).
221 Some(idx) => window[..=idx].to_string(),
222 None => window,
223 }
224 }
225
/// Counts the `char`s in `s` that precede byte offset `byte_idx`.
///
/// # Panics
///
/// Panics if `byte_idx` is past the end of `s` or does not fall on a
/// UTF-8 char boundary.
fn char_count_up_to(s: &str, byte_idx: usize) -> usize {
    let (head, _) = s.split_at(byte_idx);
    head.chars().count()
}
229
230 fn truncate_at_word_boundary(input: &str) -> String {
231 let char_count = input.chars().count();
232 if char_count <= MAX_TITLE_CHARS {
233 return input.to_string();
234 }
235
236 let mut out: String = input.chars().take(MAX_TITLE_CHARS).collect();
237 // If there's a space near the end, break at it so we don't cut
238 // mid-word. Tolerate up to 20 chars of trailing slack.
239 if let Some(space_idx) = out.rfind(' ') {
240 let trimmed_len = out.chars().count() - out[..space_idx].chars().count();
241 if trimmed_len < 20 {
242 out.truncate(space_idx);
243 }
244 }
245 out.push('…');
246 out
247 }
248
/// End-to-end tests for `sanitize_title`: each case feeds one raw
/// message through the whole pipeline and checks the resulting title.
#[cfg(test)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    #[test]
    fn passthrough_plain_sentence() {
        let t = sanitize_title("Let's plan the thread browser feature.").unwrap();
        assert_eq!(t, "Let's plan the thread browser feature.");
    }

    #[test]
    fn strips_ide_opened_file_wrapper() {
        let raw =
            "<ide_opened_file>/Users/me/proj/foo.rs</ide_opened_file>\nFix the parser bug.";
        let t = sanitize_title(raw).unwrap();
        assert_eq!(t, "Fix the parser bug.");
    }

    #[test]
    fn strips_local_command_caveat() {
        let raw = "<local-command-caveat>Caveat: the messages below were generated by the user while running local commands. DO NOT respond.</local-command-caveat>\n<command-name>/init</command-name>";
        let t = sanitize_title(raw).unwrap();
        // Nothing left after stripping but the slash command.
        assert_eq!(t, "/init");
    }

    #[test]
    fn strips_fenced_code_block() {
        let raw = "Here's the error:\n```\nerror[E0308]: mismatched types\n --> src/foo.rs:42:5\n```\nCan you fix it?";
        let t = sanitize_title(raw).unwrap();
        assert_eq!(t, "Here's the error: Can you fix it?");
    }

    #[test]
    fn removes_inline_backtick_code_preserving_content() {
        let raw = "Refactor `foo()` to use the new `BarStream` API.";
        let t = sanitize_title(raw).unwrap();
        assert_eq!(t, "Refactor foo() to use the new BarStream API.");
    }

    #[test]
    fn strips_bare_tags_keeps_text() {
        let raw = "<some_tag>hello world</some_tag>";
        let t = sanitize_title(raw).unwrap();
        assert_eq!(t, "hello world");
    }

    #[test]
    fn collapses_whitespace() {
        let raw = "hello\n\n\n world \n";
        let t = sanitize_title(raw).unwrap();
        assert_eq!(t, "hello world");
    }

    #[test]
    fn picks_first_sentence_when_short() {
        let raw =
            "Plan the thread browser feature. Next we also need to wire the watcher. And finally add tests.";
        let t = sanitize_title(raw).unwrap();
        assert_eq!(t, "Plan the thread browser feature.");
    }

    #[test]
    fn truncates_long_single_sentence_at_word_boundary() {
        let raw = "This is a really really really really really really really really really really really really really really really really long single sentence with no punctuation inside it and it needs truncation";
        let t = sanitize_title(raw).unwrap();
        assert!(t.chars().count() <= MAX_TITLE_CHARS + 1);
        let before_ellipsis = t
            .strip_suffix('…')
            .expect("truncated title should end with an ellipsis");
        // The input is already clean, single-spaced prose, so the kept
        // text must be a prefix of it — and that prefix must stop right
        // before a space, i.e. the cut never lands inside a word.
        assert!(
            raw.starts_with(before_ellipsis),
            "expected a prefix of the input, got: {t:?}"
        );
        assert!(
            raw[before_ellipsis.len()..].starts_with(' '),
            "expected word-boundary truncation, got: {t:?}"
        );
    }

    #[test]
    fn returns_none_when_only_noise() {
        let raw = "<ide_opened_file>/tmp/empty</ide_opened_file>\n<system-reminder>hi</system-reminder>\n```\ncode only\n```";
        assert!(sanitize_title(raw).is_none());
    }

    #[test]
    fn returns_none_when_too_short() {
        assert!(sanitize_title("hi").is_none());
    }

    #[test]
    fn keeps_slash_commands_even_when_short() {
        let t = sanitize_title("/init").unwrap();
        assert_eq!(t, "/init");
    }

    #[test]
    fn observed_from_primary_session_block_removed() {
        let raw = "<observed_from_primary_session>\n <user_request>Let's continue with chained access.</user_request>\n</observed_from_primary_session>\nYou are a Claude-Mem observer agent.";
        let t = sanitize_title(raw).unwrap();
        assert!(!t.contains("observed_from_primary"));
        assert!(t.contains("Claude-Mem observer"));
    }

    #[test]
    fn handles_messy_real_first_message() {
        // Modeled on what we actually saw in claudex's first user
        // message: a mix of caveat, code block, and prose.
        let raw = "<local-command-caveat>Caveat: DO NOT respond.</local-command-caveat>\nI want to plan the next steps. Here's the current code:\n```rust\nfn main() {}\n```\nThoughts?";
        let t = sanitize_title(raw).unwrap();
        assert_eq!(t, "I want to plan the next steps.");
    }

    #[test]
    fn does_not_swallow_plain_text_containing_lt() {
        let raw = "Why does 5 < 10 evaluate weirdly here?";
        let t = sanitize_title(raw).unwrap();
        // A bare `<` in prose is not a noise block, so strip_xml_block
        // leaves it alone. How much of the text after the `<` survives
        // depends on how strip_bare_tags treats an unmatched `<`, so
        // this only pins what must always hold: no panic, and the
        // prose in front of the `<` is kept.
        assert!(t.starts_with("Why does 5"), "got: {t:?}");
    }
}
373