Stabilize qwen HTML TOC recovery
Authored by
mfwolffe <wolffemf@dukes.jmu.edu>
- SHA
723240e9f255c866ae1459361a835c31fe00f952
- Parents
3a703a1
- Tree
39f6098
src/loader/agent/parsing.py (modified) @@ -2,6 +2,7 @@ | ||
| 2 | 2 | |
| 3 | 3 | from ..runtime.parsing import ( # noqa: F401 |
| 4 | 4 | ParsedResponse, |
| 5 | + canonicalize_tool_name, | |
| 5 | 6 | format_tool_result, |
| 6 | 7 | parse_tool_calls, |
| 7 | 8 | ) |
src/loader/llm/ollama.py (modified) @@ -6,7 +6,7 @@ from typing import Any | ||
| 6 | 6 | |
| 7 | 7 | import httpx |
| 8 | 8 | |
| 9 | -from ..agent.parsing import parse_tool_calls | |
| 9 | +from ..agent.parsing import canonicalize_tool_name, parse_tool_calls | |
| 10 | 10 | from ..runtime.capabilities import CapabilityProfile, resolve_capability_profile |
| 11 | 11 | from .base import ( |
| 12 | 12 | CompletionResponse, |
@@ -372,6 +372,23 @@ class OllamaBackend(LLMBackend): | ||
| 372 | 372 | ) |
| 373 | 373 | return parsed.content, parsed.tool_calls |
| 374 | 374 | |
| 375 | + def _canonical_native_tool_name( | |
| 376 | + self, | |
| 377 | + raw_name: object, | |
| 378 | + *, | |
| 379 | + tools: list[dict[str, Any]] | None = None, | |
| 380 | + ) -> str: | |
| 381 | + """Normalize native Ollama tool-call names to Loader's canonical names.""" | |
| 382 | + | |
| 383 | + name = str(raw_name or "").strip() | |
| 384 | + if not name: | |
| 385 | + return "" | |
| 386 | + canonical_name = canonicalize_tool_name( | |
| 387 | + name, | |
| 388 | + allowed_tool_names=self._allowed_tool_names(tools), | |
| 389 | + ) | |
| 390 | + return canonical_name or name | |
| 391 | + | |
| 375 | 392 | async def complete( |
| 376 | 393 | self, |
| 377 | 394 | messages: list[Message], |
@@ -435,7 +452,10 @@ class OllamaBackend(LLMBackend): | ||
| 435 | 452 | args = {} |
| 436 | 453 | tool_calls.append(ToolCall( |
| 437 | 454 | id=tc.get("id", f"call_{i}"), |
| 438 | - name=func.get("name", ""), | |
| 455 | + name=self._canonical_native_tool_name( | |
| 456 | + func.get("name", ""), | |
| 457 | + tools=tools, | |
| 458 | + ), | |
| 439 | 459 | arguments=args, |
| 440 | 460 | )) |
| 441 | 461 | else: |
@@ -561,7 +581,10 @@ class OllamaBackend(LLMBackend): | ||
| 561 | 581 | args = {} |
| 562 | 582 | accumulated_tool_calls.append(ToolCall( |
| 563 | 583 | id=tc.get("id", f"call_{len(accumulated_tool_calls)}"), |
| 564 | - name=func.get("name", ""), | |
| 584 | + name=self._canonical_native_tool_name( | |
| 585 | + func.get("name", ""), | |
| 586 | + tools=tools, | |
| 587 | + ), | |
| 565 | 588 | arguments=args, |
| 566 | 589 | )) |
| 567 | 590 | continue |
@@ -581,7 +604,10 @@ class OllamaBackend(LLMBackend): | ||
| 581 | 604 | args = {} |
| 582 | 605 | tool_calls.append(ToolCall( |
| 583 | 606 | id=tc.get("id", f"call_{i}"), |
| 584 | - name=func.get("name", ""), | |
| 607 | + name=self._canonical_native_tool_name( | |
| 608 | + func.get("name", ""), | |
| 609 | + tools=tools, | |
| 610 | + ), | |
| 585 | 611 | arguments=args, |
| 586 | 612 | )) |
| 587 | 613 | |
src/loader/runtime/compaction.py (modified) @@ -2,12 +2,13 @@ | ||
| 2 | 2 | |
| 3 | 3 | from __future__ import annotations |
| 4 | 4 | |
| 5 | +import html | |
| 5 | 6 | import re |
| 6 | 7 | from collections import Counter |
| 7 | 8 | from dataclasses import dataclass |
| 8 | 9 | from pathlib import Path |
| 9 | 10 | |
| 10 | -from ..llm.base import Message, Role | |
| 11 | +from ..llm.base import Message, Role, ToolCall | |
| 11 | 12 | |
| 12 | 13 | DEFAULT_AUTO_COMPACTION_INPUT_TOKENS_THRESHOLD = 100_000 |
| 13 | 14 | MIN_AUTO_COMPACTION_INPUT_TOKENS_THRESHOLD = 12_000 |
@@ -302,37 +303,7 @@ def extract_key_files(messages: list[Message], *, limit: int | None = 6) -> list | ||
| 302 | 303 | def summarize_confirmed_facts(messages: list[Message], *, max_items: int = 2) -> str | None: |
| 303 | 304 | """Summarize recent confirmed discoveries from successful tool results.""" |
| 304 | 305 | |
| 305 | - facts: list[str] = [] | |
| 306 | - for message in reversed(messages): | |
| 307 | - if message.role != Role.TOOL or _is_compacted_context_message(message.content): | |
| 308 | - continue | |
| 309 | - if any(result.is_error for result in message.tool_results): | |
| 310 | - continue | |
| 311 | - | |
| 312 | - tool_name = _observed_tool_name(message.content) | |
| 313 | - payload = "\n".join( | |
| 314 | - result.content.strip() | |
| 315 | - for result in message.tool_results | |
| 316 | - if result.content.strip() | |
| 317 | - ) or message.content | |
| 318 | - | |
| 319 | - if tool_name in { | |
| 320 | - "notepad_write_working", | |
| 321 | - "notepad_append", | |
| 322 | - "notepad_write_priority", | |
| 323 | - "notepad_write_manual", | |
| 324 | - }: | |
| 325 | - mapping_fact = _summarize_html_mappings(payload) | |
| 326 | - if mapping_fact and mapping_fact not in facts: | |
| 327 | - facts.append(mapping_fact) | |
| 328 | - | |
| 329 | - if tool_name in {"glob", "bash"}: | |
| 330 | - file_fact = _summarize_html_file_discovery(payload) | |
| 331 | - if file_fact and file_fact not in facts: | |
| 332 | - facts.append(file_fact) | |
| 333 | - | |
| 334 | - if len(facts) >= max_items: | |
| 335 | - break | |
| 306 | + facts = _collect_confirmed_facts(messages) | |
| 336 | 307 | |
| 337 | 308 | if not facts: |
| 338 | 309 | return None |
@@ -350,7 +321,19 @@ def infer_preferred_next_step( | ||
| 350 | 321 | return None |
| 351 | 322 | |
| 352 | 323 | target_path = _choose_target_path(messages, current_task=current_task) |
| 324 | + has_confirmed_titles = _summarize_html_title_discovery(messages) is not None | |
| 325 | + verification_gap = _summarize_latest_html_verification_gap(messages) | |
| 353 | 326 | if target_path: |
| 327 | + if verification_gap: | |
| 328 | + return ( | |
| 329 | + f"Update `{target_path}` to fix the specific verification failures " | |
| 330 | + f"({verification_gap}) instead of restarting discovery." | |
| 331 | + ) | |
| 332 | + if has_confirmed_titles: | |
| 333 | + return ( | |
| 334 | + f"Update `{target_path}` using the confirmed chapter file/title pairs " | |
| 335 | + "instead of rereading files." | |
| 336 | + ) | |
| 354 | 337 | return ( |
| 355 | 338 | f"Update `{target_path}` using the confirmed findings instead of " |
| 356 | 339 | "restarting earlier discovery steps." |
@@ -426,6 +409,92 @@ def _observed_tool_name(content: str) -> str | None: | ||
| 426 | 409 | return None |
| 427 | 410 | |
| 428 | 411 | |
| 412 | +def _collect_confirmed_facts(messages: list[Message]) -> list[str]: | |
| 413 | + facts: list[str] = [] | |
| 414 | + tool_calls_by_id = { | |
| 415 | + tool_call.id: tool_call | |
| 416 | + for message in messages | |
| 417 | + for tool_call in message.tool_calls | |
| 418 | + } | |
| 419 | + | |
| 420 | + explicit_mapping_fact = _collect_explicit_mapping_fact( | |
| 421 | + messages, | |
| 422 | + tool_calls_by_id=tool_calls_by_id, | |
| 423 | + ) | |
| 424 | + if explicit_mapping_fact: | |
| 425 | + facts.append(explicit_mapping_fact) | |
| 426 | + | |
| 427 | + verification_gap_fact = _collect_html_verification_gap_fact( | |
| 428 | + messages, | |
| 429 | + tool_calls_by_id=tool_calls_by_id, | |
| 430 | + ) | |
| 431 | + if verification_gap_fact: | |
| 432 | + facts.append(verification_gap_fact) | |
| 433 | + | |
| 434 | + title_fact = _summarize_html_title_discovery( | |
| 435 | + messages, | |
| 436 | + tool_calls_by_id=tool_calls_by_id, | |
| 437 | + ) | |
| 438 | + if title_fact: | |
| 439 | + facts.append(title_fact) | |
| 440 | + | |
| 441 | + file_fact = _collect_html_file_discovery_fact( | |
| 442 | + messages, | |
| 443 | + tool_calls_by_id=tool_calls_by_id, | |
| 444 | + ) | |
| 445 | + if file_fact: | |
| 446 | + facts.append(file_fact) | |
| 447 | + | |
| 448 | + return facts | |
| 449 | + | |
| 450 | + | |
| 451 | +def _collect_explicit_mapping_fact( | |
| 452 | + messages: list[Message], | |
| 453 | + *, | |
| 454 | + tool_calls_by_id: dict[str, ToolCall], | |
| 455 | +) -> str | None: | |
| 456 | + mappings: list[str] = [] | |
| 457 | + for message in messages: | |
| 458 | + if message.role != Role.TOOL or _is_compacted_context_message(message.content): | |
| 459 | + continue | |
| 460 | + if any(result.is_error for result in message.tool_results): | |
| 461 | + continue | |
| 462 | + | |
| 463 | + tool_name = _resolve_tool_name( | |
| 464 | + message, | |
| 465 | + tool_calls_by_id=tool_calls_by_id, | |
| 466 | + ) | |
| 467 | + if tool_name not in { | |
| 468 | + "notepad_write_working", | |
| 469 | + "notepad_append", | |
| 470 | + "notepad_write_priority", | |
| 471 | + "notepad_write_manual", | |
| 472 | + }: | |
| 473 | + continue | |
| 474 | + | |
| 475 | + payload = "\n".join( | |
| 476 | + result.content.strip() | |
| 477 | + for result in message.tool_results | |
| 478 | + if result.content.strip() | |
| 479 | + ) or message.content | |
| 480 | + pairs = re.findall( | |
| 481 | + r"([A-Za-z0-9_.-]+\.html)\s*->\s*([A-Za-z0-9_.-]+\.html)", | |
| 482 | + payload, | |
| 483 | + ) | |
| 484 | + for left, right in pairs: | |
| 485 | + mapping = f"{left} -> {right}" | |
| 486 | + if mapping not in mappings: | |
| 487 | + mappings.append(mapping) | |
| 488 | + | |
| 489 | + if not mappings: | |
| 490 | + return None | |
| 491 | + | |
| 492 | + preview = ", ".join(mappings[:4]) | |
| 493 | + if len(mappings) > 4: | |
| 494 | + preview += ", ..." | |
| 495 | + return f"Filename mappings confirmed: {preview}" | |
| 496 | + | |
| 497 | + | |
| 429 | 498 | def _summarize_html_mappings(payload: str) -> str | None: |
| 430 | 499 | pairs = re.findall( |
| 431 | 500 | r"([A-Za-z0-9_.-]+\.html)\s*->\s*([A-Za-z0-9_.-]+\.html)", |
@@ -444,6 +513,209 @@ def _summarize_html_mappings(payload: str) -> str | None: | ||
| 444 | 513 | return f"Filename mappings confirmed: {preview}" |
| 445 | 514 | |
| 446 | 515 | |
| 516 | +def _summarize_html_title_discovery( | |
| 517 | + messages: list[Message], | |
| 518 | + *, | |
| 519 | + max_pairs: int = 4, | |
| 520 | + tool_calls_by_id: dict[str, ToolCall] | None = None, | |
| 521 | +) -> str | None: | |
| 522 | + if tool_calls_by_id is None: | |
| 523 | + tool_calls_by_id = { | |
| 524 | + tool_call.id: tool_call | |
| 525 | + for message in messages | |
| 526 | + for tool_call in message.tool_calls | |
| 527 | + } | |
| 528 | + | |
| 529 | + confirmed_pairs: list[str] = [] | |
| 530 | + for message in messages: | |
| 531 | + if message.role != Role.TOOL or _is_compacted_context_message(message.content): | |
| 532 | + continue | |
| 533 | + if any(result.is_error for result in message.tool_results): | |
| 534 | + continue | |
| 535 | + | |
| 536 | + tool_call = next( | |
| 537 | + ( | |
| 538 | + tool_calls_by_id.get(result.tool_call_id) | |
| 539 | + for result in message.tool_results | |
| 540 | + if result.tool_call_id in tool_calls_by_id | |
| 541 | + ), | |
| 542 | + None, | |
| 543 | + ) | |
| 544 | + if tool_call is None or tool_call.name != "read": | |
| 545 | + continue | |
| 546 | + | |
| 547 | + raw_path = tool_call.arguments.get("file_path") | |
| 548 | + if not isinstance(raw_path, str): | |
| 549 | + continue | |
| 550 | + normalized_path = _normalize_path_candidate(raw_path) or raw_path | |
| 551 | + if Path(normalized_path).name == "index.html" or "/chapters/" not in normalized_path: | |
| 552 | + continue | |
| 553 | + | |
| 554 | + payload = "\n".join( | |
| 555 | + result.content.strip() | |
| 556 | + for result in message.tool_results | |
| 557 | + if result.content.strip() | |
| 558 | + ) or message.content | |
| 559 | + title = _extract_html_title(payload) | |
| 560 | + if not title: | |
| 561 | + continue | |
| 562 | + | |
| 563 | + pair = f"{Path(normalized_path).name} = {title}" | |
| 564 | + if pair not in confirmed_pairs: | |
| 565 | + confirmed_pairs.append(pair) | |
| 566 | + | |
| 567 | + if not confirmed_pairs: | |
| 568 | + return None | |
| 569 | + | |
| 570 | + preview = ", ".join(confirmed_pairs[:max_pairs]) | |
| 571 | + if len(confirmed_pairs) > max_pairs: | |
| 572 | + preview += ", ..." | |
| 573 | + return f"Chapter titles confirmed: {preview}" | |
| 574 | + | |
| 575 | + | |
| 576 | +def _extract_html_title(payload: str) -> str | None: | |
| 577 | + for pattern in ( | |
| 578 | + r"<h1[^>]*>(.*?)</h1>", | |
| 579 | + r"<title[^>]*>(.*?)</title>", | |
| 580 | + ): | |
| 581 | + match = re.search(pattern, payload, re.IGNORECASE | re.DOTALL) | |
| 582 | + if not match: | |
| 583 | + continue | |
| 584 | + title = re.sub(r"<[^>]+>", " ", match.group(1)) | |
| 585 | + title = _collapse_inline_whitespace(html.unescape(title)) | |
| 586 | + if title: | |
| 587 | + return title | |
| 588 | + return None | |
| 589 | + | |
| 590 | + | |
| 591 | +def _collect_html_file_discovery_fact( | |
| 592 | + messages: list[Message], | |
| 593 | + *, | |
| 594 | + tool_calls_by_id: dict[str, ToolCall], | |
| 595 | +) -> str | None: | |
| 596 | + filenames: list[str] = [] | |
| 597 | + for message in messages: | |
| 598 | + if message.role != Role.TOOL or _is_compacted_context_message(message.content): | |
| 599 | + continue | |
| 600 | + if any(result.is_error for result in message.tool_results): | |
| 601 | + continue | |
| 602 | + | |
| 603 | + tool_name = _resolve_tool_name( | |
| 604 | + message, | |
| 605 | + tool_calls_by_id=tool_calls_by_id, | |
| 606 | + ) | |
| 607 | + if tool_name not in {"glob", "bash"}: | |
| 608 | + continue | |
| 609 | + | |
| 610 | + payload = "\n".join( | |
| 611 | + result.content.strip() | |
| 612 | + for result in message.tool_results | |
| 613 | + if result.content.strip() | |
| 614 | + ) or message.content | |
| 615 | + matches = re.findall(r"([A-Za-z0-9_.-]+\.html)", payload) | |
| 616 | + for name in matches: | |
| 617 | + if name not in filenames: | |
| 618 | + filenames.append(name) | |
| 619 | + | |
| 620 | + if len(filenames) < 3: | |
| 621 | + return None | |
| 622 | + | |
| 623 | + preview = ", ".join(filenames[:6]) | |
| 624 | + if len(filenames) > 6: | |
| 625 | + preview += ", ..." | |
| 626 | + return f"Existing files include {preview}" | |
| 627 | + | |
| 628 | + | |
| 629 | +def _collect_html_verification_gap_fact( | |
| 630 | + messages: list[Message], | |
| 631 | + *, | |
| 632 | + tool_calls_by_id: dict[str, ToolCall], | |
| 633 | +) -> str | None: | |
| 634 | + gap = _summarize_latest_html_verification_gap( | |
| 635 | + messages, | |
| 636 | + tool_calls_by_id=tool_calls_by_id, | |
| 637 | + ) | |
| 638 | + if not gap: | |
| 639 | + return None | |
| 640 | + return f"Verification gaps: {gap}" | |
| 641 | + | |
| 642 | + | |
| 643 | +def _summarize_latest_html_verification_gap( | |
| 644 | + messages: list[Message], | |
| 645 | + *, | |
| 646 | + max_items: int = 2, | |
| 647 | + tool_calls_by_id: dict[str, ToolCall] | None = None, | |
| 648 | +) -> str | None: | |
| 649 | + if tool_calls_by_id is None: | |
| 650 | + tool_calls_by_id = { | |
| 651 | + tool_call.id: tool_call | |
| 652 | + for message in messages | |
| 653 | + for tool_call in message.tool_calls | |
| 654 | + } | |
| 655 | + | |
| 656 | + for message in reversed(messages): | |
| 657 | + if message.role != Role.TOOL or _is_compacted_context_message(message.content): | |
| 658 | + continue | |
| 659 | + if not any(result.is_error for result in message.tool_results): | |
| 660 | + continue | |
| 661 | + tool_name = _resolve_tool_name( | |
| 662 | + message, | |
| 663 | + tool_calls_by_id=tool_calls_by_id, | |
| 664 | + ) | |
| 665 | + if tool_name != "bash": | |
| 666 | + continue | |
| 667 | + | |
| 668 | + payload = "\n".join( | |
| 669 | + result.content.strip() | |
| 670 | + for result in message.tool_results | |
| 671 | + if result.content.strip() | |
| 672 | + ) or message.content | |
| 673 | + gap = _extract_html_verification_gap(payload, max_items=max_items) | |
| 674 | + if gap: | |
| 675 | + return gap | |
| 676 | + | |
| 677 | + return None | |
| 678 | + | |
| 679 | + | |
| 680 | +def _extract_html_verification_gap(payload: str, *, max_items: int = 2) -> str | None: | |
| 681 | + missing: list[str] = [] | |
| 682 | + mismatches: list[str] = [] | |
| 683 | + mode: str | None = None | |
| 684 | + | |
| 685 | + for raw_line in payload.splitlines(): | |
| 686 | + line = raw_line.strip() | |
| 687 | + if not line: | |
| 688 | + continue | |
| 689 | + lowered = line.lower() | |
| 690 | + if lowered == "missing links:": | |
| 691 | + mode = "missing" | |
| 692 | + continue | |
| 693 | + if lowered == "title mismatches:": | |
| 694 | + mode = "mismatch" | |
| 695 | + continue | |
| 696 | + if mode == "missing" and "->" in line: | |
| 697 | + href = line.split("->", 1)[0].strip() | |
| 698 | + if href and href not in missing: | |
| 699 | + missing.append(href) | |
| 700 | + continue | |
| 701 | + if mode == "mismatch" and "!=" in line: | |
| 702 | + if line not in mismatches: | |
| 703 | + mismatches.append(line) | |
| 704 | + | |
| 705 | + parts: list[str] = [] | |
| 706 | + if missing: | |
| 707 | + preview = ", ".join(missing[:max_items]) | |
| 708 | + if len(missing) > max_items: | |
| 709 | + preview += ", ..." | |
| 710 | + parts.append(f"missing TOC links {preview}") | |
| 711 | + if mismatches: | |
| 712 | + preview = ", ".join(mismatches[:max_items]) | |
| 713 | + if len(mismatches) > max_items: | |
| 714 | + preview += ", ..." | |
| 715 | + parts.append(f"title mismatches {preview}") | |
| 716 | + return "; ".join(parts) if parts else None | |
| 717 | + | |
| 718 | + | |
| 447 | 719 | def _summarize_html_file_discovery(payload: str) -> str | None: |
| 448 | 720 | filenames = re.findall(r"([A-Za-z0-9_.-]+\.html)", payload) |
| 449 | 721 | unique_names: list[str] = [] |
@@ -458,6 +730,22 @@ def _summarize_html_file_discovery(payload: str) -> str | None: | ||
| 458 | 730 | return f"Existing files include {preview}" |
| 459 | 731 | |
| 460 | 732 | |
| 733 | +def _resolve_tool_name( | |
| 734 | + message: Message, | |
| 735 | + *, | |
| 736 | + tool_calls_by_id: dict[str, ToolCall], | |
| 737 | +) -> str | None: | |
| 738 | + observed = _observed_tool_name(message.content) | |
| 739 | + if observed: | |
| 740 | + return observed | |
| 741 | + | |
| 742 | + for result in message.tool_results: | |
| 743 | + tool_call = tool_calls_by_id.get(result.tool_call_id) | |
| 744 | + if tool_call is not None: | |
| 745 | + return tool_call.name | |
| 746 | + return None | |
| 747 | + | |
| 748 | + | |
| 461 | 749 | def _choose_target_path( |
| 462 | 750 | messages: list[Message], |
| 463 | 751 | *, |
src/loader/runtime/dod.py (modified) @@ -26,6 +26,7 @@ class VerificationEvidence: | ||
| 26 | 26 | |
| 27 | 27 | command: str |
| 28 | 28 | passed: bool |
| 29 | + skipped: bool = False | |
| 29 | 30 | exit_code: int | None = None |
| 30 | 31 | stdout: str = "" |
| 31 | 32 | stderr: str = "" |
@@ -54,6 +55,7 @@ class DefinitionOfDone: | ||
| 54 | 55 | line_changes: int = 0 |
| 55 | 56 | storage_path: str | None = None |
| 56 | 57 | last_verification_result: str | None = None |
| 58 | + last_verification_signature: str | None = None | |
| 57 | 59 | verification_attempt_counter: int = 0 |
| 58 | 60 | active_verification_attempt_id: str | None = None |
| 59 | 61 | active_verification_attempt_number: int | None = None |
@@ -92,6 +94,7 @@ class DefinitionOfDone: | ||
| 92 | 94 | line_changes=int(data.get("line_changes", 0)), |
| 93 | 95 | storage_path=data.get("storage_path"), |
| 94 | 96 | last_verification_result=data.get("last_verification_result"), |
| 97 | + last_verification_signature=data.get("last_verification_signature"), | |
| 95 | 98 | verification_attempt_counter=int(data.get("verification_attempt_counter", 0)), |
| 96 | 99 | active_verification_attempt_id=data.get("active_verification_attempt_id"), |
| 97 | 100 | active_verification_attempt_number=( |
@@ -265,8 +268,8 @@ def build_verification_summary(evidence: list[VerificationEvidence]) -> str: | ||
| 265 | 268 | |
| 266 | 269 | lines = ["Verification:"] |
| 267 | 270 | for item in evidence: |
| 268 | - status = "PASS" if item.passed else "FAIL" | |
| 269 | - detail = _first_non_empty_line(item.stdout) or _first_non_empty_line(item.stderr) | |
| 271 | + status = "SKIP" if item.skipped else "PASS" if item.passed else "FAIL" | |
| 272 | + detail = _summarize_verification_detail(item) | |
| 270 | 273 | if detail: |
| 271 | 274 | lines.append(f"- `{item.command}`: {status} ({detail})") |
| 272 | 275 | else: |
@@ -325,12 +328,19 @@ class DefinitionOfDoneStore: | ||
| 325 | 328 | task_statement: str, |
| 326 | 329 | *, |
| 327 | 330 | retry_budget: int = 3, |
| 331 | + resume_path: Path | str | None = None, | |
| 328 | 332 | ) -> DefinitionOfDone: |
| 329 | - """Load an unfinished DoD for the same task, or create a new one.""" | |
| 330 | - | |
| 331 | - existing = self.load_latest(task_statement) | |
| 332 | - if existing is not None and existing.status not in {"done", "failed"}: | |
| 333 | - return existing | |
| 333 | + """Resume the active DoD for this session, or create a new one.""" | |
| 334 | + | |
| 335 | + if resume_path is not None: | |
| 336 | + path = Path(resume_path) | |
| 337 | + if path.exists(): | |
| 338 | + existing = self.load(path) | |
| 339 | + if ( | |
| 340 | + existing.task_statement == task_statement | |
| 341 | + and existing.status not in {"done", "failed"} | |
| 342 | + ): | |
| 343 | + return existing | |
| 334 | 344 | |
| 335 | 345 | dod = create_definition_of_done(task_statement, retry_budget=retry_budget) |
| 336 | 346 | slug = slugify(task_statement) |
@@ -501,7 +511,7 @@ def _build_html_toc_verification_command(index_path: Path) -> str: | ||
| 501 | 511 | path_literal = repr(str(index_path)) |
| 502 | 512 | return "\n".join( |
| 503 | 513 | [ |
| 504 | - "/usr/bin/python3 - <<'PY'", | |
| 514 | + "python3 - <<'PY'", | |
| 505 | 515 | "from pathlib import Path", |
| 506 | 516 | "import re", |
| 507 | 517 | "import sys", |
@@ -554,3 +564,23 @@ def _first_non_empty_line(text: str) -> str: | ||
| 554 | 564 | if stripped: |
| 555 | 565 | return stripped[:120] |
| 556 | 566 | return "" |
| 567 | + | |
| 568 | + | |
| 569 | +def _summarize_verification_detail(item: VerificationEvidence) -> str: | |
| 570 | + for candidate in (item.stdout, item.stderr, item.output): | |
| 571 | + lines = [line.strip() for line in str(candidate).splitlines() if line.strip()] | |
| 572 | + if not lines: | |
| 573 | + continue | |
| 574 | + if len(lines) == 1: | |
| 575 | + return lines[0][:240] | |
| 576 | + | |
| 577 | + head = lines[0][:120] | |
| 578 | + tail = [line[:120] for line in lines[1:3]] | |
| 579 | + if head.endswith(":") and tail: | |
| 580 | + detail = f"{head} {'; '.join(tail)}" | |
| 581 | + else: | |
| 582 | + detail = "; ".join([head, *tail[:1]]) | |
| 583 | + if len(lines) > len(tail) + 1: | |
| 584 | + detail += "; ..." | |
| 585 | + return detail[:240] | |
| 586 | + return "" | |
src/loader/runtime/explore.py (modified) @@ -108,6 +108,7 @@ class ExploreRuntime: | ||
| 108 | 108 | validator=self.context.safeguards.validator, |
| 109 | 109 | registry=self.registry, |
| 110 | 110 | rollback_plan=None, |
| 111 | + workspace_root=self.context.project_root, | |
| 111 | 112 | ), |
| 112 | 113 | ) |
| 113 | 114 | |
src/loader/runtime/finalization.py (modified) @@ -203,6 +203,63 @@ class TurnFinalizer: | ||
| 203 | 203 | verification_observations=skip_observations, |
| 204 | 204 | ) |
| 205 | 205 | |
| 206 | + current_verification_signature = _verification_state_signature(dod) | |
| 207 | + if ( | |
| 208 | + dod.last_verification_result == "failed" | |
| 209 | + and dod.last_verification_signature | |
| 210 | + and dod.last_verification_signature == current_verification_signature | |
| 211 | + ): | |
| 212 | + summary.verification_status = "failed" | |
| 213 | + summary.definition_of_done = dod | |
| 214 | + failed_provenance = _verification_result_provenance(dod, passed=False) | |
| 215 | + if dod.retry_count >= dod.retry_budget: | |
| 216 | + dod.status = "failed" | |
| 217 | + dod.confidence = "low" | |
| 218 | + self.dod_store.save(dod) | |
| 219 | + await self.emit_dod_status(emit, dod) | |
| 220 | + exhausted_response = ( | |
| 221 | + "I couldn't verify that the task is complete within the retry budget.\n\n" | |
| 222 | + f"{build_verification_summary(dod.evidence)}" | |
| 223 | + ) | |
| 224 | + return CompletionGateResult( | |
| 225 | + should_continue=False, | |
| 226 | + reason_code="verification_retry_budget_exhausted", | |
| 227 | + reason_summary="stopped after verification retry budget was exhausted", | |
| 228 | + final_response=exhausted_response, | |
| 229 | + evidence_provenance=failed_provenance, | |
| 230 | + verification_observations=_verification_result_observations( | |
| 231 | + dod, | |
| 232 | + passed=False, | |
| 233 | + attempt_id=dod.active_verification_attempt_id, | |
| 234 | + attempt_number=dod.active_verification_attempt_number, | |
| 235 | + ), | |
| 236 | + ) | |
| 237 | + repair_prompt = ( | |
| 238 | + "[DEFINITION OF DONE CHECK STILL FAILING]\n" | |
| 239 | + f"Task: {dod.task_statement}\n" | |
| 240 | + "No new file changes were made since the last failed verification.\n\n" | |
| 241 | + f"{build_verification_summary(dod.evidence)}\n\n" | |
| 242 | + f"{_build_verification_repair_guidance(dod)}\n\n" | |
| 243 | + "Apply a concrete edit or patch before trying to finish again." | |
| 244 | + ) | |
| 245 | + self.context.session.append(Message(role=Role.USER, content=repair_prompt)) | |
| 246 | + return CompletionGateResult( | |
| 247 | + should_continue=True, | |
| 248 | + reason_code="verification_failed_no_new_changes", | |
| 249 | + reason_summary=( | |
| 250 | + "continued because verification already failed and no new " | |
| 251 | + "mutating changes were made before trying to finish again" | |
| 252 | + ), | |
| 253 | + final_response="", | |
| 254 | + evidence_provenance=failed_provenance, | |
| 255 | + verification_observations=_verification_result_observations( | |
| 256 | + dod, | |
| 257 | + passed=False, | |
| 258 | + attempt_id=dod.active_verification_attempt_id, | |
| 259 | + attempt_number=dod.active_verification_attempt_number, | |
| 260 | + ), | |
| 261 | + ) | |
| 262 | + | |
| 206 | 263 | verify_item = "Collect verification evidence" |
| 207 | 264 | if verify_item not in dod.pending_items and verify_item not in dod.completed_items: |
| 208 | 265 | dod.pending_items.append(verify_item) |
@@ -365,6 +422,7 @@ class TurnFinalizer: | ||
| 365 | 422 | f"Attempt: {dod.retry_count}/{dod.retry_budget}\n" |
| 366 | 423 | f"Pending items: {', '.join(dod.pending_items)}\n\n" |
| 367 | 424 | f"{build_verification_summary(dod.evidence)}\n\n" |
| 425 | + f"{_build_verification_repair_guidance(dod)}\n\n" | |
| 368 | 426 | "Fix the failures above, then finish the task again." |
| 369 | 427 | ) |
| 370 | 428 | self.context.session.append(Message(role=Role.USER, content=failure_prompt)) |
@@ -391,6 +449,7 @@ class TurnFinalizer: | ||
| 391 | 449 | """Collect verification evidence for one DoD.""" |
| 392 | 450 | |
| 393 | 451 | dod.status = "verifying" |
| 452 | + dod.last_verification_signature = _verification_state_signature(dod) | |
| 394 | 453 | self.dod_store.save(dod) |
| 395 | 454 | await self.emit_dod_status(emit, dod) |
| 396 | 455 | attempt = ensure_active_verification_attempt(dod) |
@@ -464,6 +523,7 @@ class TurnFinalizer: | ||
| 464 | 523 | output=outcome.result_output, |
| 465 | 524 | kind=_classify_verification_kind(command), |
| 466 | 525 | ) |
| 526 | + evidence = _maybe_mark_optional_verification_skip(evidence) | |
| 467 | 527 | dod.evidence.append(evidence) |
| 468 | 528 | observation = _verification_observation_from_evidence( |
| 469 | 529 | evidence, |
@@ -474,20 +534,12 @@ class TurnFinalizer: | ||
| 474 | 534 | append_verification_timeline_entry( |
| 475 | 535 | self.context, |
| 476 | 536 | summary, |
| 477 | - reason_code=( | |
| 478 | - "verification_command_passed" | |
| 479 | - if evidence.passed | |
| 480 | - else "verification_command_failed" | |
| 481 | - ), | |
| 482 | - reason_summary=( | |
| 483 | - f"verification passed for `{command}`" | |
| 484 | - if evidence.passed | |
| 485 | - else f"verification failed for `{command}`" | |
| 486 | - ), | |
| 537 | + reason_code=_verification_timeline_reason_code(evidence), | |
| 538 | + reason_summary=_verification_timeline_reason_summary(evidence), | |
| 487 | 539 | evidence_provenance=provenance, |
| 488 | 540 | verification_observations=[observation], |
| 489 | 541 | ) |
| 490 | - all_passed = all_passed and evidence.passed | |
| 542 | + all_passed = all_passed and (evidence.passed or evidence.skipped) | |
| 491 | 543 | summary.tool_result_messages.append(outcome.message) |
| 492 | 544 | self.context.session.append(outcome.message) |
| 493 | 545 | |
@@ -731,15 +783,13 @@ def _verification_observation_from_evidence( | ||
| 731 | 783 | command = evidence.command or "verification" |
| 732 | 784 | return VerificationObservation( |
| 733 | 785 | status=( |
| 734 | - VerificationObservationStatus.PASSED.value | |
| 786 | + VerificationObservationStatus.SKIPPED.value | |
| 787 | + if evidence.skipped | |
| 788 | + else VerificationObservationStatus.PASSED.value | |
| 735 | 789 | if evidence.passed |
| 736 | 790 | else VerificationObservationStatus.FAILED.value |
| 737 | 791 | ), |
| 738 | - summary=( | |
| 739 | - f"verification passed for `{command}`" | |
| 740 | - if evidence.passed | |
| 741 | - else f"verification failed for `{command}`" | |
| 742 | - ), | |
| 792 | + summary=_verification_timeline_reason_summary(evidence), | |
| 743 | 793 | command=evidence.command or None, |
| 744 | 794 | kind=evidence.kind, |
| 745 | 795 | exit_code=evidence.exit_code, |
@@ -757,13 +807,11 @@ def _verification_provenance_from_evidence( | ||
| 757 | 807 | EvidenceProvenance( |
| 758 | 808 | category="verification", |
| 759 | 809 | source="dod.evidence", |
| 760 | - summary=( | |
| 761 | - f"verification passed for `{command}`" | |
| 762 | - if evidence.passed | |
| 763 | - else f"verification failed for `{command}`" | |
| 764 | - ), | |
| 810 | + summary=_verification_timeline_reason_summary(evidence), | |
| 765 | 811 | status=( |
| 766 | - EvidenceProvenanceStatus.SUPPORTS.value | |
| 812 | + EvidenceProvenanceStatus.CONTEXT.value | |
| 813 | + if evidence.skipped | |
| 814 | + else EvidenceProvenanceStatus.SUPPORTS.value | |
| 767 | 815 | if evidence.passed |
| 768 | 816 | else EvidenceProvenanceStatus.CONTRADICTS.value |
| 769 | 817 | ), |
@@ -844,6 +892,113 @@ def _verification_detail(evidence: VerificationEvidence) -> str | None: | ||
| 844 | 892 | return None |
| 845 | 893 | |
| 846 | 894 | |
| 895 | +def _verification_timeline_reason_code(evidence: VerificationEvidence) -> str: | |
| 896 | + if evidence.skipped: | |
| 897 | + return "verification_command_skipped" | |
| 898 | + if evidence.passed: | |
| 899 | + return "verification_command_passed" | |
| 900 | + return "verification_command_failed" | |
| 901 | + | |
| 902 | + | |
| 903 | +def _verification_timeline_reason_summary(evidence: VerificationEvidence) -> str: | |
| 904 | + command = evidence.command or "verification" | |
| 905 | + if evidence.skipped: | |
| 906 | + return f"verification skipped for `{command}`" | |
| 907 | + if evidence.passed: | |
| 908 | + return f"verification passed for `{command}`" | |
| 909 | + return f"verification failed for `{command}`" | |
| 910 | + | |
| 911 | + | |
| 912 | +def _maybe_mark_optional_verification_skip( | |
| 913 | + evidence: VerificationEvidence, | |
| 914 | +) -> VerificationEvidence: | |
| 915 | + detail = "\n".join( | |
| 916 | + part for part in (evidence.stderr, evidence.output) if str(part).strip() | |
| 917 | + ).lower() | |
| 918 | + command = (evidence.command or "").lower() | |
| 919 | + if ( | |
| 920 | + not evidence.passed | |
| 921 | + and evidence.exit_code == 127 | |
| 922 | + and "command not found" in detail | |
| 923 | + and "html5validator" in command | |
| 924 | + ): | |
| 925 | + evidence.skipped = True | |
| 926 | + return evidence | |
| 927 | + | |
| 928 | + | |
| 929 | +def _verification_state_signature(dod: DefinitionOfDone) -> str: | |
| 930 | + touched = "|".join(sorted(set(dod.touched_files))) | |
| 931 | + commands = "|".join(sorted(set(dod.successful_commands))) | |
| 932 | + return ( | |
| 933 | + f"lines={dod.line_changes}" | |
| 934 | + f";touched={touched}" | |
| 935 | + f";actions={len(dod.mutating_actions)}" | |
| 936 | + f";commands={commands}" | |
| 937 | + ) | |
| 938 | + | |
| 939 | + | |
| 940 | +def _build_verification_repair_guidance(dod: DefinitionOfDone) -> str: | |
| 941 | + fixes = _extract_verification_repairs(dod.evidence) | |
| 942 | + if not fixes: | |
| 943 | + return ( | |
| 944 | + "Use the failed verification evidence directly, avoid rereading unrelated " | |
| 945 | + "files, and fix the target file before retrying." | |
| 946 | + ) | |
| 947 | + | |
| 948 | + return "\n".join( | |
| 949 | + [ | |
| 950 | + "Repair focus:", | |
| 951 | + *[f"- {item}" for item in fixes], | |
| 952 | + "- Reuse these exact failures instead of restarting discovery from earlier chapters.", | |
| 953 | + ] | |
| 954 | + ) | |
| 955 | + | |
| 956 | + | |
| 957 | +def _extract_verification_repairs( | |
| 958 | + evidence_items: list[VerificationEvidence], | |
| 959 | +) -> list[str]: | |
| 960 | + fixes: list[str] = [] | |
| 961 | + for evidence in evidence_items: | |
| 962 | + for candidate in (evidence.stderr, evidence.output, evidence.stdout): | |
| 963 | + missing, mismatches = _parse_verification_failures(str(candidate)) | |
| 964 | + for href in missing: | |
| 965 | + item = f"Fix the missing TOC href `{href}` in `index.html`." | |
| 966 | + if item not in fixes: | |
| 967 | + fixes.append(item) | |
| 968 | + for mismatch in mismatches: | |
| 969 | + item = f"Fix the TOC label mismatch `{mismatch}`." | |
| 970 | + if item not in fixes: | |
| 971 | + fixes.append(item) | |
| 972 | + return fixes | |
| 973 | + | |
| 974 | + | |
| 975 | +def _parse_verification_failures(text: str) -> tuple[list[str], list[str]]: | |
| 976 | + missing: list[str] = [] | |
| 977 | + mismatches: list[str] = [] | |
| 978 | + mode: str | None = None | |
| 979 | + | |
| 980 | + for raw_line in text.splitlines(): | |
| 981 | + line = raw_line.strip() | |
| 982 | + if not line: | |
| 983 | + continue | |
| 984 | + lowered = line.lower() | |
| 985 | + if lowered == "missing links:": | |
| 986 | + mode = "missing" | |
| 987 | + continue | |
| 988 | + if lowered == "title mismatches:": | |
| 989 | + mode = "mismatch" | |
| 990 | + continue | |
| 991 | + if mode == "missing" and "->" in line: | |
| 992 | + href = line.split("->", 1)[0].strip() | |
| 993 | + if href and href not in missing: | |
| 994 | + missing.append(href) | |
| 995 | + continue | |
| 996 | + if mode == "mismatch" and "!=" in line and line not in mismatches: | |
| 997 | + mismatches.append(line) | |
| 998 | + | |
| 999 | + return missing, mismatches | |
| 1000 | + | |
| 1001 | + | |
| 847 | 1002 | def _classify_verification_kind(command: str) -> str: |
| 848 | 1003 | """Classify the verification command into a summary kind.""" |
| 849 | 1004 | |
src/loader/runtime/hooks.pymodified@@ -5,6 +5,7 @@ from __future__ import annotations | ||
| 5 | 5 | from collections.abc import Iterable |
| 6 | 6 | from dataclasses import dataclass, field |
| 7 | 7 | from enum import StrEnum |
| 8 | +from pathlib import Path | |
| 8 | 9 | from typing import Any, Protocol |
| 9 | 10 | |
| 10 | 11 | from ..llm.base import ToolCall |
@@ -157,8 +158,93 @@ class SearchPathAliasHook(BaseToolHook): | ||
| 157 | 158 | updated_arguments.pop(cleanup_key, None) |
| 158 | 159 | return HookResult(updated_arguments=updated_arguments) |
| 159 | 160 | |
| 161 | + if context.tool_call.name == "glob": | |
| 162 | + normalized_arguments = self._normalize_glob_pattern_path(arguments) | |
| 163 | + if normalized_arguments is not None: | |
| 164 | + return HookResult(updated_arguments=normalized_arguments) | |
| 165 | + | |
| 160 | 166 | return HookResult() |
| 161 | 167 | |
| 168 | + def _normalize_glob_pattern_path( | |
| 169 | + self, | |
| 170 | + arguments: dict[str, Any], | |
| 171 | + ) -> dict[str, Any] | None: | |
| 172 | + pattern = str(arguments.get("pattern", "")).strip() | |
| 173 | + if not pattern or not pattern.startswith(("/", "~", "./", "../")): | |
| 174 | + return None | |
| 175 | + | |
| 176 | + pattern_path = Path(pattern) | |
| 177 | + parent = str(pattern_path.parent).strip() | |
| 178 | + basename = pattern_path.name.strip() | |
| 179 | + if not parent or not basename: | |
| 180 | + return None | |
| 181 | + if any(token in parent for token in ("*", "?", "[")): | |
| 182 | + return None | |
| 183 | + | |
| 184 | + updated_arguments = dict(arguments) | |
| 185 | + updated_arguments["path"] = parent | |
| 186 | + updated_arguments["pattern"] = basename | |
| 187 | + return updated_arguments | |
| 188 | + | |
| 189 | + | |
class RelativePathContextHook(BaseToolHook):
    """Recover relative file/search paths against recently-used external directories."""

    _FILE_TOOLS = frozenset({"read", "write", "edit", "patch"})
    _SEARCH_TOOLS = frozenset({"glob", "grep"})

    def __init__(self, action_tracker: ActionTracker, workspace_root: Path) -> None:
        self.action_tracker = action_tracker
        self.workspace_root = workspace_root.expanduser().resolve()

    async def pre_tool_use(self, context: HookContext) -> HookResult:
        """Rewrite a relative path argument that resolves outside the workspace."""

        key = self._argument_key(context.tool_call.name)
        if key is None:
            return HookResult()

        arguments = context.tool_call.arguments
        candidate_path = str(arguments.get(key, "")).strip()
        # Absolute and home-anchored paths need no context recovery.
        if not candidate_path or candidate_path.startswith(("/", "~")):
            return HookResult()

        recovered = self._resolve_recent_context_path(
            candidate_path,
            require_existing=True,
        )
        if recovered is None:
            return HookResult()

        return HookResult(updated_arguments={**arguments, key: recovered})

    def _argument_key(self, tool_name: str) -> str | None:
        """Map a tool name to the argument that carries its path, if any."""

        if tool_name in self._FILE_TOOLS:
            return "file_path"
        if tool_name in self._SEARCH_TOOLS:
            return "path"
        return None

    def _resolve_recent_context_path(
        self,
        raw_path: str,
        *,
        require_existing: bool,
    ) -> str | None:
        """Try *raw_path* against recent external directories when the workspace misses."""

        # Paths that already resolve inside the workspace are left untouched.
        if (self.workspace_root / raw_path).expanduser().exists():
            return None

        for base_dir in self.action_tracker.recent_path_contexts():
            candidate = (Path(base_dir) / raw_path).expanduser()
            if require_existing:
                if candidate.exists():
                    return str(candidate)
            elif candidate.exists() or candidate.parent.exists():
                return str(candidate)
        return None
| 247 | + | |
| 162 | 248 | |
| 163 | 249 | class HookManager: |
| 164 | 250 | """Runs tool hooks across Loader's three lifecycle events.""" |
@@ -350,6 +436,7 @@ def build_default_tool_hooks( | ||
| 350 | 436 | validator: PreActionValidator, |
| 351 | 437 | registry: ToolRegistry, |
| 352 | 438 | rollback_plan: RollbackPlan | None, |
| 439 | + workspace_root: Path, | |
| 353 | 440 | ) -> HookManager: |
| 354 | 441 | """Build Loader's default tool hook stack for one runtime turn.""" |
| 355 | 442 | |
@@ -357,6 +444,7 @@ def build_default_tool_hooks( | ||
| 357 | 444 | [ |
| 358 | 445 | FilePathAliasHook(), |
| 359 | 446 | SearchPathAliasHook(), |
| 447 | + RelativePathContextHook(action_tracker, workspace_root), | |
| 360 | 448 | DuplicateActionHook(action_tracker), |
| 361 | 449 | ActionValidationHook(validator), |
| 362 | 450 | RollbackTrackingHook(registry, rollback_plan), |
src/loader/runtime/parsing.pymodified@@ -4,6 +4,7 @@ from __future__ import annotations | ||
| 4 | 4 | |
| 5 | 5 | import json |
| 6 | 6 | import re |
| 7 | +import shlex | |
| 7 | 8 | from collections.abc import Iterable |
| 8 | 9 | from dataclasses import dataclass |
| 9 | 10 | |
@@ -19,6 +20,17 @@ class ParsedResponse: | ||
| 19 | 20 | is_final_answer: bool = False |
| 20 | 21 | |
| 21 | 22 | |
# Separator-normalized aliases that local models commonly emit for Loader
# tools, mapped to the canonical tool name. Keys are looked up with the
# output of _normalized_tool_key (casefolded, non-alphanumerics stripped).
_TOOL_NAME_ALIASES = {
    "bashcommand": "bash",
    "editfile": "edit",
    "globfile": "glob",
    "globfiles": "glob",
    "patchfile": "patch",
    "readfile": "read",
    "writefile": "write",
}
| 33 | + | |
| 22 | 34 | def _extract_arguments(data: dict) -> dict: |
| 23 | 35 | """Extract arguments from tool call data, handling various key names.""" |
| 24 | 36 | |
@@ -52,6 +64,23 @@ def _tool_name_map(allowed_tool_names: Iterable[str] | None) -> dict[str, str] | | ||
| 52 | 64 | return {name.casefold(): name for name in allowed_tool_names} |
| 53 | 65 | |
| 54 | 66 | |
| 67 | +def _normalized_tool_key(name: str) -> str: | |
| 68 | + """Collapse separators and case so near-miss tool names can still match.""" | |
| 69 | + | |
| 70 | + return re.sub(r"[^a-z0-9]+", "", name.casefold()) | |
| 71 | + | |
| 72 | + | |
| 73 | +def _normalized_allowed_tool_map(tool_names: dict[str, str] | None) -> dict[str, str] | None: | |
| 74 | + """Build a separator-insensitive tool-name map.""" | |
| 75 | + | |
| 76 | + if tool_names is None: | |
| 77 | + return None | |
| 78 | + return { | |
| 79 | + _normalized_tool_key(canonical_name): canonical_name | |
| 80 | + for canonical_name in tool_names.values() | |
| 81 | + } | |
| 82 | + | |
| 83 | + | |
| 55 | 84 | def _canonicalize_tool_name( |
| 56 | 85 | name: str, |
| 57 | 86 | tool_names: dict[str, str] | None, |
@@ -62,7 +91,43 @@ def _canonicalize_tool_name( | ||
| 62 | 91 | |
| 63 | 92 | if tool_names is None: |
| 64 | 93 | return name.lower() if lowercase_default else name |
| 65 | - return tool_names.get(name.casefold()) | |
| 94 | + | |
| 95 | + direct_match = tool_names.get(name.casefold()) | |
| 96 | + if direct_match is not None: | |
| 97 | + return direct_match | |
| 98 | + | |
| 99 | + normalized_allowed = _normalized_allowed_tool_map(tool_names) | |
| 100 | + if normalized_allowed is None: | |
| 101 | + return None | |
| 102 | + | |
| 103 | + normalized_name = _normalized_tool_key(name) | |
| 104 | + normalized_match = normalized_allowed.get(normalized_name) | |
| 105 | + if normalized_match is not None: | |
| 106 | + return normalized_match | |
| 107 | + | |
| 108 | + alias_target = _TOOL_NAME_ALIASES.get(normalized_name) | |
| 109 | + if alias_target is None: | |
| 110 | + return None | |
| 111 | + | |
| 112 | + direct_alias_match = tool_names.get(alias_target.casefold()) | |
| 113 | + if direct_alias_match is not None: | |
| 114 | + return direct_alias_match | |
| 115 | + return normalized_allowed.get(_normalized_tool_key(alias_target)) | |
| 116 | + | |
| 117 | + | |
def canonicalize_tool_name(
    name: str,
    *,
    allowed_tool_names: Iterable[str] | None = None,
    lowercase_default: bool = False,
) -> str | None:
    """Public helper for backend/native tool-call normalization.

    Thin wrapper over the private canonicalizer: builds the casefolded
    allow-list map, then delegates the actual matching to it.
    """

    tool_names = _tool_name_map(allowed_tool_names)
    return _canonicalize_tool_name(
        name,
        tool_names,
        lowercase_default=lowercase_default,
    )
| 66 | 131 | |
| 67 | 132 | |
| 68 | 133 | def _extract_json_tool_calls( |
@@ -168,6 +233,66 @@ def _extract_function_tag_tool_calls( | ||
| 168 | 233 | return tool_calls, spans |
| 169 | 234 | |
| 170 | 235 | |
| 236 | +def _parse_fenced_tool_arguments( | |
| 237 | + tool_name: str, | |
| 238 | + command_line: str, | |
| 239 | +) -> dict[str, str] | None: | |
| 240 | + """Convert one simple fenced command line into Loader tool arguments.""" | |
| 241 | + | |
| 242 | + try: | |
| 243 | + argv = shlex.split(command_line) | |
| 244 | + except ValueError: | |
| 245 | + return None | |
| 246 | + if len(argv) < 2: | |
| 247 | + return None | |
| 248 | + | |
| 249 | + payload = command_line[len(argv[0]) :].strip() | |
| 250 | + if tool_name == "read" and len(argv) == 2: | |
| 251 | + return {"file_path": argv[1]} | |
| 252 | + if tool_name == "glob" and len(argv) == 2: | |
| 253 | + return {"pattern": argv[1]} | |
| 254 | + if tool_name == "bash" and payload: | |
| 255 | + return {"command": payload} | |
| 256 | + return None | |
| 257 | + | |
| 258 | + | |
def _extract_fenced_command_tool_calls(
    text: str,
    tool_names: dict[str, str] | None = None,
) -> tuple[list[ToolCall], list[tuple[int, int]]]:
    """Recover simple one-line fenced tool commands from local-model prose.

    Returns the recovered calls plus the (start, end) spans of the fences so
    the caller can strip them from the visible content.
    """

    recovered: list[ToolCall] = []
    spans: list[tuple[int, int]] = []

    # A fence is ``` plus an optional info string, a body, then a closing ```.
    for match in re.finditer(r"```(?:[^\n`]*)\n(.*?)```", text, re.DOTALL):
        body = match.group(1).strip()
        # Only single-line bodies are treated as commands.
        if not body or "\n" in body:
            continue
        leading_token = body.split(None, 1)[0]
        canonical_name = _canonicalize_tool_name(
            leading_token,
            tool_names,
            lowercase_default=True,
        )
        if canonical_name is None:
            continue
        arguments = _parse_fenced_tool_arguments(canonical_name, body)
        if not arguments:
            continue
        recovered.append(
            ToolCall(
                id=f"call_{len(recovered)}",
                name=canonical_name,
                arguments=arguments,
            )
        )
        spans.append(match.span())

    return recovered, spans
| 294 | + | |
| 295 | + | |
| 171 | 296 | def parse_tool_calls( |
| 172 | 297 | text: str, |
| 173 | 298 | *, |
@@ -253,6 +378,15 @@ def parse_tool_calls( | ||
| 253 | 378 | if tool_calls: |
| 254 | 379 | content = re.sub(bracket_pattern, "", content, flags=re.IGNORECASE) |
| 255 | 380 | |
| 381 | + if not tool_calls: | |
| 382 | + fenced_calls, fenced_spans = _extract_fenced_command_tool_calls( | |
| 383 | + text, | |
| 384 | + tool_names, | |
| 385 | + ) | |
| 386 | + if fenced_calls: | |
| 387 | + tool_calls = fenced_calls | |
| 388 | + content = _remove_spans(content, fenced_spans) | |
| 389 | + | |
| 256 | 390 | if is_final: |
| 257 | 391 | content = final_content |
| 258 | 392 | |
src/loader/runtime/prompting.pymodified@@ -119,6 +119,8 @@ MODE_GUIDANCE = { | ||
| 119 | 119 | - For servers, watchers, preview commands, or anything else that keeps running, |
| 120 | 120 | call `bash` with `background=true`, then inspect it with `bash_wait` or |
| 121 | 121 | `bash_jobs` instead of blocking the turn in the foreground |
| 122 | +- Prefer `edit`/`patch`/`write` over shell one-liners like `sed -i`, `perl -pi`, | |
| 123 | + or heredoc rewrites when modifying text files | |
| 122 | 124 | - If the task names an external directory like `~/Loader/...`, keep operating on |
| 123 | 125 | that exact path instead of falling back to the repo cwd; file tools accept |
| 124 | 126 | absolute and `~` paths, and `glob` works best with `path="~/Loader/..."` |
src/loader/runtime/recovery.pymodified@@ -7,6 +7,8 @@ from enum import Enum, auto | ||
| 7 | 7 | from pathlib import Path |
| 8 | 8 | from typing import Any |
| 9 | 9 | |
| 10 | +from .safeguard_services import extract_shell_text_rewrite_target | |
| 11 | + | |
| 10 | 12 | |
| 11 | 13 | class ErrorCategory(Enum): |
| 12 | 14 | """Categories of errors for recovery strategies.""" |
@@ -174,6 +176,8 @@ class RecoveryContext: | ||
| 174 | 176 | |
| 175 | 177 | if tool_name == "bash": |
| 176 | 178 | command = str(args.get("command", "")) |
| 179 | + if extract_shell_text_rewrite_target(command) is not None: | |
| 180 | + return True | |
| 177 | 181 | mutating_tokens = ( |
| 178 | 182 | "git commit", |
| 179 | 183 | "git add", |
@@ -525,7 +529,11 @@ def categorize_error(error_message: str) -> ErrorCategory: | ||
| 525 | 529 | return ErrorCategory.UNKNOWN |
| 526 | 530 | |
| 527 | 531 | |
| 528 | -def get_recovery_hints(category: ErrorCategory, tool_name: str) -> str: | |
| 532 | +def get_recovery_hints( | |
| 533 | + category: ErrorCategory, | |
| 534 | + tool_name: str, | |
| 535 | + args: dict[str, Any] | None = None, | |
| 536 | +) -> str: | |
| 529 | 537 | """Get hints for recovering from a specific error category.""" |
| 530 | 538 | |
| 531 | 539 | hints = { |
@@ -672,6 +680,14 @@ def get_recovery_hints(category: ErrorCategory, tool_name: str) -> str: | ||
| 672 | 680 | if tool_name == "bash" and category == ErrorCategory.COMMAND_NOT_FOUND: |
| 673 | 681 | category_hints = ["Check if installed: bash(which <command>)"] + category_hints |
| 674 | 682 | |
| 683 | + rewrite_target = extract_shell_text_rewrite_target(str((args or {}).get("command", ""))) | |
| 684 | + if tool_name == "bash" and rewrite_target is not None: | |
| 685 | + category_hints = [ | |
| 686 | + f"Switch to edit/patch/write for `{rewrite_target}` instead of shell rewriting it", | |
| 687 | + "Reuse the evidence you already gathered and apply the file change directly", | |
| 688 | + "If the exact replacement span is unclear, read just the target file and then edit it", | |
| 689 | + ] + category_hints | |
| 690 | + | |
| 675 | 691 | return "\n".join(f"- {hint}" for hint in category_hints) |
| 676 | 692 | |
| 677 | 693 | |
@@ -713,7 +729,7 @@ def format_recovery_prompt( | ||
| 713 | 729 | """Format a prompt asking the LLM to recover from an error.""" |
| 714 | 730 | |
| 715 | 731 | category = categorize_error(error) |
| 716 | - hints = get_recovery_hints(category, tool_name) | |
| 732 | + hints = get_recovery_hints(category, tool_name, args) | |
| 717 | 733 | args_str = ", ".join(f"{key}={value!r}" for key, value in args.items()) |
| 718 | 734 | |
| 719 | 735 | return RECOVERY_PROMPT.format( |
src/loader/runtime/safeguard_services.pymodified@@ -4,10 +4,380 @@ from __future__ import annotations | ||
| 4 | 4 | |
| 5 | 5 | import re |
| 6 | 6 | import shlex |
| 7 | +from difflib import get_close_matches | |
| 7 | 8 | from dataclasses import dataclass |
| 8 | 9 | from pathlib import Path |
| 9 | 10 | |
| 10 | 11 | |
| 12 | +TEXT_REWRITE_SUFFIXES = frozenset( | |
| 13 | + { | |
| 14 | + ".c", | |
| 15 | + ".cc", | |
| 16 | + ".cpp", | |
| 17 | + ".css", | |
| 18 | + ".csv", | |
| 19 | + ".go", | |
| 20 | + ".h", | |
| 21 | + ".hpp", | |
| 22 | + ".html", | |
| 23 | + ".htm", | |
| 24 | + ".java", | |
| 25 | + ".js", | |
| 26 | + ".json", | |
| 27 | + ".jsx", | |
| 28 | + ".md", | |
| 29 | + ".py", | |
| 30 | + ".rb", | |
| 31 | + ".rs", | |
| 32 | + ".sh", | |
| 33 | + ".sql", | |
| 34 | + ".svg", | |
| 35 | + ".toml", | |
| 36 | + ".ts", | |
| 37 | + ".tsx", | |
| 38 | + ".txt", | |
| 39 | + ".xml", | |
| 40 | + ".yaml", | |
| 41 | + ".yml", | |
| 42 | + } | |
| 43 | +) | |
| 44 | +TEXT_REWRITE_FILENAMES = frozenset( | |
| 45 | + { | |
| 46 | + "dockerfile", | |
| 47 | + "index.html", | |
| 48 | + "makefile", | |
| 49 | + "package.json", | |
| 50 | + "pyproject.toml", | |
| 51 | + "readme", | |
| 52 | + "readme.md", | |
| 53 | + } | |
| 54 | +) | |
| 55 | + | |
| 56 | + | |
| 57 | +def _strip_shell_token(token: str) -> str: | |
| 58 | + return token.strip().strip("\"'").rstrip(";|&") | |
| 59 | + | |
| 60 | + | |
| 61 | +def _looks_like_text_rewrite_target(token: str) -> bool: | |
| 62 | + candidate = _strip_shell_token(token) | |
| 63 | + if not candidate or candidate in {"-", "/dev/null"}: | |
| 64 | + return False | |
| 65 | + if candidate.startswith("-"): | |
| 66 | + return False | |
| 67 | + lowered = Path(candidate).name.lower() | |
| 68 | + if lowered in TEXT_REWRITE_FILENAMES: | |
| 69 | + return True | |
| 70 | + return Path(candidate).suffix.lower() in TEXT_REWRITE_SUFFIXES | |
| 71 | + | |
| 72 | + | |
| 73 | +def _extract_redirect_target(argv: list[str]) -> str | None: | |
| 74 | + for index, token in enumerate(argv): | |
| 75 | + if token in {">", ">>"} and index + 1 < len(argv): | |
| 76 | + candidate = argv[index + 1] | |
| 77 | + if _looks_like_text_rewrite_target(candidate): | |
| 78 | + return _strip_shell_token(candidate) | |
| 79 | + if token == "tee": | |
| 80 | + for candidate in argv[index + 1 :]: | |
| 81 | + if candidate.startswith("-"): | |
| 82 | + continue | |
| 83 | + if _looks_like_text_rewrite_target(candidate): | |
| 84 | + return _strip_shell_token(candidate) | |
| 85 | + break | |
| 86 | + return None | |
| 87 | + | |
| 88 | + | |
| 89 | +def extract_shell_text_rewrite_target(command: str) -> str | None: | |
| 90 | + """Return the target file when bash is used as a brittle text editor.""" | |
| 91 | + | |
| 92 | + normalized = " ".join(str(command or "").split()) | |
| 93 | + if not normalized: | |
| 94 | + return None | |
| 95 | + | |
| 96 | + try: | |
| 97 | + argv = shlex.split(normalized) | |
| 98 | + except ValueError: | |
| 99 | + argv = [] | |
| 100 | + | |
| 101 | + if argv: | |
| 102 | + for index, token in enumerate(argv): | |
| 103 | + if token == "sed" and any(part.startswith("-i") for part in argv[index + 1 :]): | |
| 104 | + for candidate in reversed(argv[index + 1 :]): | |
| 105 | + if _looks_like_text_rewrite_target(candidate): | |
| 106 | + return _strip_shell_token(candidate) | |
| 107 | + if token == "perl" and any( | |
| 108 | + part.startswith("-p") or part.startswith("-0p") for part in argv[index + 1 :] | |
| 109 | + ): | |
| 110 | + for candidate in reversed(argv[index + 1 :]): | |
| 111 | + if _looks_like_text_rewrite_target(candidate): | |
| 112 | + return _strip_shell_token(candidate) | |
| 113 | + | |
| 114 | + redirect_target = _extract_redirect_target(argv) | |
| 115 | + if redirect_target is not None: | |
| 116 | + return redirect_target | |
| 117 | + | |
| 118 | + regex_match = re.search( | |
| 119 | + r"(?:sed\s+-i(?:\s+''|\s+\"\"|\s+'[^']*'|\s+\"[^\"]*\")?.*?|perl\s+-[0-9]*p[i0-9-]*.*?)\s+([^\s\"';|&]+(?:\.[A-Za-z0-9]+)?)", | |
| 120 | + normalized, | |
| 121 | + ) | |
| 122 | + if regex_match: | |
| 123 | + candidate = _strip_shell_token(regex_match.group(1)) | |
| 124 | + if _looks_like_text_rewrite_target(candidate): | |
| 125 | + return candidate | |
| 126 | + | |
| 127 | + redirect_match = re.search(r"(?:>>?|tee(?:\s+-a)?)\s+([^\s\"';|&]+)", normalized) | |
| 128 | + if redirect_match: | |
| 129 | + candidate = _strip_shell_token(redirect_match.group(1)) | |
| 130 | + if _looks_like_text_rewrite_target(candidate): | |
| 131 | + return candidate | |
| 132 | + | |
| 133 | + return None | |
| 134 | + | |
| 135 | + | |
def extract_html_title_from_text(payload: str) -> str | None:
    """Extract one human-readable HTML title from raw file contents.

    Prefers the first <h1>, falling back to <title>; inner tags are stripped
    and whitespace collapsed. Returns None when neither yields text.
    """

    for pattern in (r"<h1[^>]*>(.*?)</h1>", r"<title[^>]*>(.*?)</title>"):
        match = re.search(pattern, payload, re.IGNORECASE | re.DOTALL)
        if match is None:
            continue
        stripped = re.sub(r"<[^>]+>", " ", match.group(1))
        collapsed = " ".join(stripped.split()).strip()
        if collapsed:
            return collapsed
    return None


def read_html_title(path: Path) -> str:
    """Read one HTML file's title, returning "" on any read failure."""

    try:
        contents = path.read_text()
    except OSError:
        return ""
    return extract_html_title_from_text(contents) or ""


def format_html_inventory_entry(root: Path, candidate: Path) -> str:
    """Format one exact href/title pair for model-facing guidance."""

    resolved_root = root.expanduser().resolve(strict=False)
    resolved_candidate = candidate.expanduser().resolve(strict=False)
    try:
        href = str(resolved_candidate.relative_to(resolved_root))
    except ValueError:
        # Candidate lives outside root — fall back to the bare file name.
        href = resolved_candidate.name
    title = read_html_title(candidate)
    return f"{href} = {title}" if title else href
| 172 | + | |
| 173 | + | |
| 174 | +def _collect_html_inventory_entries(index_path: str | Path) -> list[tuple[str, str]]: | |
| 175 | + """Return exact href/title pairs for sibling HTML chapters.""" | |
| 176 | + | |
| 177 | + index = Path(index_path).expanduser() | |
| 178 | + if index.name != "index.html": | |
| 179 | + return [] | |
| 180 | + | |
| 181 | + chapters_dir = index.parent / "chapters" | |
| 182 | + if not chapters_dir.is_dir(): | |
| 183 | + return [] | |
| 184 | + | |
| 185 | + entries: list[tuple[str, str]] = [] | |
| 186 | + for candidate in sorted(chapters_dir.glob("*.html")): | |
| 187 | + if not candidate.is_file(): | |
| 188 | + continue | |
| 189 | + title = read_html_title(candidate) | |
| 190 | + if not title: | |
| 191 | + continue | |
| 192 | + href = format_html_inventory_entry(index.parent, candidate).split(" = ", 1)[0] | |
| 193 | + entries.append((href, title)) | |
| 194 | + return entries | |
| 195 | + | |
| 196 | + | |
def summarize_html_inventory(
    index_path: str | Path,
    *,
    limit: int | None = 12,
) -> str | None:
    """Summarize the existing sibling HTML inventory for one index page.

    Returns a "; "-joined ``href = title`` listing, truncated with "; ..."
    past *limit* entries, or None when there is nothing to report.
    """

    index = Path(index_path).expanduser()
    if index.name != "index.html":
        return None

    entries = [
        f"{href} = {title}"
        for href, title in _collect_html_inventory_entries(index)
    ]
    if not entries:
        return None

    if limit is not None and len(entries) > limit:
        return "; ".join(entries[:limit]) + "; ..."
    return "; ".join(entries)
| 215 | + | |
| 216 | + | |
def extract_html_toc_excerpt(
    index_path: str | Path,
    *,
    max_lines: int = 16,
) -> str | None:
    """Extract the current HTML table-of-contents block for recovery guidance.

    Looks for a "Table of Contents" heading followed by its list, then falls
    back to a `chapter-list` <ul>; long excerpts are truncated with "...".
    """

    index = Path(index_path).expanduser()
    if index.name != "index.html":
        return None

    try:
        text = index.read_text()
    except OSError:
        return None

    patterns = (
        r"(<h2[^>]*>\s*Table of Contents\s*</h2>.*?</ul>)",
        r"(<ul[^>]*class=\"[^\"]*chapter-list[^\"]*\"[^>]*>.*?</ul>)",
    )
    match = None
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
        if match:
            break
    if not match:
        return None

    lines = [line.rstrip() for line in match.group(1).splitlines() if line.strip()]
    if not lines:
        return None
    if len(lines) > max_lines:
        lines = lines[:max_lines] + ["..."]
    return "\n".join(lines)
| 253 | + | |
| 254 | + | |
def build_html_toc_replacement_block(index_path: str | Path) -> str | None:
    """Build one exact replacement TOC block from the verified sibling inventory.

    Reuses the heading, <ul>, indentation, and closing tag from the current
    excerpt when available, and regenerates one <li> per verified chapter.
    Returns None when no chapter inventory exists.
    """

    entries = _collect_html_inventory_entries(index_path)
    if not entries:
        return None

    excerpt = extract_html_toc_excerpt(index_path, max_lines=64)
    excerpt_lines = excerpt.splitlines() if excerpt else []

    heading_line = next(
        (line.rstrip() for line in excerpt_lines if "<h2" in line.lower()),
        "<h2>Table of Contents</h2>",
    )
    ul_line = next(
        (
            line.rstrip()
            for line in excerpt_lines
            if "<ul" in line.lower() and "chapter-list" in line.lower()
        ),
        '    <ul class="chapter-list">',
    )
    # Hoist the indent regex out of the f-strings: a backslash inside an
    # f-string expression is a SyntaxError before Python 3.12 (PEP 701),
    # and this also avoids matching the same line twice.
    ul_indent = re.match(r"^\s*", ul_line).group(0)
    li_indent = next(
        (
            re.match(r"^\s*", line).group(0)
            for line in excerpt_lines
            if "<li><a " in line
        ),
        ul_indent + "    ",
    )
    closing_line = next(
        (line.rstrip() for line in excerpt_lines if "</ul>" in line.lower()),
        f"{ul_indent}</ul>",
    )

    lines = [heading_line, ul_line]
    lines.extend(
        f'{li_indent}<li><a href="{href}">{title}</a></li>'
        for href, title in entries
    )
    lines.append(closing_line)
    return "\n".join(lines)
| 297 | + | |
| 298 | + | |
def build_html_toc_edit_call_template(index_path: str | Path) -> str | None:
    """Build one concrete `edit(...)` template for replacing the TOC block.

    Returns None unless both the current excerpt and a regenerated
    replacement block are available for this index page.
    """

    index = Path(index_path).expanduser()
    excerpt = extract_html_toc_excerpt(index, max_lines=64)
    replacement = build_html_toc_replacement_block(index)
    if not excerpt or not replacement:
        return None

    template_lines = [
        "edit(",
        f'    file_path="{index}",',
        '    old_string="""',
        excerpt,
        '""",',
        '    new_string="""',
        replacement,
        '"""',
        ")",
    ]
    return "\n".join(template_lines)
| 321 | + | |
| 322 | + | |
@dataclass(frozen=True)
class HtmlTocValidationResult:
    """Semantic validation result for one chapter-list table of contents."""

    # Overall verdict: True only when every link resolves and every label matches.
    valid: bool
    # Number of <a> links found inside the chapter-list <ul>.
    link_count: int
    # "href -> missing" entries for links whose target file does not exist.
    missing: tuple[str, ...] = ()
    # "href -> label != title" entries where the link label disagrees with the file's title.
    mismatched: tuple[str, ...] = ()
| 332 | + | |
def validate_html_toc(index_path: str | Path) -> HtmlTocValidationResult | None:
    """Validate that one HTML index TOC points at real chapter files with matching titles.

    Returns None when the path is not an index.html or cannot be read;
    otherwise a result describing missing targets and label/title mismatches.
    """

    index = Path(index_path).expanduser()
    if index.name != "index.html":
        return None

    try:
        html = index.read_text()
    except OSError:
        return None

    toc_match = re.search(r'<ul class="chapter-list">(.*?)</ul>', html, re.S)
    if toc_match is None:
        return HtmlTocValidationResult(
            valid=False,
            link_count=0,
            missing=("Missing chapter-list table of contents",),
        )

    links = re.findall(r'<a href="([^"]+)">([^<]+)</a>', toc_match.group(1))
    if not links:
        return HtmlTocValidationResult(
            valid=False,
            link_count=0,
            missing=("No chapter links found in table of contents",),
        )

    base_dir = index.parent
    missing: list[str] = []
    mismatched: list[str] = []
    for href, label in links:
        target = (base_dir / href).expanduser().resolve(strict=False)
        if not target.exists():
            missing.append(f"{href} -> missing")
            continue
        # An unreadable/empty title is tolerated; only a conflicting one counts.
        actual_title = read_html_title(target)
        if actual_title and label.strip() != actual_title:
            mismatched.append(f"{href} -> {label.strip()} != {actual_title}")

    return HtmlTocValidationResult(
        valid=not missing and not mismatched,
        link_count=len(links),
        missing=tuple(missing),
        mismatched=tuple(mismatched),
    )
| 380 | + | |
| 11 | 381 | class ActionTracker: |
| 12 | 382 | """Tracks completed actions to prevent duplicates and detect loops.""" |
| 13 | 383 | |
@@ -19,6 +389,8 @@ class ActionTracker: | ||
| 19 | 389 | READ_REPEAT_THRESHOLD = 3 |
| 20 | 390 | SEARCH_REPEAT_THRESHOLD = 2 |
| 21 | 391 | BASH_OBSERVATION_REPEAT_THRESHOLD = 2 |
| 392 | + HTML_CHAPTER_EVIDENCE_THRESHOLD = 3 | |
| 393 | + RECENT_PATH_CONTEXT_LIMIT = 12 | |
| 22 | 394 | |
| 23 | 395 | def __init__(self) -> None: |
| 24 | 396 | self._file_writes: dict[str, list[str]] = {} |
@@ -32,6 +404,10 @@ class ActionTracker: | ||
| 32 | 404 | self._recent_reads: dict[str, tuple[int, int, int]] = {} |
| 33 | 405 | self._recent_searches: dict[str, tuple[int, int, int]] = {} |
| 34 | 406 | self._recent_bash_observations: dict[str, tuple[int, int, int]] = {} |
| 407 | + self._recent_html_directory_reads: dict[str, tuple[int, set[str]]] = {} | |
| 408 | + self._recent_path_contexts: list[str] = [] | |
| 409 | + self._validated_html_tocs: dict[str, int] = {} | |
| 410 | + self._verified_html_inventory_dirs: set[str] = set() | |
| 35 | 411 | |
| 36 | 412 | def reset(self) -> None: |
| 37 | 413 | self._file_writes.clear() |
@@ -45,6 +421,10 @@ class ActionTracker: | ||
| 45 | 421 | self._recent_reads.clear() |
| 46 | 422 | self._recent_searches.clear() |
| 47 | 423 | self._recent_bash_observations.clear() |
| 424 | + self._recent_html_directory_reads.clear() | |
| 425 | + self._recent_path_contexts.clear() | |
| 426 | + self._validated_html_tocs.clear() | |
| 427 | + self._verified_html_inventory_dirs.clear() | |
| 48 | 428 | |
| 49 | 429 | def _normalize_path(self, path: str) -> str: |
| 50 | 430 | expanded = Path(path).expanduser() |
@@ -111,6 +491,25 @@ class ActionTracker: | ||
| 111 | 491 | def record_mkdir(self, dir_path: str) -> None: |
| 112 | 492 | self._dirs_created.add(self._normalize_path(dir_path)) |
| 113 | 493 | |
| 494 | + def recent_path_contexts(self) -> list[str]: | |
| 495 | + return list(self._recent_path_contexts) | |
| 496 | + | |
| 497 | + def note_validated_html_toc(self, index_path: str) -> None: | |
| 498 | + """Record that one index currently satisfies the semantic chapter-link check.""" | |
| 499 | + | |
| 500 | + normalized = self._normalize_path(index_path) | |
| 501 | + if Path(normalized).name != "index.html": | |
| 502 | + return | |
| 503 | + self._validated_html_tocs[normalized] = self._mutation_epoch | |
| 504 | + | |
| 505 | + def note_verified_html_inventory(self, index_path: str) -> None: | |
| 506 | + """Record that one sibling chapter inventory is already known exactly.""" | |
| 507 | + | |
| 508 | + normalized = self._normalize_path(index_path) | |
| 509 | + path = Path(normalized) | |
| 510 | + chapters_dir = path if path.name == "chapters" else path.parent / "chapters" | |
| 511 | + self._verified_html_inventory_dirs.add(self._normalize_path(str(chapters_dir))) | |
| 512 | + | |
| 114 | 513 | def check_tool_call(self, tool_name: str, arguments: dict) -> tuple[bool, str]: |
| 115 | 514 | if tool_name == "write": |
| 116 | 515 | file_path = arguments.get("file_path", "") |
@@ -136,8 +535,28 @@ class ActionTracker: | ||
| 136 | 535 | return True, f"Same patch already applied to: {file_path}" |
| 137 | 536 | |
| 138 | 537 | elif tool_name == "read": |
| 538 | + inventory_duplicate, inventory_reason = self._check_verified_html_inventory_observation( | |
| 539 | + tool_name, | |
| 540 | + arguments, | |
| 541 | + ) | |
| 542 | + if inventory_duplicate: | |
| 543 | + return True, inventory_reason | |
| 544 | + validated_duplicate, validated_reason = self._check_validated_html_toc_observation( | |
| 545 | + tool_name, | |
| 546 | + arguments, | |
| 547 | + ) | |
| 548 | + if validated_duplicate: | |
| 549 | + return True, validated_reason | |
| 139 | 550 | read_key = self._make_read_key(arguments) |
| 140 | 551 | if read_key: |
| 552 | + sufficiency_duplicate, sufficiency_reason = ( | |
| 553 | + self._check_html_observation_sufficiency( | |
| 554 | + tool_name, | |
| 555 | + arguments, | |
| 556 | + ) | |
| 557 | + ) | |
| 558 | + if sufficiency_duplicate: | |
| 559 | + return True, sufficiency_reason | |
| 141 | 560 | duplicate, reason = self._check_recent_observation( |
| 142 | 561 | self._recent_reads, |
| 143 | 562 | read_key, |
@@ -153,8 +572,28 @@ class ActionTracker: | ||
| 153 | 572 | return True, reason |
| 154 | 573 | |
| 155 | 574 | elif tool_name in {"glob", "grep"}: |
| 575 | + inventory_duplicate, inventory_reason = self._check_verified_html_inventory_observation( | |
| 576 | + tool_name, | |
| 577 | + arguments, | |
| 578 | + ) | |
| 579 | + if inventory_duplicate: | |
| 580 | + return True, inventory_reason | |
| 581 | + validated_duplicate, validated_reason = self._check_validated_html_toc_observation( | |
| 582 | + tool_name, | |
| 583 | + arguments, | |
| 584 | + ) | |
| 585 | + if validated_duplicate: | |
| 586 | + return True, validated_reason | |
| 156 | 587 | observation_key = self._make_search_key(tool_name, arguments) |
| 157 | 588 | if observation_key: |
| 589 | + sufficiency_duplicate, sufficiency_reason = ( | |
| 590 | + self._check_html_observation_sufficiency( | |
| 591 | + tool_name, | |
| 592 | + arguments, | |
| 593 | + ) | |
| 594 | + ) | |
| 595 | + if sufficiency_duplicate: | |
| 596 | + return True, sufficiency_reason | |
| 158 | 597 | duplicate, reason = self._check_recent_observation( |
| 159 | 598 | self._recent_searches, |
| 160 | 599 | observation_key, |
@@ -170,6 +609,18 @@ class ActionTracker: | ||
| 170 | 609 | elif tool_name == "bash": |
| 171 | 610 | command = str(arguments.get("command", "")).strip() |
| 172 | 611 | if self._is_observational_bash(command): |
| 612 | + inventory_duplicate, inventory_reason = self._check_verified_html_inventory_observation( | |
| 613 | + tool_name, | |
| 614 | + arguments, | |
| 615 | + ) | |
| 616 | + if inventory_duplicate: | |
| 617 | + return True, inventory_reason | |
| 618 | + validated_duplicate, validated_reason = self._check_validated_html_toc_observation( | |
| 619 | + tool_name, | |
| 620 | + arguments, | |
| 621 | + ) | |
| 622 | + if validated_duplicate: | |
| 623 | + return True, validated_reason | |
| 173 | 624 | duplicate, reason = self._check_recent_observation( |
| 174 | 625 | self._recent_bash_observations, |
| 175 | 626 | self._normalize_command(command), |
@@ -198,6 +649,8 @@ class ActionTracker: | ||
| 198 | 649 | content = arguments.get("content", "") |
| 199 | 650 | if file_path: |
| 200 | 651 | self.record_file_create(file_path, content) |
| 652 | + self._record_path_context(file_path) | |
| 653 | + self._clear_verified_html_inventory_for_path(file_path) | |
| 201 | 654 | self._note_mutation() |
| 202 | 655 | |
| 203 | 656 | elif tool_name == "edit": |
@@ -206,6 +659,8 @@ class ActionTracker: | ||
| 206 | 659 | new_string = arguments.get("new_string", "") |
| 207 | 660 | if file_path: |
| 208 | 661 | self.record_edit(file_path, old_string, new_string) |
| 662 | + self._record_path_context(file_path) | |
| 663 | + self._clear_verified_html_inventory_for_path(file_path) | |
| 209 | 664 | self._note_mutation() |
| 210 | 665 | |
| 211 | 666 | elif tool_name == "patch": |
@@ -217,6 +672,8 @@ class ActionTracker: | ||
| 217 | 672 | self.record_edit(file_path, str(hunks), "structured_patch") |
| 218 | 673 | elif isinstance(raw_patch, str) and raw_patch.strip(): |
| 219 | 674 | self.record_edit(file_path, raw_patch, "raw_patch") |
| 675 | + self._record_path_context(file_path) | |
| 676 | + self._clear_verified_html_inventory_for_path(file_path) | |
| 220 | 677 | self._note_mutation() |
| 221 | 678 | |
| 222 | 679 | elif tool_name == "read": |
@@ -226,6 +683,10 @@ class ActionTracker: | ||
| 226 | 683 | self._recent_reads, |
| 227 | 684 | read_key, |
| 228 | 685 | ) |
| 686 | + file_path = str(arguments.get("file_path", "")).strip() | |
| 687 | + if file_path: | |
| 688 | + self._record_path_context(file_path) | |
| 689 | + self._record_html_directory_read(arguments) | |
| 229 | 690 | |
| 230 | 691 | elif tool_name in {"glob", "grep"}: |
| 231 | 692 | observation_key = self._make_search_key(tool_name, arguments) |
@@ -234,12 +695,18 @@ class ActionTracker: | ||
| 234 | 695 | self._recent_searches, |
| 235 | 696 | observation_key, |
| 236 | 697 | ) |
| 698 | + search_path = str(arguments.get("path", "")).strip() | |
| 699 | + if search_path: | |
| 700 | + self._record_path_context(search_path, is_directory_hint=True) | |
| 237 | 701 | |
| 238 | 702 | elif tool_name == "bash": |
| 239 | 703 | command = arguments.get("command", "") |
| 240 | 704 | if command: |
| 241 | 705 | self.record_command(command) |
| 242 | 706 | if self._is_mutating_bash(command): |
| 707 | + target = extract_shell_text_rewrite_target(command) | |
| 708 | + if target: | |
| 709 | + self._clear_verified_html_inventory_for_path(target) | |
| 243 | 710 | self._note_mutation() |
| 244 | 711 | elif self._is_observational_bash(command): |
| 245 | 712 | self._record_observation( |
@@ -411,6 +878,8 @@ class ActionTracker: | ||
| 411 | 878 | norm_cmd = self._normalize_command(command) |
| 412 | 879 | if not norm_cmd: |
| 413 | 880 | return False |
| 881 | + if extract_shell_text_rewrite_target(norm_cmd) is not None: | |
| 882 | + return True | |
| 414 | 883 | mutating_fragments = ( |
| 415 | 884 | " >", |
| 416 | 885 | ">>", |
@@ -436,6 +905,248 @@ class ActionTracker: | ||
| 436 | 905 | return False |
| 437 | 906 | return argv[0] in {"touch", "mkdir", "rm", "mv", "cp", "chmod", "chown"} |
| 438 | 907 | |
| 908 | + def _record_path_context(self, path_value: str, *, is_directory_hint: bool = False) -> None: | |
| 909 | + normalized = self._normalize_path(path_value) | |
| 910 | + path = Path(normalized) | |
| 911 | + primary_dir = path if is_directory_hint or path.is_dir() else path.parent | |
| 912 | + candidate_dirs = [primary_dir] | |
| 913 | + if primary_dir.parent != primary_dir: | |
| 914 | + candidate_dirs.append(primary_dir.parent) | |
| 915 | + | |
| 916 | + for candidate_dir in candidate_dirs: | |
| 917 | + normalized_dir = self._normalize_path(str(candidate_dir)) | |
| 918 | + if normalized_dir in self._recent_path_contexts: | |
| 919 | + self._recent_path_contexts.remove(normalized_dir) | |
| 920 | + self._recent_path_contexts.insert(0, normalized_dir) | |
| 921 | + | |
| 922 | + if len(self._recent_path_contexts) > self.RECENT_PATH_CONTEXT_LIMIT: | |
| 923 | + del self._recent_path_contexts[self.RECENT_PATH_CONTEXT_LIMIT :] | |
| 924 | + | |
| 925 | + def _record_html_directory_read(self, arguments: dict) -> None: | |
| 926 | + file_path = str(arguments.get("file_path", "")).strip() | |
| 927 | + if not file_path: | |
| 928 | + return | |
| 929 | + normalized_path = self._normalize_path(file_path) | |
| 930 | + path = Path(normalized_path) | |
| 931 | + if path.suffix != ".html" or path.name == "index.html" or path.parent.name != "chapters": | |
| 932 | + return | |
| 933 | + | |
| 934 | + directory = str(path.parent) | |
| 935 | + last_seen = self._recent_html_directory_reads.get(directory) | |
| 936 | + if last_seen is None or last_seen[0] != self._mutation_epoch: | |
| 937 | + self._recent_html_directory_reads[directory] = ( | |
| 938 | + self._mutation_epoch, | |
| 939 | + {path.name}, | |
| 940 | + ) | |
| 941 | + return | |
| 942 | + | |
| 943 | + _, seen_files = last_seen | |
| 944 | + updated = set(seen_files) | |
| 945 | + updated.add(path.name) | |
| 946 | + self._recent_html_directory_reads[directory] = ( | |
| 947 | + self._mutation_epoch, | |
| 948 | + updated, | |
| 949 | + ) | |
| 950 | + | |
| 951 | + def _check_html_observation_sufficiency( | |
| 952 | + self, | |
| 953 | + tool_name: str, | |
| 954 | + arguments: dict, | |
| 955 | + ) -> tuple[bool, str]: | |
| 956 | + if tool_name == "read": | |
| 957 | + file_path = str(arguments.get("file_path", "")).strip() | |
| 958 | + if not file_path: | |
| 959 | + return False, "" | |
| 960 | + normalized_path = self._normalize_path(file_path) | |
| 961 | + path = Path(normalized_path) | |
| 962 | + if path.name != "index.html": | |
| 963 | + return False, "" | |
| 964 | + chapters_dir = str(path.parent / "chapters") | |
| 965 | + chapter_count = self._chapter_evidence_count(chapters_dir) | |
| 966 | + if chapter_count < self.HTML_CHAPTER_EVIDENCE_THRESHOLD: | |
| 967 | + return False, "" | |
| 968 | + read_key = self._make_read_key(arguments) | |
| 969 | + if read_key is None: | |
| 970 | + return False, "" | |
| 971 | + last_seen = self._recent_reads.get(read_key) | |
| 972 | + if last_seen is None: | |
| 973 | + return False, "" | |
| 974 | + _, _, repeat_count = last_seen | |
| 975 | + if repeat_count < 2: | |
| 976 | + return False, "" | |
| 977 | + return ( | |
| 978 | + True, | |
| 979 | + "Already confirmed multiple chapter files in the sibling chapters " | |
| 980 | + "directory; reuse the known file/title evidence and update index.html " | |
| 981 | + "instead of rereading it", | |
| 982 | + ) | |
| 983 | + | |
| 984 | + if tool_name in {"glob", "grep"}: | |
| 985 | + search_path = str(arguments.get("path", "")).strip() | |
| 986 | + if not search_path: | |
| 987 | + return False, "" | |
| 988 | + normalized_path = self._normalize_path(search_path) | |
| 989 | + path = Path(normalized_path) | |
| 990 | + if path.name != "chapters": | |
| 991 | + return False, "" | |
| 992 | + chapter_count = self._chapter_evidence_count(str(path)) | |
| 993 | + if chapter_count < self.HTML_CHAPTER_EVIDENCE_THRESHOLD: | |
| 994 | + return False, "" | |
| 995 | + observation_key = self._make_search_key(tool_name, arguments) | |
| 996 | + if observation_key is None or observation_key not in self._recent_searches: | |
| 997 | + return False, "" | |
| 998 | + return ( | |
| 999 | + True, | |
| 1000 | + "Already confirmed multiple chapter files in this directory; reuse " | |
| 1001 | + "the known filename/title evidence and update the target index instead " | |
| 1002 | + "of rerunning the directory search", | |
| 1003 | + ) | |
| 1004 | + | |
| 1005 | + return False, "" | |
| 1006 | + | |
| 1007 | + def _chapter_evidence_count(self, directory: str) -> int: | |
| 1008 | + last_seen = self._recent_html_directory_reads.get(directory) | |
| 1009 | + if last_seen is None: | |
| 1010 | + return 0 | |
| 1011 | + last_epoch, seen_files = last_seen | |
| 1012 | + if last_epoch != self._mutation_epoch: | |
| 1013 | + return 0 | |
| 1014 | + return len(seen_files) | |
| 1015 | + | |
| 1016 | + def _check_validated_html_toc_observation( | |
| 1017 | + self, | |
| 1018 | + tool_name: str, | |
| 1019 | + arguments: dict, | |
| 1020 | + ) -> tuple[bool, str]: | |
| 1021 | + related_paths = self._validated_html_related_paths(tool_name, arguments) | |
| 1022 | + if not related_paths: | |
| 1023 | + return False, "" | |
| 1024 | + | |
| 1025 | + for path in related_paths: | |
| 1026 | + if self._matches_validated_html_toc(path): | |
| 1027 | + return ( | |
| 1028 | + True, | |
| 1029 | + "The current index.html already passes the validated chapter-link " | |
| 1030 | + "check; stop rereading index.html or chapters/ and finish the task " | |
| 1031 | + "unless a specific href or title is still unresolved", | |
| 1032 | + ) | |
| 1033 | + return False, "" | |
| 1034 | + | |
| 1035 | + def _check_verified_html_inventory_observation( | |
| 1036 | + self, | |
| 1037 | + tool_name: str, | |
| 1038 | + arguments: dict, | |
| 1039 | + ) -> tuple[bool, str]: | |
| 1040 | + related_paths = self._verified_inventory_related_paths(tool_name, arguments) | |
| 1041 | + if not related_paths: | |
| 1042 | + return False, "" | |
| 1043 | + | |
| 1044 | + for path in related_paths: | |
| 1045 | + if self._matches_verified_html_inventory(path): | |
| 1046 | + return ( | |
| 1047 | + True, | |
| 1048 | + "The verified chapter inventory already lists the exact href/title " | |
| 1049 | + "pairs for this directory; update index.html from that inventory " | |
| 1050 | + "instead of rereading chapter files", | |
| 1051 | + ) | |
| 1052 | + return False, "" | |
| 1053 | + | |
| 1054 | + def _validated_html_related_paths( | |
| 1055 | + self, | |
| 1056 | + tool_name: str, | |
| 1057 | + arguments: dict, | |
| 1058 | + ) -> list[str]: | |
| 1059 | + if tool_name == "read": | |
| 1060 | + file_path = str(arguments.get("file_path", "")).strip() | |
| 1061 | + return [self._normalize_path(file_path)] if file_path else [] | |
| 1062 | + | |
| 1063 | + if tool_name in {"glob", "grep"}: | |
| 1064 | + search_path = str(arguments.get("path", "")).strip() | |
| 1065 | + return [self._normalize_path(search_path)] if search_path else [] | |
| 1066 | + | |
| 1067 | + if tool_name == "bash": | |
| 1068 | + command = str(arguments.get("command", "")).strip() | |
| 1069 | + if not command: | |
| 1070 | + return [] | |
| 1071 | + return self._extract_observational_bash_paths(command) | |
| 1072 | + | |
| 1073 | + return [] | |
| 1074 | + | |
| 1075 | + def _verified_inventory_related_paths( | |
| 1076 | + self, | |
| 1077 | + tool_name: str, | |
| 1078 | + arguments: dict, | |
| 1079 | + ) -> list[str]: | |
| 1080 | + if tool_name == "read": | |
| 1081 | + file_path = str(arguments.get("file_path", "")).strip() | |
| 1082 | + return [self._normalize_path(file_path)] if file_path else [] | |
| 1083 | + | |
| 1084 | + if tool_name in {"glob", "grep"}: | |
| 1085 | + search_path = str(arguments.get("path", "")).strip() | |
| 1086 | + return [self._normalize_path(search_path)] if search_path else [] | |
| 1087 | + | |
| 1088 | + if tool_name == "bash": | |
| 1089 | + command = str(arguments.get("command", "")).strip() | |
| 1090 | + if not command: | |
| 1091 | + return [] | |
| 1092 | + return self._extract_observational_bash_paths(command) | |
| 1093 | + | |
| 1094 | + return [] | |
| 1095 | + | |
| 1096 | + def _matches_validated_html_toc(self, path: str) -> bool: | |
| 1097 | + normalized = self._normalize_path(path) | |
| 1098 | + candidate = Path(normalized) | |
| 1099 | + for index_path, epoch in self._validated_html_tocs.items(): | |
| 1100 | + if epoch != self._mutation_epoch: | |
| 1101 | + continue | |
| 1102 | + index = Path(index_path) | |
| 1103 | + chapters = Path(self._normalize_path(str(index.parent / "chapters"))) | |
| 1104 | + if candidate == index or candidate == chapters: | |
| 1105 | + return True | |
| 1106 | + if candidate.parent == chapters: | |
| 1107 | + return True | |
| 1108 | + return False | |
| 1109 | + | |
| 1110 | + def _matches_verified_html_inventory(self, path: str) -> bool: | |
| 1111 | + normalized = self._normalize_path(path) | |
| 1112 | + candidate = Path(normalized) | |
| 1113 | + for directory in self._verified_html_inventory_dirs: | |
| 1114 | + chapters = Path(directory) | |
| 1115 | + if candidate == chapters or candidate.parent == chapters: | |
| 1116 | + return True | |
| 1117 | + return False | |
| 1118 | + | |
| 1119 | + def _clear_verified_html_inventory_for_path(self, path_value: str) -> None: | |
| 1120 | + normalized = self._normalize_path(path_value) | |
| 1121 | + candidate = Path(normalized) | |
| 1122 | + stale: set[str] = set() | |
| 1123 | + for directory in self._verified_html_inventory_dirs: | |
| 1124 | + chapters = Path(directory) | |
| 1125 | + if candidate == chapters or candidate.parent == chapters: | |
| 1126 | + stale.add(directory) | |
| 1127 | + self._verified_html_inventory_dirs.difference_update(stale) | |
| 1128 | + | |
| 1129 | + def _extract_observational_bash_paths(self, command: str) -> list[str]: | |
| 1130 | + norm_cmd = self._normalize_command(command) | |
| 1131 | + try: | |
| 1132 | + argv = shlex.split(norm_cmd) | |
| 1133 | + except ValueError: | |
| 1134 | + return [] | |
| 1135 | + if not argv: | |
| 1136 | + return [] | |
| 1137 | + | |
| 1138 | + paths: list[str] = [] | |
| 1139 | + for token in argv[1:]: | |
| 1140 | + candidate = _strip_shell_token(token) | |
| 1141 | + if not candidate or candidate.startswith("-"): | |
| 1142 | + continue | |
| 1143 | + if any(marker in candidate for marker in ("/", "~")) or Path(candidate).suffix == ".html": | |
| 1144 | + paths.append(self._normalize_path(candidate)) | |
| 1145 | + continue | |
| 1146 | + if candidate.rstrip("/").endswith("chapters"): | |
| 1147 | + paths.append(self._normalize_path(candidate)) | |
| 1148 | + return paths | |
| 1149 | + | |
| 439 | 1150 | |
| 440 | 1151 | @dataclass |
| 441 | 1152 | class ValidationResult: |
@@ -500,9 +1211,20 @@ class PreActionValidator: | ||
| 500 | 1211 | return ValidationResult( |
| 501 | 1212 | valid=False, |
| 502 | 1213 | reason=reason, |
| 503 | - suggestion="This command is too dangerous to execute", | |
| 504 | - severity="block", | |
| 505 | - ) | |
| 1214 | + suggestion="This command is too dangerous to execute", | |
| 1215 | + severity="block", | |
| 1216 | + ) | |
| 1217 | + | |
| 1218 | + rewrite_target = extract_shell_text_rewrite_target(str(command)) | |
| 1219 | + if rewrite_target is not None: | |
| 1220 | + return ValidationResult( | |
| 1221 | + valid=False, | |
| 1222 | + reason="Shell-based text rewrites are brittle and bypass Loader's safer file tools", | |
| 1223 | + suggestion=( | |
| 1224 | + f"Use edit/patch/write for `{rewrite_target}` instead of rewriting it with bash" | |
| 1225 | + ), | |
| 1226 | + severity="error", | |
| 1227 | + ) | |
| 506 | 1228 | |
| 507 | 1229 | for pattern, reason in self.SUSPICIOUS_PATTERNS: |
| 508 | 1230 | if re.search(pattern, command): |
@@ -607,6 +1329,10 @@ class PreActionValidator: | ||
| 607 | 1329 | severity="error", |
| 608 | 1330 | ) |
| 609 | 1331 | |
| 1332 | + html_index_result = self._validate_html_index_links(str(file_path), str(new_string)) | |
| 1333 | + if not html_index_result.valid: | |
| 1334 | + return html_index_result | |
| 1335 | + | |
| 610 | 1336 | return ValidationResult(valid=True) |
| 611 | 1337 | |
| 612 | 1338 | def _validate_patch(self, arguments: dict) -> ValidationResult: |
@@ -664,6 +1390,130 @@ class PreActionValidator: | ||
| 664 | 1390 | |
| 665 | 1391 | return ValidationResult(valid=True) |
| 666 | 1392 | |
| 1393 | + def _validate_html_index_links( | |
| 1394 | + self, | |
| 1395 | + file_path: str, | |
| 1396 | + content: str, | |
| 1397 | + ) -> ValidationResult: | |
| 1398 | + normalized = Path(file_path).expanduser() | |
| 1399 | + if normalized.name != "index.html" or "<a " not in content: | |
| 1400 | + return ValidationResult(valid=True) | |
| 1401 | + | |
| 1402 | + link_pairs = re.findall(r'<a\s+href="([^"]+)">([^<]+)</a>', content) | |
| 1403 | + if not link_pairs: | |
| 1404 | + return ValidationResult(valid=True) | |
| 1405 | + | |
| 1406 | + root = normalized.parent | |
| 1407 | + missing: list[str] = [] | |
| 1408 | + mismatched: list[str] = [] | |
| 1409 | + for href, label in link_pairs: | |
| 1410 | + target = (root / href).resolve(strict=False) | |
| 1411 | + if not target.exists(): | |
| 1412 | + if href not in missing: | |
| 1413 | + missing.append(href) | |
| 1414 | + continue | |
| 1415 | + | |
| 1416 | + title = read_html_title(target) | |
| 1417 | + if title and label.strip() != title: | |
| 1418 | + if href not in mismatched: | |
| 1419 | + mismatched.append(href) | |
| 1420 | + | |
| 1421 | + if missing: | |
| 1422 | + suggestions = self._suggest_existing_html_targets(root, missing) | |
| 1423 | + preview_items = [ | |
| 1424 | + format_html_inventory_entry(root, root / suggestion) | |
| 1425 | + for suggestion in suggestions | |
| 1426 | + ] | |
| 1427 | + if not preview_items: | |
| 1428 | + preview_items = missing | |
| 1429 | + preview = ", ".join(preview_items[:3]) | |
| 1430 | + if len(preview_items) > 3: | |
| 1431 | + preview += ", ..." | |
| 1432 | + return ValidationResult( | |
| 1433 | + valid=False, | |
| 1434 | + reason="Edited TOC references chapter files that do not exist", | |
| 1435 | + suggestion=( | |
| 1436 | + "Use only existing chapter href/title pairs from beside index.html, for example: " | |
| 1437 | + f"{preview}" | |
| 1438 | + ), | |
| 1439 | + severity="error", | |
| 1440 | + ) | |
| 1441 | + | |
| 1442 | + if mismatched: | |
| 1443 | + exact_entries = [ | |
| 1444 | + format_html_inventory_entry(root, (root / href).resolve(strict=False)) | |
| 1445 | + for href in mismatched | |
| 1446 | + if (root / href).resolve(strict=False).exists() | |
| 1447 | + ] | |
| 1448 | + if not exact_entries: | |
| 1449 | + exact_entries = mismatched | |
| 1450 | + preview = "; ".join(exact_entries[:2]) | |
| 1451 | + if len(exact_entries) > 2: | |
| 1452 | + preview += "; ..." | |
| 1453 | + return ValidationResult( | |
| 1454 | + valid=False, | |
| 1455 | + reason="Edited TOC labels do not match the linked chapter titles", | |
| 1456 | + suggestion=( | |
| 1457 | + "Copy the exact href/title pair from the linked HTML file, for example: " | |
| 1458 | + f"{preview}" | |
| 1459 | + ), | |
| 1460 | + severity="error", | |
| 1461 | + ) | |
| 1462 | + | |
| 1463 | + return ValidationResult(valid=True) | |
| 1464 | + | |
| 1465 | + def _suggest_existing_html_targets(self, root: Path, missing: list[str]) -> list[str]: | |
| 1466 | + available_by_directory: dict[Path, list[str]] = {} | |
| 1467 | + suggestions: list[str] = [] | |
| 1468 | + | |
| 1469 | + for href in missing: | |
| 1470 | + href_path = Path(href) | |
| 1471 | + directory = (root / href_path).parent | |
| 1472 | + if directory not in available_by_directory: | |
| 1473 | + available_by_directory[directory] = sorted( | |
| 1474 | + str(path.relative_to(root)) | |
| 1475 | + for path in directory.glob("*.html") | |
| 1476 | + if path.is_file() | |
| 1477 | + ) | |
| 1478 | + | |
| 1479 | + available = available_by_directory[directory] | |
| 1480 | + if not available: | |
| 1481 | + continue | |
| 1482 | + | |
| 1483 | + missing_name = href_path.name | |
| 1484 | + chapter_match = re.match(r"(\d+)-", missing_name) | |
| 1485 | + preferred = available | |
| 1486 | + if chapter_match is not None: | |
| 1487 | + prefix = f"{chapter_match.group(1)}-" | |
| 1488 | + same_prefix = [ | |
| 1489 | + candidate | |
| 1490 | + for candidate in available | |
| 1491 | + if Path(candidate).name.startswith(prefix) | |
| 1492 | + ] | |
| 1493 | + if same_prefix: | |
| 1494 | + preferred = same_prefix | |
| 1495 | + | |
| 1496 | + matched_names = get_close_matches( | |
| 1497 | + missing_name, | |
| 1498 | + [Path(candidate).name for candidate in preferred], | |
| 1499 | + n=1, | |
| 1500 | + cutoff=0.0, | |
| 1501 | + ) | |
| 1502 | + if matched_names: | |
| 1503 | + matched_name = matched_names[0] | |
| 1504 | + candidate = next( | |
| 1505 | + ( | |
| 1506 | + candidate | |
| 1507 | + for candidate in preferred | |
| 1508 | + if Path(candidate).name == matched_name | |
| 1509 | + ), | |
| 1510 | + None, | |
| 1511 | + ) | |
| 1512 | + if candidate is not None and candidate not in suggestions: | |
| 1513 | + suggestions.append(candidate) | |
| 1514 | + | |
| 1515 | + return suggestions | |
| 1516 | + | |
| 667 | 1517 | def _validate_path(self, file_path: str) -> ValidationResult: |
| 668 | 1518 | if '\x00' in file_path: |
| 669 | 1519 | return ValidationResult( |
src/loader/runtime/tool_batch_recovery.pymodified@@ -13,6 +13,13 @@ from .context import RuntimeContext | ||
| 13 | 13 | from .events import AgentEvent |
| 14 | 14 | from .executor import ToolExecutionOutcome |
| 15 | 15 | from .recovery import RecoveryContext, format_failure_message, format_recovery_prompt |
| 16 | +from .safeguard_services import ( | |
| 17 | + build_html_toc_edit_call_template, | |
| 18 | + build_html_toc_replacement_block, | |
| 19 | + extract_html_toc_excerpt, | |
| 20 | + read_html_title, | |
| 21 | + summarize_html_inventory, | |
| 22 | +) | |
| 16 | 23 | |
| 17 | 24 | EventSink = Callable[[AgentEvent], Awaitable[None]] |
| 18 | 25 | |
@@ -130,34 +137,36 @@ class ToolBatchRecoveryController: | ||
| 130 | 137 | current_task=current_task, |
| 131 | 138 | ) |
| 132 | 139 | actionable_known_state = bool(confirmed_facts and preferred_next_step) |
| 133 | - if not confirmed_facts and not preferred_next_step and not current_task: | |
| 134 | - return prompt | |
| 135 | - | |
| 136 | - lines = [prompt, "", "## CONTINUE FROM KNOWN STATE"] | |
| 137 | - if current_task: | |
| 138 | - lines.append(f"- Current task: {current_task}") | |
| 139 | - if confirmed_facts: | |
| 140 | - lines.append(f"- Confirmed facts: {confirmed_facts}") | |
| 141 | - if preferred_next_step: | |
| 142 | - lines.append(f"- Preferred next step: {preferred_next_step}") | |
| 143 | - lines.append( | |
| 144 | - "- Preserve progress: do not restart by rereading already-confirmed files " | |
| 145 | - "unless you need genuinely new evidence." | |
| 146 | - ) | |
| 147 | - if actionable_known_state: | |
| 148 | - lines.extend( | |
| 149 | - [ | |
| 150 | - "", | |
| 151 | - "## ACTION BIAS FOR THIS RECOVERY", | |
| 152 | - "- The confirmed findings above are already enough to keep moving.", | |
| 153 | - "- Prefer edit/write/patch on the target file over rereading the same files.", | |
| 154 | - "- Only inspect one more file if a specific filename, href, or title is still unknown.", | |
| 155 | - "- Treat the preferred next step as the default path forward.", | |
| 156 | - ] | |
| 140 | + lines = [prompt] | |
| 141 | + if confirmed_facts or preferred_next_step or current_task: | |
| 142 | + lines.extend(["", "## CONTINUE FROM KNOWN STATE"]) | |
| 143 | + if current_task: | |
| 144 | + lines.append(f"- Current task: {current_task}") | |
| 145 | + if confirmed_facts: | |
| 146 | + lines.append(f"- Confirmed facts: {confirmed_facts}") | |
| 147 | + if preferred_next_step: | |
| 148 | + lines.append(f"- Preferred next step: {preferred_next_step}") | |
| 149 | + lines.append( | |
| 150 | + "- Preserve progress: do not restart by rereading already-confirmed files " | |
| 151 | + "unless you need genuinely new evidence." | |
| 157 | 152 | ) |
| 153 | + if actionable_known_state: | |
| 154 | + lines.extend( | |
| 155 | + [ | |
| 156 | + "", | |
| 157 | + "## ACTION BIAS FOR THIS RECOVERY", | |
| 158 | + "- The confirmed findings above are already enough to keep moving.", | |
| 159 | + "- Prefer edit/write/patch on the target file over rereading the same files.", | |
| 160 | + "- Only inspect one more file if a specific filename, href, or title is still unknown.", | |
| 161 | + "- Treat the preferred next step as the default path forward.", | |
| 162 | + ] | |
| 163 | + ) | |
| 158 | 164 | candidate_lines = self._file_not_found_candidate_lines(tool_call, outcome) |
| 159 | 165 | if candidate_lines: |
| 160 | 166 | lines.extend(["", "## LIKELY FILE CANDIDATES", *candidate_lines]) |
| 167 | + target_excerpt_lines = self._target_excerpt_lines(tool_call) | |
| 168 | + if target_excerpt_lines: | |
| 169 | + lines.extend(["", "## CURRENT TARGET EXCERPT", *target_excerpt_lines]) | |
| 161 | 170 | return "\n".join(lines) |
| 162 | 171 | |
| 163 | 172 | def _file_not_found_candidate_lines( |
@@ -184,7 +193,7 @@ class ToolBatchRecoveryController: | ||
| 184 | 193 | if not candidates: |
| 185 | 194 | return [] |
| 186 | 195 | |
| 187 | - names = ", ".join(f"`{Path(candidate).name}`" for candidate in candidates[:3]) | |
| 196 | + names = ", ".join(self._describe_candidate(candidate) for candidate in candidates[:3]) | |
| 188 | 197 | return [ |
| 189 | 198 | f"- Requested file does not exist: `{missing_path}`", |
| 190 | 199 | f"- Closest known files in the same directory: {names}", |
@@ -198,7 +207,7 @@ class ToolBatchRecoveryController: | ||
| 198 | 207 | |
| 199 | 208 | ranked: list[tuple[float, str]] = [] |
| 200 | 209 | seen: set[str] = set() |
| 201 | - for candidate in self._known_file_paths(): | |
| 210 | + for candidate in self._known_file_paths(missing_path): | |
| 202 | 211 | if candidate == missing_path: |
| 203 | 212 | continue |
| 204 | 213 | if str(Path(candidate).parent) != missing_parent: |
@@ -216,7 +225,7 @@ class ToolBatchRecoveryController: | ||
| 216 | 225 | ranked.sort(key=lambda item: (-item[0], item[1])) |
| 217 | 226 | return [candidate for _, candidate in ranked] |
| 218 | 227 | |
| 219 | - def _known_file_paths(self) -> list[str]: | |
| 228 | + def _known_file_paths(self, missing_path: str | None = None) -> list[str]: | |
| 220 | 229 | pattern = re.compile(r"(?:~|/)[^\s`\"']+\.html") |
| 221 | 230 | discovered: list[str] = [] |
| 222 | 231 | seen: set[str] = set() |
@@ -227,8 +236,73 @@ class ToolBatchRecoveryController: | ||
| 227 | 236 | continue |
| 228 | 237 | seen.add(candidate) |
| 229 | 238 | discovered.append(candidate) |
| 239 | + if missing_path: | |
| 240 | + missing = Path(missing_path) | |
| 241 | + parent = missing.parent | |
| 242 | + if parent.is_dir(): | |
| 243 | + sibling_candidates = sorted( | |
| 244 | + child.resolve(strict=False) | |
| 245 | + for child in parent.iterdir() | |
| 246 | + if child.is_file() | |
| 247 | + and child.name != missing.name | |
| 248 | + and ( | |
| 249 | + not missing.suffix | |
| 250 | + or child.suffix == missing.suffix | |
| 251 | + ) | |
| 252 | + ) | |
| 253 | + for child in sibling_candidates: | |
| 254 | + candidate = str(child) | |
| 255 | + if candidate in seen: | |
| 256 | + continue | |
| 257 | + seen.add(candidate) | |
| 258 | + discovered.append(candidate) | |
| 230 | 259 | return discovered |
| 231 | 260 | |
| 261 | + def _describe_candidate(self, candidate: str) -> str: | |
| 262 | + path = Path(candidate) | |
| 263 | + label = f"`{path.name}`" | |
| 264 | + if path.suffix == ".html": | |
| 265 | + title = read_html_title(path) | |
| 266 | + if title: | |
| 267 | + return f"{label} = {title}" | |
| 268 | + return label | |
| 269 | + | |
| 270 | + def _target_excerpt_lines(self, tool_call: ToolCall) -> list[str]: | |
| 271 | + file_path = str( | |
| 272 | + tool_call.arguments.get("file_path") | |
| 273 | + or tool_call.arguments.get("path") | |
| 274 | + or "" | |
| 275 | + ).strip() | |
| 276 | + if not file_path: | |
| 277 | + return [] | |
| 278 | + | |
| 279 | + inventory = summarize_html_inventory(file_path, limit=12) | |
| 280 | + excerpt = extract_html_toc_excerpt(file_path) | |
| 281 | + if not inventory and not excerpt: | |
| 282 | + return [] | |
| 283 | + | |
| 284 | + lines: list[str] = [] | |
| 285 | + if inventory: | |
| 286 | + lines.append(f"- Verified chapter inventory: {inventory}") | |
| 287 | + if excerpt: | |
| 288 | + lines.append("- Current TOC block:") | |
| 289 | + lines.extend(f" {line}" for line in excerpt.splitlines()) | |
| 290 | + replacement = build_html_toc_replacement_block(file_path) | |
| 291 | + if replacement: | |
| 292 | + lines.append("- Suggested replacement block:") | |
| 293 | + lines.extend(f" {line}" for line in replacement.splitlines()) | |
| 294 | + if excerpt and replacement: | |
| 295 | + lines.append("- Exact edit guidance:") | |
| 296 | + lines.append(f" file_path: {file_path}") | |
| 297 | + lines.append(" old_string: use the Current TOC block above exactly") | |
| 298 | + lines.append(" new_string: use the Suggested replacement block above exactly") | |
| 299 | + lines.append(" Do not rewrite the whole file.") | |
| 300 | + edit_template = build_html_toc_edit_call_template(file_path) | |
| 301 | + if edit_template: | |
| 302 | + lines.append("- Suggested edit call:") | |
| 303 | + lines.extend(f" {line}" for line in edit_template.splitlines()) | |
| 304 | + return lines | |
| 305 | + | |
| 232 | 306 | def _canonicalize_path(self, raw_path: str) -> str: |
| 233 | 307 | if not raw_path: |
| 234 | 308 | return "" |
src/loader/runtime/tool_batches.pymodified@@ -32,7 +32,15 @@ from .verification_observations import ( | ||
| 32 | 32 | ) |
| 33 | 33 | from .workflow import sync_todos_to_definition_of_done |
| 34 | 34 | from .workflow import advance_todos_from_tool_call |
| 35 | -from .compaction import infer_preferred_next_step | |
| 35 | +from .compaction import infer_preferred_next_step, summarize_confirmed_facts | |
| 36 | +from .safeguard_services import ( | |
| 37 | + build_html_toc_edit_call_template, | |
| 38 | + build_html_toc_replacement_block, | |
| 39 | + extract_html_toc_excerpt, | |
| 40 | + extract_shell_text_rewrite_target, | |
| 41 | + summarize_html_inventory, | |
| 42 | + validate_html_toc, | |
| 43 | +) | |
| 36 | 44 | |
| 37 | 45 | EventSink = Callable[[AgentEvent], Awaitable[None]] |
| 38 | 46 | ConfirmationHandler = ( |
@@ -70,6 +78,7 @@ class ToolBatchRunner: | ||
| 70 | 78 | self.confidence_gate = confidence_gate or ToolBatchConfidenceGate(context) |
| 71 | 79 | self.recovery_controller = recovery_controller or ToolBatchRecoveryController(context) |
| 72 | 80 | self.verification_gate = verification_gate or ToolBatchVerificationGate(context) |
| 81 | + self._inventory_hint_targets: set[str] = set() | |
| 73 | 82 | |
| 74 | 83 | async def execute_batch( |
| 75 | 84 | self, |
@@ -143,6 +152,7 @@ class ToolBatchRunner: | ||
| 143 | 152 | emit_confirmation=emit_confirmation, |
| 144 | 153 | source=tool_source, |
| 145 | 154 | ) |
| 155 | + executed_tool_call = outcome.tool_call | |
| 146 | 156 | if ( |
| 147 | 157 | outcome.rollback_action is not None |
| 148 | 158 | and self.context.config.reasoning.show_rollback_plan |
@@ -163,7 +173,7 @@ class ToolBatchRunner: | ||
| 163 | 173 | and self.context.config.auto_recover |
| 164 | 174 | ): |
| 165 | 175 | recovery_result = await self.recovery_controller.build_follow_up( |
| 166 | - tool_call=tool_call, | |
| 176 | + tool_call=executed_tool_call, | |
| 167 | 177 | outcome=outcome, |
| 168 | 178 | emit=emit, |
| 169 | 179 | ) |
@@ -174,17 +184,21 @@ class ToolBatchRunner: | ||
| 174 | 184 | |
| 175 | 185 | if outcome.state == ToolExecutionState.EXECUTED and not outcome.is_error: |
| 176 | 186 | loop_response = await self._record_successful_execution( |
| 177 | - tool_call=tool_call, | |
| 187 | + tool_call=executed_tool_call, | |
| 178 | 188 | outcome=outcome, |
| 179 | 189 | dod=dod, |
| 180 | 190 | emit=emit, |
| 181 | 191 | summary=summary, |
| 182 | 192 | ) |
| 183 | 193 | # Mark this tool's label as completed and emit live progress |
| 184 | - label = _tool_call_label(tool_call) | |
| 194 | + label = _tool_call_label(executed_tool_call) | |
| 185 | 195 | if label: |
| 186 | 196 | completed_labels.append(label) |
| 187 | 197 | await _emit_batch_todos() |
| 198 | + self._annotate_verified_html_inventory(executed_tool_call, outcome) | |
| 199 | + self._queue_verified_html_inventory_nudge(executed_tool_call) | |
| 200 | + self._annotate_validated_html_toc_completion(executed_tool_call, outcome) | |
| 201 | + self._queue_validated_html_toc_completion_nudge(executed_tool_call) | |
| 188 | 202 | if loop_response is not None: |
| 189 | 203 | result.halted = True |
| 190 | 204 | result.final_response = loop_response |
@@ -199,7 +213,7 @@ class ToolBatchRunner: | ||
| 199 | 213 | AgentEvent( |
| 200 | 214 | type="tool_result", |
| 201 | 215 | content=outcome.event_content, |
| 202 | - tool_name=tool_call.name, | |
| 216 | + tool_name=executed_tool_call.name, | |
| 203 | 217 | tool_call_id=outcome.tool_call.id, |
| 204 | 218 | tool_metadata=( |
| 205 | 219 | outcome.registry_result.metadata |
@@ -219,6 +233,9 @@ class ToolBatchRunner: | ||
| 219 | 233 | summary.tool_result_messages.append(outcome.message) |
| 220 | 234 | if outcome.state == ToolExecutionState.DUPLICATE: |
| 221 | 235 | self._queue_duplicate_observation_nudge(tool_call) |
| 236 | + elif outcome.state == ToolExecutionState.BLOCKED: | |
| 237 | + self._queue_blocked_shell_rewrite_nudge(tool_call) | |
| 238 | + self._queue_blocked_html_edit_nudge(tool_call, outcome.event_content) | |
| 222 | 239 | |
| 223 | 240 | should_continue = await self.verification_gate.should_continue( |
| 224 | 241 | tool_call=tool_call, |
@@ -258,10 +275,23 @@ class ToolBatchRunner: | ||
| 258 | 275 | return |
| 259 | 276 | |
| 260 | 277 | current_task = getattr(self.context.session, "current_task", None) |
| 278 | + confirmed_facts = summarize_confirmed_facts( | |
| 279 | + self.context.session.messages, | |
| 280 | + max_items=2, | |
| 281 | + ) | |
| 261 | 282 | preferred_next_step = infer_preferred_next_step( |
| 262 | 283 | self.context.session.messages, |
| 263 | 284 | current_task=current_task, |
| 264 | 285 | ) |
| 286 | + if preferred_next_step and confirmed_facts: | |
| 287 | + self.context.queue_steering_message( | |
| 288 | + "Reuse the earlier observation instead of repeating it. " | |
| 289 | + f"Confirmed facts: {confirmed_facts}. " | |
| 290 | + f"{preferred_next_step} " | |
| 291 | + "Only gather more evidence if a specific filename, href, or title is still unknown." | |
| 292 | + ) | |
| 293 | + return | |
| 294 | + | |
| 265 | 295 | if preferred_next_step: |
| 266 | 296 | self.context.queue_steering_message( |
| 267 | 297 | "Reuse the earlier observation instead of repeating it. " |
@@ -288,6 +318,259 @@ class ToolBatchRunner: | ||
| 288 | 318 | "Choose a different next step that makes progress." |
| 289 | 319 | ) |
| 290 | 320 | |
| 321 | + def _queue_blocked_shell_rewrite_nudge(self, tool_call: ToolCall) -> None: | |
| 322 | + """Steer the model back to file tools after a blocked shell text rewrite.""" | |
| 323 | + | |
| 324 | + if tool_call.name != "bash": | |
| 325 | + return | |
| 326 | + | |
| 327 | + target = extract_shell_text_rewrite_target( | |
| 328 | + str(tool_call.arguments.get("command", "")) | |
| 329 | + ) | |
| 330 | + if target is None: | |
| 331 | + return | |
| 332 | + | |
| 333 | + current_task = getattr(self.context.session, "current_task", None) | |
| 334 | + confirmed_facts = summarize_confirmed_facts( | |
| 335 | + self.context.session.messages, | |
| 336 | + max_items=2, | |
| 337 | + ) | |
| 338 | + preferred_next_step = infer_preferred_next_step( | |
| 339 | + self.context.session.messages, | |
| 340 | + current_task=current_task, | |
| 341 | + ) | |
| 342 | + | |
| 343 | + if preferred_next_step and confirmed_facts: | |
| 344 | + self.context.queue_steering_message( | |
| 345 | + "Use Loader's file tools for this text edit instead of a shell rewrite. " | |
| 346 | + f"Confirmed facts: {confirmed_facts}. " | |
| 347 | + f"{preferred_next_step} " | |
| 348 | + f"Target `{target}` with edit/patch/write rather than `bash`." | |
| 349 | + ) | |
| 350 | + return | |
| 351 | + | |
| 352 | + self.context.queue_steering_message( | |
| 353 | + "Use Loader's file tools for this text edit instead of a shell rewrite. " | |
| 354 | + f"Apply the change to `{target}` with edit/patch/write." | |
| 355 | + ) | |
| 356 | + | |
| 357 | + def _queue_blocked_html_edit_nudge(self, tool_call: ToolCall, event_content: str) -> None: | |
| 358 | + """Steer blocked TOC edits back to the confirmed chapter inventory.""" | |
| 359 | + | |
| 360 | + if tool_call.name not in {"edit", "patch"}: | |
| 361 | + return | |
| 362 | + | |
| 363 | + target_path = str(tool_call.arguments.get("file_path", "")).strip() | |
| 364 | + if not target_path.endswith("index.html"): | |
| 365 | + return | |
| 366 | + | |
| 367 | + current_task = getattr(self.context.session, "current_task", None) | |
| 368 | + confirmed_facts = summarize_confirmed_facts( | |
| 369 | + self.context.session.messages, | |
| 370 | + max_items=2, | |
| 371 | + ) | |
| 372 | + preferred_next_step = infer_preferred_next_step( | |
| 373 | + self.context.session.messages, | |
| 374 | + current_task=current_task, | |
| 375 | + ) | |
| 376 | + verified_inventory = summarize_html_inventory(target_path, limit=12) | |
| 377 | + current_excerpt = extract_html_toc_excerpt(target_path) | |
| 378 | + suggested_replacement = build_html_toc_replacement_block(target_path) | |
| 379 | + suggested_call = build_html_toc_edit_call_template(target_path) | |
| 380 | + excerpt_suffix = ( | |
| 381 | + f"\nCurrent TOC block:\n{current_excerpt}" | |
| 382 | + if current_excerpt | |
| 383 | + else "" | |
| 384 | + ) | |
| 385 | + replacement_suffix = ( | |
| 386 | + f"\nSuggested replacement block:\n{suggested_replacement}" | |
| 387 | + if suggested_replacement | |
| 388 | + else "" | |
| 389 | + ) | |
| 390 | + call_suffix = ( | |
| 391 | + f"\nSuggested edit call:\n{suggested_call}" | |
| 392 | + if suggested_call | |
| 393 | + else "" | |
| 394 | + ) | |
| 395 | + | |
| 396 | + if preferred_next_step and confirmed_facts and verified_inventory: | |
| 397 | + self.context.queue_steering_message( | |
| 398 | + "Use the current target contents plus the verified sibling inventory instead of guessing. " | |
| 399 | + f"Confirmed facts: {confirmed_facts}. " | |
| 400 | + f"Known chapter inventory: {verified_inventory}. " | |
| 401 | + f"{preferred_next_step} " | |
| 402 | + "Apply those exact href/title pairs in `index.html`. " | |
| 403 | + "Do not rewrite the whole document. For `edit`, set `old_string` to the " | |
| 404 | + "current TOC block below exactly and set `new_string` to the suggested " | |
| 405 | + "replacement block below exactly." | |
| 406 | + f"{excerpt_suffix}" | |
| 407 | + f"{replacement_suffix}" | |
| 408 | + f"{call_suffix}" | |
| 409 | + ) | |
| 410 | + return | |
| 411 | + | |
| 412 | + if verified_inventory: | |
| 413 | + self.context.queue_steering_message( | |
| 414 | + "Use the current target contents plus the verified sibling inventory instead of guessing. " | |
| 415 | + f"Known chapter inventory: {verified_inventory}. " | |
| 416 | + "Apply those exact href/title pairs in `index.html`. " | |
| 417 | + "Do not rewrite the whole document. For `edit`, set `old_string` to the " | |
| 418 | + "current TOC block below exactly and set `new_string` to the suggested " | |
| 419 | + "replacement block below exactly." | |
| 420 | + f"{excerpt_suffix}" | |
| 421 | + f"{replacement_suffix}" | |
| 422 | + f"{call_suffix}" | |
| 423 | + ) | |
| 424 | + return | |
| 425 | + | |
| 426 | + self.context.queue_steering_message( | |
| 427 | + "Use the current target contents when retrying this `index.html` edit instead of guessing. " | |
| 428 | + f"{excerpt_suffix}".strip() | |
| 429 | + ) | |
| 430 | + | |
| 431 | + def _queue_verified_html_inventory_nudge(self, tool_call: ToolCall) -> None: | |
| 432 | + """Proactively hand off verified chapter inventory after sibling discovery.""" | |
| 433 | + | |
| 434 | + if tool_call.name != "glob": | |
| 435 | + return | |
| 436 | + | |
| 437 | + chapters_path = str(tool_call.arguments.get("path", "")).strip() | |
| 438 | + if not chapters_path.endswith("chapters"): | |
| 439 | + return | |
| 440 | + | |
| 441 | + index_path = str(Path(chapters_path).expanduser().parent / "index.html") | |
| 442 | + if index_path in self._inventory_hint_targets: | |
| 443 | + return | |
| 444 | + | |
| 445 | + current_task = str(getattr(self.context.session, "current_task", "") or "").lower() | |
| 446 | + if not any( | |
| 447 | + hint in current_task | |
| 448 | + for hint in ("href", "link", "links", "table of contents", "chapter", "index.html") | |
| 449 | + ): | |
| 450 | + return | |
| 451 | + | |
| 452 | + verified_inventory = summarize_html_inventory(index_path, limit=12) | |
| 453 | + if not verified_inventory: | |
| 454 | + return | |
| 455 | + | |
| 456 | + self._inventory_hint_targets.add(index_path) | |
| 457 | + self.context.queue_steering_message( | |
| 458 | + "You already have the verified sibling inventory needed for this edit. " | |
| 459 | + f"Known chapter inventory: {verified_inventory}. " | |
| 460 | + f"Update `{index_path}` using those exact href/title pairs instead of rereading files " | |
| 461 | + "unless one specific title is still unknown." | |
| 462 | + ) | |
| 463 | + | |
| 464 | + def _annotate_verified_html_inventory(self, tool_call: ToolCall, outcome) -> None: | |
| 465 | + """Attach verified chapter inventory directly to a successful discovery result.""" | |
| 466 | + | |
| 467 | + if tool_call.name != "glob": | |
| 468 | + return | |
| 469 | + | |
| 470 | + chapters_path = str(tool_call.arguments.get("path", "")).strip() | |
| 471 | + if not chapters_path.endswith("chapters"): | |
| 472 | + return | |
| 473 | + | |
| 474 | + current_task = str(getattr(self.context.session, "current_task", "") or "").lower() | |
| 475 | + if not any( | |
| 476 | + hint in current_task | |
| 477 | + for hint in ("href", "link", "links", "table of contents", "chapter", "index.html") | |
| 478 | + ): | |
| 479 | + return | |
| 480 | + | |
| 481 | + index_path = str(Path(chapters_path).expanduser().parent / "index.html") | |
| 482 | + verified_inventory = summarize_html_inventory(index_path, limit=12) | |
| 483 | + if not verified_inventory: | |
| 484 | + return | |
| 485 | + | |
| 486 | + action_tracker = getattr(self.context.safeguards, "action_tracker", None) | |
| 487 | + note_inventory = getattr(action_tracker, "note_verified_html_inventory", None) | |
| 488 | + if callable(note_inventory): | |
| 489 | + note_inventory(index_path) | |
| 490 | + | |
| 491 | + note = ( | |
| 492 | + "Verified chapter inventory: " | |
| 493 | + f"{verified_inventory}" | |
| 494 | + ) | |
| 495 | + merged_event = outcome.event_content | |
| 496 | + if note not in merged_event: | |
| 497 | + merged_event = f"{note}\n{merged_event}".strip() | |
| 498 | + outcome.event_content = merged_event | |
| 499 | + outcome.result_output = merged_event | |
| 500 | + outcome.message.content = f"{note}\n{outcome.message.content}".strip() | |
| 501 | + if outcome.message.tool_results: | |
| 502 | + outcome.message.tool_results[0].content = merged_event | |
| 503 | + | |
| 504 | + def _annotate_validated_html_toc_completion(self, tool_call: ToolCall, outcome) -> None: | |
| 505 | + """Attach semantic TOC validation evidence to a successful mutating result.""" | |
| 506 | + | |
| 507 | + target_path = self._validated_html_toc_target(tool_call) | |
| 508 | + if target_path is None: | |
| 509 | + return | |
| 510 | + | |
| 511 | + validation = validate_html_toc(target_path) | |
| 512 | + if validation is None or not validation.valid: | |
| 513 | + return | |
| 514 | + | |
| 515 | + action_tracker = getattr(self.context.safeguards, "action_tracker", None) | |
| 516 | + note_validated = getattr(action_tracker, "note_validated_html_toc", None) | |
| 517 | + if callable(note_validated): | |
| 518 | + note_validated(target_path) | |
| 519 | + | |
| 520 | + note = ( | |
| 521 | + "Semantic verification preview: " | |
| 522 | + f"validated {validation.link_count} toc links in {Path(target_path).name}" | |
| 523 | + ) | |
| 524 | + merged_event = outcome.event_content | |
| 525 | + if note not in merged_event: | |
| 526 | + merged_event = f"{merged_event}\n{note}".strip() | |
| 527 | + outcome.event_content = merged_event | |
| 528 | + outcome.result_output = merged_event | |
| 529 | + outcome.message.content = f"{outcome.message.content}\n{note}".strip() | |
| 530 | + if outcome.message.tool_results: | |
| 531 | + outcome.message.tool_results[0].content = merged_event | |
| 532 | + | |
| 533 | + def _queue_validated_html_toc_completion_nudge(self, tool_call: ToolCall) -> None: | |
| 534 | + """Push the next model turn toward finishing once the TOC already validates.""" | |
| 535 | + | |
| 536 | + target_path = self._validated_html_toc_target(tool_call) | |
| 537 | + if target_path is None: | |
| 538 | + return | |
| 539 | + | |
| 540 | + validation = validate_html_toc(target_path) | |
| 541 | + if validation is None or not validation.valid: | |
| 542 | + return | |
| 543 | + | |
| 544 | + self.context.queue_steering_message( | |
| 545 | + "The current `index.html` already satisfies the verified chapter-link constraints. " | |
| 546 | + f"Semantic verification preview: validated {validation.link_count} toc links in " | |
| 547 | + f"`{Path(target_path).name}`. " | |
| 548 | + "Do not reread `index.html` or files in `chapters/` unless a specific href or " | |
| 549 | + "title is still unresolved. Briefly state that the table of contents has been " | |
| 550 | + "updated so Loader can run the verification gate." | |
| 551 | + ) | |
| 552 | + | |
| 553 | + @staticmethod | |
| 554 | + def _validated_html_toc_target(tool_call: ToolCall) -> str | None: | |
| 555 | + """Return the index target for a successful HTML TOC mutation.""" | |
| 556 | + | |
| 557 | + target_path = "" | |
| 558 | + if tool_call.name in {"write", "edit", "patch"}: | |
| 559 | + target_path = str(tool_call.arguments.get("file_path", "")).strip() | |
| 560 | + elif tool_call.name == "bash": | |
| 561 | + target_path = ( | |
| 562 | + extract_shell_text_rewrite_target( | |
| 563 | + str(tool_call.arguments.get("command", "")) | |
| 564 | + ) | |
| 565 | + or "" | |
| 566 | + ).strip() | |
| 567 | + | |
| 568 | + if not target_path: | |
| 569 | + return None | |
| 570 | + if not target_path.endswith("index.html"): | |
| 571 | + return None | |
| 572 | + return str(Path(target_path).expanduser()) | |
| 573 | + | |
| 291 | 574 | async def _record_successful_execution( |
| 292 | 575 | self, |
| 293 | 576 | *, |
src/loader/runtime/turn_preparation.pymodified@@ -104,6 +104,7 @@ class TurnPreparationController: | ||
| 104 | 104 | dod = self.dod_store.create_or_resume( |
| 105 | 105 | effective_task, |
| 106 | 106 | retry_budget=self.context.config.verification_retry_budget, |
| 107 | + resume_path=self.context.session.active_dod_path, | |
| 107 | 108 | ) |
| 108 | 109 | summary.definition_of_done = dod |
| 109 | 110 | |
@@ -158,6 +159,7 @@ class TurnPreparationController: | ||
| 158 | 159 | validator=self.context.safeguards.validator, |
| 159 | 160 | registry=self.context.registry, |
| 160 | 161 | rollback_plan=rollback_plan, |
| 162 | + workspace_root=self.context.project_root, | |
| 161 | 163 | ), |
| 162 | 164 | ) |
| 163 | 165 | return executor, rollback_plan |
src/loader/runtime/workflow.pymodified@@ -124,7 +124,7 @@ _VERIFY_STEP_HINTS = ( | ||
| 124 | 124 | ) |
| 125 | 125 | _SHELL_COMMAND_START = re.compile( |
| 126 | 126 | r"(?<![\w/.-])(" |
| 127 | - r"ls|grep|pytest|uv|python3?|html5validator|cargo|npm|node|mypy|ruff|find|git|cat|sed|head|tail" | |
| 127 | + r"ls|grep|pytest|uv|python3?|html5validator|cargo|npm|node|mypy|ruff|find|git|cat|sed|head|tail|test|diff|cmp|bash|sh|make" | |
| 128 | 128 | r")\b" |
| 129 | 129 | ) |
| 130 | 130 | |
@@ -875,13 +875,11 @@ def _extract_commands(items: list[str]) -> list[str]: | ||
| 875 | 875 | candidate = re.sub(r"^-\s+", "", candidate) |
| 876 | 876 | match = re.match(r"^`(.+)`$", candidate) |
| 877 | 877 | candidate = (match.group(1) if match else candidate).strip() |
| 878 | - if candidate.startswith("#"): | |
| 879 | - candidate = _extract_shell_command_from_text(candidate) | |
| 880 | - if not candidate: | |
| 881 | - continue | |
| 878 | + candidate = _extract_shell_command_from_text(candidate) | |
| 879 | + candidate = candidate.strip().strip("`") | |
| 882 | 880 | if candidate: |
| 883 | 881 | commands.append(candidate) |
| 884 | - return [command for command in commands if command] | |
| 882 | + return _merge_continued_shell_commands([command for command in commands if command]) | |
| 885 | 883 | |
| 886 | 884 | |
| 887 | 885 | def _extract_collapsed_shell_commands(text: str) -> list[str]: |
@@ -911,6 +909,40 @@ def _extract_shell_command_from_text(text: str) -> str: | ||
| 911 | 909 | return text[match.start():].strip() |
| 912 | 910 | |
| 913 | 911 | |
| 912 | +def _merge_continued_shell_commands(commands: list[str]) -> list[str]: | |
| 913 | + merged: list[str] = [] | |
| 914 | + pending: str | None = None | |
| 915 | + | |
| 916 | + for command in commands: | |
| 917 | + stripped = command.strip() | |
| 918 | + if not stripped: | |
| 919 | + continue | |
| 920 | + | |
| 921 | + if pending is not None: | |
| 922 | + combined = f"{pending} {stripped}".strip() | |
| 923 | + if _has_dangling_shell_continuation(combined): | |
| 924 | + pending = combined | |
| 925 | + continue | |
| 926 | + merged.append(combined) | |
| 927 | + pending = None | |
| 928 | + continue | |
| 929 | + | |
| 930 | + if _has_dangling_shell_continuation(stripped): | |
| 931 | + pending = stripped | |
| 932 | + continue | |
| 933 | + merged.append(stripped) | |
| 934 | + | |
| 935 | + if pending is not None: | |
| 936 | + merged.append(pending.rstrip("|& ").strip()) | |
| 937 | + | |
| 938 | + return [command for command in merged if command] | |
| 939 | + | |
| 940 | + | |
| 941 | +def _has_dangling_shell_continuation(command: str) -> bool: | |
| 942 | + stripped = command.rstrip() | |
| 943 | + return stripped.endswith("|") or stripped.endswith("&&") or stripped.endswith("||") | |
| 944 | + | |
| 945 | + | |
| 914 | 946 | def _has_concrete_anchor(task: str) -> bool: |
| 915 | 947 | return any( |
| 916 | 948 | re.search(pattern, task) |
tests/test_compaction.pymodified@@ -8,7 +8,9 @@ from loader.runtime.compaction import ( | ||
| 8 | 8 | build_session_summary, |
| 9 | 9 | compact_session_messages, |
| 10 | 10 | compress_summary, |
| 11 | + infer_preferred_next_step, | |
| 11 | 12 | resolve_auto_compaction_input_tokens_threshold, |
| 13 | + summarize_confirmed_facts, | |
| 12 | 14 | ) |
| 13 | 15 | |
| 14 | 16 | |
@@ -100,6 +102,32 @@ def test_build_session_summary_preserves_confirmed_facts_and_next_step() -> None | ||
| 100 | 102 | ) |
| 101 | 103 | ], |
| 102 | 104 | ), |
| 105 | + Message( | |
| 106 | + role=Role.ASSISTANT, | |
| 107 | + content="Inspecting the setup chapter title.", | |
| 108 | + tool_calls=[ | |
| 109 | + ToolCall( | |
| 110 | + id="read-2", | |
| 111 | + name="read", | |
| 112 | + arguments={"file_path": "~/Loader/guides/fortran/chapters/02-setup.html"}, | |
| 113 | + ) | |
| 114 | + ], | |
| 115 | + ), | |
| 116 | + Message.tool_result_message( | |
| 117 | + tool_call_id="read-2", | |
| 118 | + display_content=( | |
| 119 | + " 1\t<!DOCTYPE html>\n" | |
| 120 | + " 2\t<html>\n" | |
| 121 | + " 61\t<h1>Chapter 2: Setting Up Fortran</h1>\n" | |
| 122 | + " 62\t</html>\n" | |
| 123 | + ), | |
| 124 | + result_content=( | |
| 125 | + " 1\t<!DOCTYPE html>\n" | |
| 126 | + " 2\t<html>\n" | |
| 127 | + " 61\t<h1>Chapter 2: Setting Up Fortran</h1>\n" | |
| 128 | + " 62\t</html>\n" | |
| 129 | + ), | |
| 130 | + ), | |
| 103 | 131 | Message( |
| 104 | 132 | role=Role.TOOL, |
| 105 | 133 | content=( |
@@ -121,11 +149,144 @@ def test_build_session_summary_preserves_confirmed_facts_and_next_step() -> None | ||
| 121 | 149 | |
| 122 | 150 | assert "Confirmed facts:" in summary |
| 123 | 151 | assert "02-basic-syntax.html -> 02-setup.html" in summary |
| 124 | - assert "Existing files include 01-introduction.html" in summary | |
| 152 | + assert "02-setup.html = Chapter 2: Setting Up Fortran" in summary | |
| 125 | 153 | assert "Preferred next step:" in summary |
| 126 | 154 | assert "`~/Loader/guides/fortran/index.html`" in summary |
| 127 | 155 | |
| 128 | 156 | |
| 157 | +def test_summarize_confirmed_facts_extracts_chapter_titles_from_read_results() -> None: | |
| 158 | + messages = [ | |
| 159 | + Message( | |
| 160 | + role=Role.ASSISTANT, | |
| 161 | + content="I will inspect the chapter files.", | |
| 162 | + tool_calls=[ | |
| 163 | + ToolCall( | |
| 164 | + id="read-1", | |
| 165 | + name="read", | |
| 166 | + arguments={"file_path": "/tmp/fortran/chapters/01-introduction.html"}, | |
| 167 | + ), | |
| 168 | + ToolCall( | |
| 169 | + id="read-2", | |
| 170 | + name="read", | |
| 171 | + arguments={"file_path": "/tmp/fortran/chapters/02-setup.html"}, | |
| 172 | + ), | |
| 173 | + ], | |
| 174 | + ), | |
| 175 | + Message.tool_result_message( | |
| 176 | + tool_call_id="read-1", | |
| 177 | + display_content="<h1>Chapter 1: Introduction to Fortran</h1>\n", | |
| 178 | + result_content="<h1>Chapter 1: Introduction to Fortran</h1>\n", | |
| 179 | + ), | |
| 180 | + Message.tool_result_message( | |
| 181 | + tool_call_id="read-2", | |
| 182 | + display_content="<title>Chapter 2: Setting Up Fortran</title>\n", | |
| 183 | + result_content="<title>Chapter 2: Setting Up Fortran</title>\n", | |
| 184 | + ), | |
| 185 | + ] | |
| 186 | + | |
| 187 | + confirmed_facts = summarize_confirmed_facts(messages, max_items=2) | |
| 188 | + | |
| 189 | + assert confirmed_facts is not None | |
| 190 | + assert "Chapter titles confirmed:" in confirmed_facts | |
| 191 | + assert "01-introduction.html = Chapter 1: Introduction to Fortran" in confirmed_facts | |
| 192 | + assert "02-setup.html = Chapter 2: Setting Up Fortran" in confirmed_facts | |
| 193 | + | |
| 194 | + | |
| 195 | +def test_infer_preferred_next_step_uses_confirmed_chapter_pairs() -> None: | |
| 196 | + messages = [ | |
| 197 | + Message( | |
| 198 | + role=Role.ASSISTANT, | |
| 199 | + content="I should inspect the chapter and then update the index.", | |
| 200 | + tool_calls=[ | |
| 201 | + ToolCall( | |
| 202 | + id="read-index", | |
| 203 | + name="read", | |
| 204 | + arguments={"file_path": "/tmp/fortran/index.html"}, | |
| 205 | + ), | |
| 206 | + ToolCall( | |
| 207 | + id="read-1", | |
| 208 | + name="read", | |
| 209 | + arguments={"file_path": "/tmp/fortran/chapters/01-introduction.html"}, | |
| 210 | + ), | |
| 211 | + ], | |
| 212 | + ), | |
| 213 | + Message.tool_result_message( | |
| 214 | + tool_call_id="read-1", | |
| 215 | + display_content="<h1>Chapter 1: Introduction to Fortran</h1>\n", | |
| 216 | + result_content="<h1>Chapter 1: Introduction to Fortran</h1>\n", | |
| 217 | + ), | |
| 218 | + ] | |
| 219 | + | |
| 220 | + next_step = infer_preferred_next_step( | |
| 221 | + messages, | |
| 222 | + current_task="Update /tmp/fortran/index.html so the chapter list matches the real files.", | |
| 223 | + ) | |
| 224 | + | |
| 225 | + assert next_step == ( | |
| 226 | + "Update `/tmp/fortran/index.html` using the confirmed chapter file/title pairs " | |
| 227 | + "instead of rereading files." | |
| 228 | + ) | |
| 229 | + | |
| 230 | + | |
| 231 | +def test_infer_preferred_next_step_uses_latest_verification_gap() -> None: | |
| 232 | + messages = [ | |
| 233 | + Message( | |
| 234 | + role=Role.ASSISTANT, | |
| 235 | + content="I should inspect the chapter and then update the index.", | |
| 236 | + tool_calls=[ | |
| 237 | + ToolCall( | |
| 238 | + id="read-index", | |
| 239 | + name="read", | |
| 240 | + arguments={"file_path": "/tmp/fortran/index.html"}, | |
| 241 | + ), | |
| 242 | + ToolCall( | |
| 243 | + id="read-1", | |
| 244 | + name="read", | |
| 245 | + arguments={"file_path": "/tmp/fortran/chapters/01-introduction.html"}, | |
| 246 | + ), | |
| 247 | + ToolCall( | |
| 248 | + id="verify-1", | |
| 249 | + name="bash", | |
| 250 | + arguments={"command": "python3 - <<'PY'\n...\nPY"}, | |
| 251 | + ), | |
| 252 | + ], | |
| 253 | + ), | |
| 254 | + Message.tool_result_message( | |
| 255 | + tool_call_id="read-1", | |
| 256 | + display_content="<h1>Chapter 1: Introduction to Fortran</h1>\n", | |
| 257 | + result_content="<h1>Chapter 1: Introduction to Fortran</h1>\n", | |
| 258 | + ), | |
| 259 | + Message.tool_result_message( | |
| 260 | + tool_call_id="verify-1", | |
| 261 | + display_content=( | |
| 262 | + "Missing links:\n" | |
| 263 | + "chapters/05-control-structures.html -> missing\n" | |
| 264 | + "chapters/06-input-output.html -> missing\n" | |
| 265 | + ), | |
| 266 | + result_content=( | |
| 267 | + "Missing links:\n" | |
| 268 | + "chapters/05-control-structures.html -> missing\n" | |
| 269 | + "chapters/06-input-output.html -> missing\n" | |
| 270 | + ), | |
| 271 | + is_error=True, | |
| 272 | + ), | |
| 273 | + ] | |
| 274 | + | |
| 275 | + confirmed_facts = summarize_confirmed_facts(messages, max_items=2) | |
| 276 | + next_step = infer_preferred_next_step( | |
| 277 | + messages, | |
| 278 | + current_task="Update /tmp/fortran/index.html so the chapter list matches the real files.", | |
| 279 | + ) | |
| 280 | + | |
| 281 | + assert confirmed_facts is not None | |
| 282 | + assert "Verification gaps: missing TOC links chapters/05-control-structures.html" in confirmed_facts | |
| 283 | + assert next_step == ( | |
| 284 | + "Update `/tmp/fortran/index.html` to fix the specific verification failures " | |
| 285 | + "(missing TOC links chapters/05-control-structures.html, " | |
| 286 | + "chapters/06-input-output.html) instead of restarting discovery." | |
| 287 | + ) | |
| 288 | + | |
| 289 | + | |
| 129 | 290 | def test_compact_session_messages_uses_single_continuation_instruction_block() -> None: |
| 130 | 291 | messages = [ |
| 131 | 292 | Message(role=Role.USER, content="Task framing"), |
tests/test_dod.pymodified@@ -5,7 +5,9 @@ from pathlib import Path | ||
| 5 | 5 | from loader.llm.base import ToolCall |
| 6 | 6 | from loader.runtime.dod import ( |
| 7 | 7 | DefinitionOfDoneStore, |
| 8 | + VerificationEvidence, | |
| 8 | 9 | begin_new_verification_attempt, |
| 10 | + build_verification_summary, | |
| 9 | 11 | create_definition_of_done, |
| 10 | 12 | derive_verification_commands, |
| 11 | 13 | determine_task_size, |
@@ -137,5 +139,25 @@ def test_derive_verification_commands_adds_semantic_html_toc_check(tmp_path: Pat | ||
| 137 | 139 | task_statement=dod.task_statement, |
| 138 | 140 | ) |
| 139 | 141 | |
| 140 | - assert any(command.startswith("/usr/bin/python3 - <<'PY'") for command in commands) | |
| 142 | + assert any(command.startswith("python3 - <<'PY'") for command in commands) | |
| 141 | 143 | assert not any(command == f"test -f {index}" for command in commands) |
| 144 | + | |
| 145 | + | |
| 146 | +def test_build_verification_summary_keeps_concrete_missing_link_details() -> None: | |
| 147 | + summary = build_verification_summary( | |
| 148 | + [ | |
| 149 | + VerificationEvidence( | |
| 150 | + command="python3 - <<'PY' ... PY", | |
| 151 | + passed=False, | |
| 152 | + stderr=( | |
| 153 | + "Missing links:\n" | |
| 154 | + "chapters/05-control-structures.html -> missing\n" | |
| 155 | + "chapters/06-input-output.html -> missing\n" | |
| 156 | + ), | |
| 157 | + ) | |
| 158 | + ] | |
| 159 | + ) | |
| 160 | + | |
| 161 | + assert "Missing links:" in summary | |
| 162 | + assert "chapters/05-control-structures.html -> missing" in summary | |
| 163 | + assert "chapters/06-input-output.html -> missing" in summary | |
tests/test_finalization.pymodified@@ -427,7 +427,7 @@ async def test_turn_finalizer_appends_runtime_semantic_verifier_to_planned_comma | ||
| 427 | 427 | |
| 428 | 428 | assert result.should_continue is False |
| 429 | 429 | assert any(command == 'grep -n "href=" index.html' for command in executor.commands) |
| 430 | - assert any(command.startswith("/usr/bin/python3 - <<'PY'") for command in executor.commands) | |
| 430 | + assert any(command.startswith("python3 - <<'PY'") for command in executor.commands) | |
| 431 | 431 | assert ( |
| 432 | 432 | session.workflow_timeline[-1].verification_observations[0].attempt_id |
| 433 | 433 | == "verification-attempt-1" |
@@ -483,3 +483,130 @@ async def test_turn_finalizer_records_missing_verification_observation( | ||
| 483 | 483 | ) |
| 484 | 484 | assert session.messages[-1].role == Role.USER |
| 485 | 485 | assert session.messages[-1].content.startswith("[DEFINITION OF DONE CHECK FAILED]") |
| 486 | + | |
| 487 | + | |
| 488 | +@pytest.mark.asyncio | |
| 489 | +async def test_turn_finalizer_does_not_reverify_without_new_changes( | |
| 490 | + temp_dir: Path, | |
| 491 | +) -> None: | |
| 492 | + session = FakeSession() | |
| 493 | + context = build_context(temp_dir, session) | |
| 494 | + finalizer = TurnFinalizer( | |
| 495 | + context, | |
| 496 | + RuntimeTracer(), | |
| 497 | + DefinitionOfDoneStore(temp_dir), | |
| 498 | + set_workflow_mode=_noop_set_workflow_mode, | |
| 499 | + ) | |
| 500 | + index = temp_dir / "index.html" | |
| 501 | + index.write_text("<ul></ul>\n") | |
| 502 | + dod = create_definition_of_done("Fix the chapter list in index.html.") | |
| 503 | + dod.mutating_actions.append("edit") | |
| 504 | + dod.touched_files.append(str(index)) | |
| 505 | + dod.line_changes = 12 | |
| 506 | + dod.last_verification_result = "failed" | |
| 507 | + dod.last_verification_signature = ( | |
| 508 | + f"lines={dod.line_changes};touched={index};actions=1;commands=" | |
| 509 | + ) | |
| 510 | + dod.evidence = [] | |
| 511 | + summary = TurnSummary(final_response="") | |
| 512 | + executor = RecordingExecutor() | |
| 513 | + | |
| 514 | + async def capture(event) -> None: | |
| 515 | + return None | |
| 516 | + | |
| 517 | + result = await finalizer.run_definition_of_done_gate( | |
| 518 | + dod=dod, | |
| 519 | + candidate_response="I checked the file again.", | |
| 520 | + emit=capture, | |
| 521 | + summary=summary, | |
| 522 | + executor=executor, # type: ignore[arg-type] | |
| 523 | + ) | |
| 524 | + | |
| 525 | + assert result.should_continue is True | |
| 526 | + assert result.reason_code == "verification_failed_no_new_changes" | |
| 527 | + assert executor.commands == [] | |
| 528 | + assert summary.verification_status == "failed" | |
| 529 | + assert session.messages[-1].content.startswith("[DEFINITION OF DONE CHECK STILL FAILING]") | |
| 530 | + | |
| 531 | + | |
| 532 | +@pytest.mark.asyncio | |
| 533 | +async def test_turn_finalizer_accepts_missing_optional_html5validator_when_semantic_check_passes( | |
| 534 | + temp_dir: Path, | |
| 535 | + monkeypatch: pytest.MonkeyPatch, | |
| 536 | +) -> None: | |
| 537 | + session = FakeSession() | |
| 538 | + context = build_context(temp_dir, session) | |
| 539 | + finalizer = TurnFinalizer( | |
| 540 | + context, | |
| 541 | + RuntimeTracer(), | |
| 542 | + DefinitionOfDoneStore(temp_dir), | |
| 543 | + set_workflow_mode=_noop_set_workflow_mode, | |
| 544 | + ) | |
| 545 | + dod = create_definition_of_done( | |
| 546 | + "Update index.html so the table of contents links and chapter titles are correct." | |
| 547 | + ) | |
| 548 | + dod.mutating_actions.append("edit") | |
| 549 | + dod.touched_files.append(str(temp_dir / "index.html")) | |
| 550 | + dod.verification_commands = [ | |
| 551 | + "python3 - <<'PY'\nprint('semantic ok')\nPY", | |
| 552 | + "html5validator --root /tmp/fortran-qwen-recovery-check/", | |
| 553 | + ] | |
| 554 | + summary = TurnSummary(final_response="") | |
| 555 | + semantic_call = ToolCall( | |
| 556 | + id="verify-1-1", | |
| 557 | + name="bash", | |
| 558 | + arguments={"command": dod.verification_commands[0], "cwd": str(temp_dir)}, | |
| 559 | + ) | |
| 560 | + html5validator_call = ToolCall( | |
| 561 | + id="verify-1-2", | |
| 562 | + name="bash", | |
| 563 | + arguments={"command": dod.verification_commands[1], "cwd": str(temp_dir)}, | |
| 564 | + ) | |
| 565 | + | |
| 566 | + async def capture(event) -> None: | |
| 567 | + return None | |
| 568 | + | |
| 569 | + monkeypatch.setattr( | |
| 570 | + "loader.runtime.finalization.derive_verification_commands", | |
| 571 | + lambda *args, **kwargs: [], | |
| 572 | + ) | |
| 573 | + | |
| 574 | + result = await finalizer.run_definition_of_done_gate( | |
| 575 | + dod=dod, | |
| 576 | + candidate_response="Updated the chapter links and titles.", | |
| 577 | + emit=capture, | |
| 578 | + summary=summary, | |
| 579 | + executor=FakeExecutor( | |
| 580 | + [ | |
| 581 | + tool_outcome( | |
| 582 | + tool_call=semantic_call, | |
| 583 | + output="semantic ok", | |
| 584 | + is_error=False, | |
| 585 | + exit_code=0, | |
| 586 | + stdout="semantic ok", | |
| 587 | + ), | |
| 588 | + tool_outcome( | |
| 589 | + tool_call=html5validator_call, | |
| 590 | + output="/bin/sh: html5validator: command not found", | |
| 591 | + is_error=True, | |
| 592 | + exit_code=127, | |
| 593 | + stderr="/bin/sh: html5validator: command not found", | |
| 594 | + ), | |
| 595 | + ] | |
| 596 | + ), # type: ignore[arg-type] | |
| 597 | + ) | |
| 598 | + | |
| 599 | + assert result.should_continue is False | |
| 600 | + assert result.reason_code == "verification_passed" | |
| 601 | + assert summary.verification_status == "passed" | |
| 602 | + assert dod.status == "done" | |
| 603 | + assert dod.last_verification_result == "passed" | |
| 604 | + assert [item.passed for item in dod.evidence] == [True, False] | |
| 605 | + assert [item.skipped for item in dod.evidence] == [False, True] | |
| 606 | + assert "SKIP" in result.final_response | |
| 607 | + assert "html5validator" in result.final_response | |
| 608 | + assert session.workflow_timeline[-2].reason_code == "verification_command_passed" | |
| 609 | + assert session.workflow_timeline[-1].reason_code == "verification_command_skipped" | |
| 610 | + assert [item.status for item in session.workflow_timeline[-1].verification_observations] == [ | |
| 611 | + VerificationObservationStatus.SKIPPED.value | |
| 612 | + ] | |
tests/test_ollama_backend.pymodified@@ -103,6 +103,47 @@ async def test_ollama_complete_uses_shared_parser_with_allowed_tool_names() -> N | ||
| 103 | 103 | await backend.close() |
| 104 | 104 | |
| 105 | 105 | |
| 106 | +@pytest.mark.asyncio | |
| 107 | +async def test_ollama_complete_canonicalizes_native_tool_aliases() -> None: | |
| 108 | + backend = OllamaBackend() | |
| 109 | + | |
| 110 | + async def fake_describe_model() -> None: | |
| 111 | + return None | |
| 112 | + | |
| 113 | + backend.describe_model = fake_describe_model # type: ignore[method-assign] | |
| 114 | + backend._client = FakeClient( | |
| 115 | + [ | |
| 116 | + FakeResponse( | |
| 117 | + { | |
| 118 | + "message": { | |
| 119 | + "content": "", | |
| 120 | + "tool_calls": [ | |
| 121 | + { | |
| 122 | + "id": "call_read", | |
| 123 | + "function": { | |
| 124 | + "name": "read_file", | |
| 125 | + "arguments": {"file_path": "/tmp/test.txt"}, | |
| 126 | + }, | |
| 127 | + } | |
| 128 | + ], | |
| 129 | + }, | |
| 130 | + "prompt_eval_count": 4, | |
| 131 | + "eval_count": 2, | |
| 132 | + } | |
| 133 | + ) | |
| 134 | + ] | |
| 135 | + ) | |
| 136 | + | |
| 137 | + response = await backend.complete( | |
| 138 | + messages=[], | |
| 139 | + tools=[{"name": "read"}, {"name": "write"}, {"name": "patch"}], | |
| 140 | + ) | |
| 141 | + | |
| 142 | + assert response.tool_calls[0].name == "read" | |
| 143 | + assert response.tool_calls[0].arguments == {"file_path": "/tmp/test.txt"} | |
| 144 | + await backend.close() | |
| 145 | + | |
| 146 | + | |
| 106 | 147 | @pytest.mark.asyncio |
| 107 | 148 | async def test_ollama_stream_response_uses_shared_parser_for_text_tool_calls() -> None: |
| 108 | 149 | backend = OllamaBackend() |
@@ -142,6 +183,80 @@ async def test_ollama_stream_response_uses_shared_parser_for_text_tool_calls() - | ||
| 142 | 183 | await backend.close() |
| 143 | 184 | |
| 144 | 185 | |
| 186 | +@pytest.mark.asyncio | |
| 187 | +async def test_ollama_stream_response_canonicalizes_native_tool_aliases() -> None: | |
| 188 | + backend = OllamaBackend() | |
| 189 | + | |
| 190 | + chunks = [ | |
| 191 | + chunk | |
| 192 | + async for chunk in backend._stream_response( | |
| 193 | + FakeStreamResponse( | |
| 194 | + [ | |
| 195 | + { | |
| 196 | + "message": { | |
| 197 | + "content": "", | |
| 198 | + "tool_calls": [ | |
| 199 | + { | |
| 200 | + "id": "call_read", | |
| 201 | + "function": { | |
| 202 | + "name": "read_file", | |
| 203 | + "arguments": {"file_path": "/tmp/test.txt"}, | |
| 204 | + }, | |
| 205 | + } | |
| 206 | + ], | |
| 207 | + }, | |
| 208 | + "done": True, | |
| 209 | + "prompt_eval_count": 4, | |
| 210 | + "eval_count": 2, | |
| 211 | + } | |
| 212 | + ] | |
| 213 | + ), | |
| 214 | + tools=[{"name": "read"}, {"name": "write"}, {"name": "patch"}], | |
| 215 | + ) | |
| 216 | + ] | |
| 217 | + | |
| 218 | + final_chunk = chunks[-1] | |
| 219 | + assert final_chunk.tool_calls[0].name == "read" | |
| 220 | + assert final_chunk.tool_calls[0].arguments == {"file_path": "/tmp/test.txt"} | |
| 221 | + await backend.close() | |
| 222 | + | |
| 223 | + | |
| 224 | +@pytest.mark.asyncio | |
| 225 | +async def test_ollama_stream_response_parses_fenced_read_command() -> None: | |
| 226 | + backend = OllamaBackend() | |
| 227 | + | |
| 228 | + chunks = [ | |
| 229 | + chunk | |
| 230 | + async for chunk in backend._stream_response( | |
| 231 | + FakeStreamResponse( | |
| 232 | + [ | |
| 233 | + { | |
| 234 | + "message": { | |
| 235 | + "content": ( | |
| 236 | + "I need to inspect the file first.\n" | |
| 237 | + "```bash\nread /tmp/test.txt\n```" | |
| 238 | + ) | |
| 239 | + }, | |
| 240 | + "done": False, | |
| 241 | + }, | |
| 242 | + { | |
| 243 | + "message": {"content": ""}, | |
| 244 | + "done": True, | |
| 245 | + "prompt_eval_count": 4, | |
| 246 | + "eval_count": 2, | |
| 247 | + }, | |
| 248 | + ] | |
| 249 | + ), | |
| 250 | + tools=[{"name": "read"}, {"name": "glob"}, {"name": "bash"}], | |
| 251 | + ) | |
| 252 | + ] | |
| 253 | + | |
| 254 | + final_chunk = chunks[-1] | |
| 255 | + assert final_chunk.tool_calls[0].name == "read" | |
| 256 | + assert final_chunk.tool_calls[0].arguments == {"file_path": "/tmp/test.txt"} | |
| 257 | + await backend.close() | |
| 258 | + | |
| 259 | + | |
| 145 | 260 | @pytest.mark.asyncio |
| 146 | 261 | async def test_ollama_stream_response_defers_raw_json_detection_to_final_parse() -> None: |
| 147 | 262 | backend = OllamaBackend() |
tests/test_parsing.pymodified@@ -225,6 +225,38 @@ Created the file.''' | ||
| 225 | 225 | assert result.tool_calls == [] |
| 226 | 226 | assert "TotallyUnknownTool" in result.content |
| 227 | 227 | |
| 228 | + def test_parse_bare_json_maps_read_file_alias_to_read(self): | |
| 229 | + text = '{"name": "read_file", "arguments": {"file_path": "/tmp/test.txt"}}' | |
| 230 | + result = parse_tool_calls( | |
| 231 | + text, | |
| 232 | + allowed_tool_names=["read", "write", "patch"], | |
| 233 | + ) | |
| 234 | + assert len(result.tool_calls) == 1 | |
| 235 | + assert result.tool_calls[0].name == "read" | |
| 236 | + assert result.tool_calls[0].arguments == {"file_path": "/tmp/test.txt"} | |
| 237 | + | |
| 238 | + def test_parse_fenced_read_command_into_tool_call(self): | |
| 239 | + text = "Let me inspect the file first.\n```bash\nread /tmp/test.txt\n```" | |
| 240 | + result = parse_tool_calls( | |
| 241 | + text, | |
| 242 | + allowed_tool_names=["read", "glob", "bash"], | |
| 243 | + ) | |
| 244 | + assert len(result.tool_calls) == 1 | |
| 245 | + assert result.tool_calls[0].name == "read" | |
| 246 | + assert result.tool_calls[0].arguments == {"file_path": "/tmp/test.txt"} | |
| 247 | + | |
| 248 | + def test_parse_fenced_glob_command_into_tool_call(self): | |
| 249 | + text = "```bash\nglob /tmp/guide/chapters/*.html\n```" | |
| 250 | + result = parse_tool_calls( | |
| 251 | + text, | |
| 252 | + allowed_tool_names=["read", "glob", "bash"], | |
| 253 | + ) | |
| 254 | + assert len(result.tool_calls) == 1 | |
| 255 | + assert result.tool_calls[0].name == "glob" | |
| 256 | + assert result.tool_calls[0].arguments == { | |
| 257 | + "pattern": "/tmp/guide/chapters/*.html" | |
| 258 | + } | |
| 259 | + | |
| 228 | 260 | |
| 229 | 261 | class TestFormatToolResult: |
| 230 | 262 | """Tests for format_tool_result function.""" |
tests/test_permissions.pymodified@@ -381,3 +381,35 @@ async def test_search_path_alias_hook_canonicalizes_common_aliases( | ||
| 381 | 381 | assert result.updated_arguments["path"] == expected_path |
| 382 | 382 | for alias in ("directory", "dir", "folder"): |
| 383 | 383 | assert alias not in result.updated_arguments |
| 384 | + | |
| 385 | + | |
| 386 | +@pytest.mark.asyncio | |
| 387 | +async def test_search_path_alias_hook_splits_full_glob_pattern( | |
| 388 | + temp_dir: Path, | |
| 389 | +) -> None: | |
| 390 | + registry = create_default_registry(temp_dir) | |
| 391 | + policy = build_permission_policy( | |
| 392 | + active_mode=PermissionMode.WORKSPACE_WRITE, | |
| 393 | + workspace_root=temp_dir, | |
| 394 | + tool_requirements=registry.get_tool_requirements(), | |
| 395 | + ) | |
| 396 | + hook = SearchPathAliasHook() | |
| 397 | + chapters = temp_dir / "chapters" | |
| 398 | + | |
| 399 | + result = await hook.pre_tool_use( | |
| 400 | + HookContext( | |
| 401 | + tool_call=ToolCall( | |
| 402 | + id="glob-1", | |
| 403 | + name="glob", | |
| 404 | + arguments={"pattern": f"{chapters}/*.html"}, | |
| 405 | + ), | |
| 406 | + tool=registry.get("glob"), | |
| 407 | + registry=registry, | |
| 408 | + permission_policy=policy, | |
| 409 | + source="native", | |
| 410 | + ) | |
| 411 | + ) | |
| 412 | + | |
| 413 | + assert result.updated_arguments is not None | |
| 414 | + assert result.updated_arguments["path"] == str(chapters) | |
| 415 | + assert result.updated_arguments["pattern"] == "*.html" | |
tests/test_prompt_builder.pymodified@@ -85,3 +85,15 @@ def test_prompt_builder_keeps_sections_stable_across_formats(temp_dir: Path) -> | ||
| 85 | 85 | assert "`react`" in react.content |
| 86 | 86 | assert "<tool_call>" in react.content |
| 87 | 87 | assert "call tools" in native.content.lower() |
| 88 | + | |
| 89 | + | |
| 90 | +def test_execute_mode_guidance_prefers_file_tools_for_text_edits(temp_dir: Path) -> None: | |
| 91 | + result = build_system_prompt_result( | |
| 92 | + tools=[_tool_schema("edit")], | |
| 93 | + use_react=False, | |
| 94 | + workflow_mode="execute", | |
| 95 | + permission_mode="workspace-write", | |
| 96 | + cwd=temp_dir, | |
| 97 | + ) | |
| 98 | + | |
| 99 | + assert "Prefer `edit`/`patch`/`write` over shell one-liners" in result.content | |
tests/test_recovery.pymodified@@ -122,6 +122,15 @@ class TestGetRecoveryHints: | ||
| 122 | 122 | hints = get_recovery_hints(ErrorCategory.COMMAND_NOT_FOUND, "bash") |
| 123 | 123 | assert "which" in hints.lower() |
| 124 | 124 | |
| 125 | + def test_bash_text_rewrite_hint_prefers_file_tools(self): | |
| 126 | + hints = get_recovery_hints( | |
| 127 | + ErrorCategory.UNKNOWN, | |
| 128 | + "bash", | |
| 129 | + {"command": "sed -i '1,3c\\updated' index.html"}, | |
| 130 | + ) | |
| 131 | + assert "edit/patch/write" in hints.lower() | |
| 132 | + assert "index.html" in hints | |
| 133 | + | |
| 125 | 134 | |
| 126 | 135 | class TestFormatRecoveryPrompt: |
| 127 | 136 | """Tests for recovery prompt formatting.""" |
@@ -140,6 +149,27 @@ class TestFormatRecoveryPrompt: | ||
| 140 | 149 | assert "1/3" in prompt |
| 141 | 150 | assert "retry the same command with slight variations" in prompt |
| 142 | 151 | |
| 152 | + def test_format_recovery_prompt_for_failed_shell_rewrite_points_to_file_tools(self): | |
| 153 | + ctx = RecoveryContext( | |
| 154 | + original_tool="bash", | |
| 155 | + original_args={"command": "sed -i '1,3c\\updated' index.html"}, | |
| 156 | + ) | |
| 157 | + ctx.add_attempt( | |
| 158 | + "bash", | |
| 159 | + {"command": "sed -i '1,3c\\updated' index.html"}, | |
| 160 | + "Exit code 1", | |
| 161 | + ) | |
| 162 | + | |
| 163 | + prompt = format_recovery_prompt( | |
| 164 | + ctx, | |
| 165 | + "bash", | |
| 166 | + {"command": "sed -i '1,3c\\updated' index.html"}, | |
| 167 | + "Exit code 1", | |
| 168 | + ) | |
| 169 | + | |
| 170 | + assert "edit/patch/write" in prompt.lower() | |
| 171 | + assert "index.html" in prompt | |
| 172 | + | |
| 143 | 173 | |
| 144 | 174 | class TestFormatFailureMessage: |
| 145 | 175 | """Tests for failure message formatting.""" |
tests/test_runtime_harness.pymodified@@ -1866,6 +1866,394 @@ async def test_duplicate_observation_queues_steering_to_reuse_prior_evidence( | ||
| 1866 | 1866 | assert any("index.html" in message for message in steering_messages) |
| 1867 | 1867 | |
| 1868 | 1868 | |
| 1869 | +@pytest.mark.asyncio | |
| 1870 | +async def test_relative_file_read_stays_on_recent_external_context( | |
| 1871 | + temp_dir: Path, | |
| 1872 | +) -> None: | |
| 1873 | + external_dir = temp_dir.parent / f"{temp_dir.name}-external-guide" | |
| 1874 | + external_dir.mkdir(exist_ok=True) | |
| 1875 | + external_index = external_dir / "index.html" | |
| 1876 | + external_index.write_text("external guide index\n") | |
| 1877 | + | |
| 1878 | + backend = ScriptedBackend( | |
| 1879 | + completions=[ | |
| 1880 | + native_tool_response( | |
| 1881 | + ToolCall( | |
| 1882 | + id="read-1", | |
| 1883 | + name="read", | |
| 1884 | + arguments={"file_path": str(external_index)}, | |
| 1885 | + ), | |
| 1886 | + content="I'll inspect the external index first.", | |
| 1887 | + ), | |
| 1888 | + native_tool_response( | |
| 1889 | + ToolCall( | |
| 1890 | + id="read-2", | |
| 1891 | + name="read", | |
| 1892 | + arguments={"file_path": "index.html"}, | |
| 1893 | + ), | |
| 1894 | + content="I'll reopen index.html in the same guide.", | |
| 1895 | + ), | |
| 1896 | + final_response("I stayed on the external guide instead of snapping back to the repo."), | |
| 1897 | + ] | |
| 1898 | + ) | |
| 1899 | + | |
| 1900 | + run = await run_scenario( | |
| 1901 | + "Inspect the external guide index twice.", | |
| 1902 | + backend, | |
| 1903 | + config=non_streaming_config(), | |
| 1904 | + project_root=temp_dir, | |
| 1905 | + ) | |
| 1906 | + | |
| 1907 | + assert tool_event_names(run) == ["read", "read"] | |
| 1908 | + messages = tool_result_messages(run) | |
| 1909 | + assert any("external guide index" in message for message in messages) | |
| 1910 | + assert not any("File not found: index.html" in message for message in messages) | |
| 1911 | + assert any( | |
| 1912 | + "Skipped - duplicate action" in message or "external guide index" in message | |
| 1913 | + for message in messages[1:] | |
| 1914 | + ) | |
| 1915 | + | |
| 1916 | + | |
| 1917 | +@pytest.mark.asyncio | |
| 1918 | +async def test_blocked_shell_text_rewrite_queues_file_tool_steering( | |
| 1919 | + temp_dir: Path, | |
| 1920 | +) -> None: | |
| 1921 | + target = temp_dir / "notes.txt" | |
| 1922 | + target.write_text("old value\n") | |
| 1923 | + | |
| 1924 | + backend = ScriptedBackend( | |
| 1925 | + completions=[ | |
| 1926 | + native_tool_response( | |
| 1927 | + ToolCall( | |
| 1928 | + id="bash-1", | |
| 1929 | + name="bash", | |
| 1930 | + arguments={"command": "sed -i '1s/old/new/' notes.txt"}, | |
| 1931 | + ), | |
| 1932 | + content="I'll update the file with sed.", | |
| 1933 | + ), | |
| 1934 | + native_tool_response( | |
| 1935 | + ToolCall( | |
| 1936 | + id="edit-1", | |
| 1937 | + name="edit", | |
| 1938 | + arguments={ | |
| 1939 | + "file_path": str(target), | |
| 1940 | + "old_string": "old value", | |
| 1941 | + "new_string": "new value", | |
| 1942 | + }, | |
| 1943 | + ), | |
| 1944 | + content="I'll switch to the edit tool instead.", | |
| 1945 | + ), | |
| 1946 | + final_response("Updated the file with Loader's file tools."), | |
| 1947 | + ] | |
| 1948 | + ) | |
| 1949 | + | |
| 1950 | + run = await run_scenario( | |
| 1951 | + "Update notes.txt from old value to new value.", | |
| 1952 | + backend, | |
| 1953 | + config=non_streaming_config(), | |
| 1954 | + project_root=temp_dir, | |
| 1955 | + ) | |
| 1956 | + | |
| 1957 | + assert tool_event_names(run) == ["bash", "edit"] | |
| 1958 | + assert target.read_text() == "new value\n" | |
| 1959 | + messages = tool_result_messages(run) | |
| 1960 | + assert any("Shell-based text rewrites are brittle" in message for message in messages) | |
| 1961 | + steering_messages = [ | |
| 1962 | + event.content | |
| 1963 | + for event in run.events | |
| 1964 | + if event.type == "steering" and event.content | |
| 1965 | + ] | |
| 1966 | + assert any("Use Loader's file tools for this text edit" in message for message in steering_messages) | |
| 1967 | + | |
| 1968 | + | |
| 1969 | +@pytest.mark.asyncio | |
| 1970 | +async def test_blocked_html_index_edit_queues_inventory_reuse_steering( | |
| 1971 | + temp_dir: Path, | |
| 1972 | +) -> None: | |
| 1973 | + chapters = temp_dir / "chapters" | |
| 1974 | + chapters.mkdir() | |
| 1975 | + (chapters / "05-input-output.html").write_text("<h1>Chapter 5: Input and Output</h1>\n") | |
| 1976 | + index_file = temp_dir / "index.html" | |
| 1977 | + index_file.write_text( | |
| 1978 | + '<ul class="chapter-list">\n' | |
| 1979 | + ' <li><a href="chapters/05-input-output.html">Chapter 5: Input and Output</a></li>\n' | |
| 1980 | + '</ul>\n' | |
| 1981 | + ) | |
| 1982 | + | |
| 1983 | + backend = ScriptedBackend( | |
| 1984 | + completions=[ | |
| 1985 | + native_tool_response( | |
| 1986 | + ToolCall( | |
| 1987 | + id="glob-1", | |
| 1988 | + name="glob", | |
| 1989 | + arguments={"path": str(chapters), "pattern": "*.html"}, | |
| 1990 | + ), | |
| 1991 | + content="I'll check which chapter files exist first.", | |
| 1992 | + ), | |
| 1993 | + native_tool_response( | |
| 1994 | + ToolCall( | |
| 1995 | + id="edit-1", | |
| 1996 | + name="edit", | |
| 1997 | + arguments={ | |
| 1998 | + "file_path": str(index_file), | |
| 1999 | + "old_string": '<li><a href="chapters/05-input-output.html">Chapter 5: Input and Output</a></li>', | |
| 2000 | + "new_string": '<li><a href="chapters/05-control-structures.html">Chapter 5: Control Structures</a></li>', | |
| 2001 | + }, | |
| 2002 | + ), | |
| 2003 | + content="I'll update the TOC entry.", | |
| 2004 | + ), | |
| 2005 | + final_response("I'll reuse the known chapter inventory and correct the TOC."), | |
| 2006 | + ] | |
| 2007 | + ) | |
| 2008 | + | |
| 2009 | + run = await run_scenario( | |
| 2010 | + "Fix the index table of contents so it matches the chapters directory.", | |
| 2011 | + backend, | |
| 2012 | + config=non_streaming_config(), | |
| 2013 | + project_root=temp_dir, | |
| 2014 | + ) | |
| 2015 | + | |
| 2016 | + messages = tool_result_messages(run) | |
| 2017 | + steering_messages = [ | |
| 2018 | + event.content | |
| 2019 | + for event in run.events | |
| 2020 | + if event.type == "steering" and event.content | |
| 2021 | + ] | |
| 2022 | + | |
| 2023 | + assert any("TOC references chapter files that do not exist" in message for message in messages) | |
| 2024 | + assert any( | |
| 2025 | + "Use the current target contents plus the verified sibling inventory instead of guessing." in message | |
| 2026 | + for message in steering_messages | |
| 2027 | + ) | |
| 2028 | + assert any( | |
| 2029 | + "chapters/05-input-output.html = Chapter 5: Input and Output" in message | |
| 2030 | + for message in steering_messages | |
| 2031 | + ) | |
| 2032 | + assert any("<ul class=\"chapter-list\">" in message for message in steering_messages) | |
| 2033 | + assert any("Suggested replacement block:" in message for message in steering_messages) | |
| 2034 | + assert any("Do not rewrite the whole document." in message for message in steering_messages) | |
| 2035 | + assert any("set `old_string` to the current TOC block above exactly" in message for message in steering_messages) | |
| 2036 | + assert any("Suggested edit call:" in message for message in steering_messages) | |
| 2037 | + assert any('old_string="""' in message for message in steering_messages) | |
| 2038 | + assert any( | |
| 2039 | + '<li><a href="chapters/05-input-output.html">Chapter 5: Input and Output</a></li>' in message | |
| 2040 | + for message in steering_messages | |
| 2041 | + ) | |
| 2042 | + | |
| 2043 | + | |
| 2044 | +@pytest.mark.asyncio | |
| 2045 | +async def test_full_path_glob_pattern_still_injects_verified_html_inventory( | |
| 2046 | + temp_dir: Path, | |
| 2047 | +) -> None: | |
| 2048 | + chapters = temp_dir / "chapters" | |
| 2049 | + chapters.mkdir() | |
| 2050 | + (chapters / "01-introduction.html").write_text( | |
| 2051 | + "<h1>Chapter 1: Introduction to Fortran</h1>\n" | |
| 2052 | + ) | |
| 2053 | + (chapters / "02-setup.html").write_text( | |
| 2054 | + "<h1>Chapter 2: Setting Up Fortran</h1>\n" | |
| 2055 | + ) | |
| 2056 | + index_file = temp_dir / "index.html" | |
| 2057 | + index_file.write_text("broken table of contents\n") | |
| 2058 | + | |
| 2059 | + backend = ScriptedBackend( | |
| 2060 | + completions=[ | |
| 2061 | + native_tool_response( | |
| 2062 | + ToolCall( | |
| 2063 | + id="glob-1", | |
| 2064 | + name="glob", | |
| 2065 | + arguments={"pattern": f"{chapters}/*.html"}, | |
| 2066 | + ), | |
| 2067 | + content="I'll inspect the chapter inventory first.", | |
| 2068 | + ), | |
| 2069 | + final_response("I'll update index.html using the verified inventory."), | |
| 2070 | + ] | |
| 2071 | + ) | |
| 2072 | + | |
| 2073 | + run = await run_scenario( | |
| 2074 | + "Fix index.html so the chapter links match the real chapter files.", | |
| 2075 | + backend, | |
| 2076 | + config=non_streaming_config(), | |
| 2077 | + project_root=temp_dir, | |
| 2078 | + ) | |
| 2079 | + | |
| 2080 | + assert tool_event_names(run) == ["glob"] | |
| 2081 | + messages = tool_result_messages(run) | |
| 2082 | + assert any( | |
| 2083 | + "Verified chapter inventory: chapters/01-introduction.html = Chapter 1: Introduction to Fortran" | |
| 2084 | + in message | |
| 2085 | + for message in messages | |
| 2086 | + ) | |
| 2087 | + assert any( | |
| 2088 | + "chapters/02-setup.html = Chapter 2: Setting Up Fortran" in message | |
| 2089 | + for message in messages | |
| 2090 | + ) | |
| 2091 | + | |
| 2092 | + | |
| 2093 | +@pytest.mark.asyncio | |
| 2094 | +async def test_verified_html_inventory_blocks_redundant_chapter_reread( | |
| 2095 | + temp_dir: Path, | |
| 2096 | +) -> None: | |
| 2097 | + chapters = temp_dir / "chapters" | |
| 2098 | + chapters.mkdir() | |
| 2099 | + (chapters / "01-introduction.html").write_text( | |
| 2100 | + "<h1>Chapter 1: Introduction to Fortran</h1>\n" | |
| 2101 | + ) | |
| 2102 | + (chapters / "02-setup.html").write_text( | |
| 2103 | + "<h1>Chapter 2: Setting Up Your Environment</h1>\n" | |
| 2104 | + ) | |
| 2105 | + index_file = temp_dir / "index.html" | |
| 2106 | + index_file.write_text("broken table of contents\n") | |
| 2107 | + | |
| 2108 | + backend = ScriptedBackend( | |
| 2109 | + completions=[ | |
| 2110 | + native_tool_response( | |
| 2111 | + ToolCall( | |
| 2112 | + id="glob-1", | |
| 2113 | + name="glob", | |
| 2114 | + arguments={"path": str(chapters), "pattern": "*.html"}, | |
| 2115 | + ), | |
| 2116 | + content="I'll inspect the chapter inventory first.", | |
| 2117 | + ), | |
| 2118 | + native_tool_response( | |
| 2119 | + ToolCall( | |
| 2120 | + id="read-1", | |
| 2121 | + name="read", | |
| 2122 | + arguments={"file_path": str(chapters / '01-introduction.html')}, | |
| 2123 | + ), | |
| 2124 | + content="I'll open the first chapter file to extract its title.", | |
| 2125 | + ), | |
| 2126 | + final_response("I'll update index.html using the verified chapter inventory."), | |
| 2127 | + ] | |
| 2128 | + ) | |
| 2129 | + | |
| 2130 | + run = await run_scenario( | |
| 2131 | + "Fix index.html so the chapter links and titles match the real chapter files.", | |
| 2132 | + backend, | |
| 2133 | + config=non_streaming_config(), | |
| 2134 | + project_root=temp_dir, | |
| 2135 | + ) | |
| 2136 | + | |
| 2137 | + messages = tool_result_messages(run) | |
| 2138 | + assert any( | |
| 2139 | + "Verified chapter inventory: chapters/01-introduction.html = Chapter 1: Introduction to Fortran" | |
| 2140 | + in message | |
| 2141 | + for message in messages | |
| 2142 | + ) | |
| 2143 | + assert any( | |
| 2144 | + "The verified chapter inventory already lists the exact href/title pairs for this directory" | |
| 2145 | + in message | |
| 2146 | + for message in messages | |
| 2147 | + ) | |
| 2148 | + | |
| 2149 | + | |
| 2150 | +@pytest.mark.asyncio | |
| 2151 | +async def test_successful_html_toc_edit_blocks_post_success_reread_and_steers_to_finish( | |
| 2152 | + temp_dir: Path, | |
| 2153 | +) -> None: | |
| 2154 | + chapters = temp_dir / "chapters" | |
| 2155 | + chapters.mkdir() | |
| 2156 | + (chapters / "01-introduction.html").write_text( | |
| 2157 | + "<h1>Chapter 1: Introduction to Fortran</h1>\n" | |
| 2158 | + ) | |
| 2159 | + (chapters / "02-setup.html").write_text( | |
| 2160 | + "<h1>Chapter 2: Setting Up Your Environment</h1>\n" | |
| 2161 | + ) | |
| 2162 | + index_file = temp_dir / "index.html" | |
| 2163 | + old_block = ( | |
| 2164 | + '<h2>Table of Contents</h2>\n' | |
| 2165 | + '<ul class="chapter-list">\n' | |
| 2166 | + ' <li><a href="chapters/01-old.html">Chapter 1: Old</a></li>\n' | |
| 2167 | + ' <li><a href="chapters/02-old.html">Chapter 2: Old</a></li>\n' | |
| 2168 | + '</ul>\n' | |
| 2169 | + ) | |
| 2170 | + new_block = ( | |
| 2171 | + '<h2>Table of Contents</h2>\n' | |
| 2172 | + '<ul class="chapter-list">\n' | |
| 2173 | + ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n' | |
| 2174 | + ' <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n' | |
| 2175 | + '</ul>\n' | |
| 2176 | + ) | |
| 2177 | + index_file.write_text(new_block.replace("01-introduction.html", "01-old.html").replace("02-setup.html", "02-old.html").replace("Introduction to Fortran", "Old").replace("Setting Up Your Environment", "Old")) | |
| 2178 | + | |
| 2179 | + backend = ScriptedBackend( | |
| 2180 | + completions=[ | |
| 2181 | + native_tool_response( | |
| 2182 | + ToolCall( | |
| 2183 | + id="glob-1", | |
| 2184 | + name="glob", | |
| 2185 | + arguments={"path": str(chapters), "pattern": "*.html"}, | |
| 2186 | + ), | |
| 2187 | + content="I'll inspect the chapter inventory first.", | |
| 2188 | + ), | |
| 2189 | + native_tool_response( | |
| 2190 | + ToolCall( | |
| 2191 | + id="read-1", | |
| 2192 | + name="read", | |
| 2193 | + arguments={"file_path": str(index_file)}, | |
| 2194 | + ), | |
| 2195 | + content="I'll inspect index.html next.", | |
| 2196 | + ), | |
| 2197 | + native_tool_response( | |
| 2198 | + ToolCall( | |
| 2199 | + id="edit-1", | |
| 2200 | + name="edit", | |
| 2201 | + arguments={ | |
| 2202 | + "file_path": str(index_file), | |
| 2203 | + "old_string": old_block, | |
| 2204 | + "new_string": new_block, | |
| 2205 | + }, | |
| 2206 | + ), | |
| 2207 | + content="I'll fix the TOC now.", | |
| 2208 | + ), | |
| 2209 | + native_tool_response( | |
| 2210 | + ToolCall( | |
| 2211 | + id="read-2", | |
| 2212 | + name="read", | |
| 2213 | + arguments={"file_path": str(index_file)}, | |
| 2214 | + ), | |
| 2215 | + content="I'll reread index.html to confirm the change.", | |
| 2216 | + ), | |
| 2217 | + final_response( | |
| 2218 | + "I updated index.html so the table of contents matches the real chapter files." | |
| 2219 | + ), | |
| 2220 | + ] | |
| 2221 | + ) | |
| 2222 | + | |
| 2223 | + run = await run_scenario( | |
| 2224 | + "Update index.html so every chapter link and title matches the real HTML files in chapters/.", | |
| 2225 | + backend, | |
| 2226 | + config=non_streaming_config(), | |
| 2227 | + project_root=temp_dir, | |
| 2228 | + ) | |
| 2229 | + | |
| 2230 | + messages = tool_result_messages(run) | |
| 2231 | + steering_messages = [ | |
| 2232 | + event.content | |
| 2233 | + for event in run.events | |
| 2234 | + if event.type == "steering" and event.content | |
| 2235 | + ] | |
| 2236 | + | |
| 2237 | + assert any( | |
| 2238 | + "Semantic verification preview: validated 2 toc links in index.html" | |
| 2239 | + in message | |
| 2240 | + for message in messages | |
| 2241 | + ) | |
| 2242 | + assert any( | |
| 2243 | + "already passes the validated chapter-link check" in message | |
| 2244 | + for message in messages | |
| 2245 | + ) | |
| 2246 | + assert any( | |
| 2247 | + "already satisfies the verified chapter-link constraints" in message | |
| 2248 | + for message in steering_messages | |
| 2249 | + ) | |
| 2250 | + assert any( | |
| 2251 | + "Do not reread `index.html` or files in `chapters/`" in message | |
| 2252 | + for message in steering_messages | |
| 2253 | + ) | |
| 2254 | + assert "validated 2 toc links in index.html" in run.response | |
| 2255 | + | |
| 2256 | + | |
| 1869 | 2257 | @pytest.mark.asyncio |
| 1870 | 2258 | async def test_interleaved_reread_is_allowed_once_without_intervening_mutation( |
| 1871 | 2259 | temp_dir: Path, |
tests/test_safeguard_services.pymodified@@ -2,12 +2,19 @@ | ||
| 2 | 2 | |
| 3 | 3 | from __future__ import annotations |
| 4 | 4 | |
| 5 | +import tempfile | |
| 6 | +from pathlib import Path | |
| 7 | + | |
| 5 | 8 | import loader.agent.safeguards as agent_safeguards |
| 6 | 9 | from loader.agent.safeguards import RuntimeSafeguards as AgentRuntimeSafeguards |
| 7 | 10 | from loader.runtime.safeguard_services import ( |
| 8 | 11 | ActionTracker, |
| 9 | 12 | PreActionValidator, |
| 10 | 13 | ValidationResult, |
| 14 | + build_html_toc_edit_call_template, | |
| 15 | + build_html_toc_replacement_block, | |
| 16 | + format_html_inventory_entry, | |
| 17 | + validate_html_toc, | |
| 11 | 18 | ) |
| 12 | 19 | from loader.runtime.safeguards import RuntimeSafeguards |
| 13 | 20 | |
@@ -27,6 +34,109 @@ def test_action_tracker_detects_duplicate_write_after_recording(tmp_path) -> Non | ||
| 27 | 34 | assert str(file_path) in reason |
| 28 | 35 | |
| 29 | 36 | |
| 37 | +def test_build_html_toc_replacement_block_uses_verified_inventory(tmp_path) -> None: | |
| 38 | + chapters = tmp_path / "chapters" | |
| 39 | + chapters.mkdir() | |
| 40 | + (chapters / "01-introduction.html").write_text( | |
| 41 | + "<h1>Chapter 1: Introduction to Fortran</h1>\n" | |
| 42 | + ) | |
| 43 | + (chapters / "02-setup.html").write_text( | |
| 44 | + "<h1>Chapter 2: Setting Up Your Environment</h1>\n" | |
| 45 | + ) | |
| 46 | + index_path = tmp_path / "index.html" | |
| 47 | + index_path.write_text( | |
| 48 | + "<h2>Table of Contents</h2>\n" | |
| 49 | + "<ul class=\"chapter-list\">\n" | |
| 50 | + " <li><a href=\"chapters/01-old.html\">Chapter 1: Old</a></li>\n" | |
| 51 | + "</ul>\n" | |
| 52 | + ) | |
| 53 | + | |
| 54 | + replacement = build_html_toc_replacement_block(index_path) | |
| 55 | + | |
| 56 | + assert replacement is not None | |
| 57 | + assert "<h2>Table of Contents</h2>" in replacement | |
| 58 | + assert '<li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>' in replacement | |
| 59 | + assert '<li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>' in replacement | |
| 60 | + | |
| 61 | + | |
| 62 | +def test_build_html_toc_edit_call_template_uses_current_and_replacement_blocks(tmp_path) -> None: | |
| 63 | + chapters = tmp_path / "chapters" | |
| 64 | + chapters.mkdir() | |
| 65 | + (chapters / "01-introduction.html").write_text( | |
| 66 | + "<h1>Chapter 1: Introduction to Fortran</h1>\n" | |
| 67 | + ) | |
| 68 | + index_path = tmp_path / "index.html" | |
| 69 | + index_path.write_text( | |
| 70 | + "<h2>Table of Contents</h2>\n" | |
| 71 | + '<ul class="chapter-list">\n' | |
| 72 | + ' <li><a href="chapters/01-old.html">Chapter 1: Old</a></li>\n' | |
| 73 | + "</ul>\n" | |
| 74 | + ) | |
| 75 | + | |
| 76 | + template = build_html_toc_edit_call_template(index_path) | |
| 77 | + | |
| 78 | + assert template is not None | |
| 79 | + assert template.startswith("edit(") | |
| 80 | + assert f'file_path="{index_path}"' in template | |
| 81 | + assert 'old_string="""' in template | |
| 82 | + assert 'new_string="""' in template | |
| 83 | + assert '<li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>' in template | |
| 84 | + | |
| 85 | + | |
| 86 | +def test_validate_html_toc_reports_missing_and_mismatched_links(tmp_path) -> None: | |
| 87 | + chapters = tmp_path / "chapters" | |
| 88 | + chapters.mkdir() | |
| 89 | + (chapters / "01-introduction.html").write_text( | |
| 90 | + "<h1>Chapter 1: Introduction to Fortran</h1>\n" | |
| 91 | + ) | |
| 92 | + index_path = tmp_path / "index.html" | |
| 93 | + index_path.write_text( | |
| 94 | + '<ul class="chapter-list">\n' | |
| 95 | + ' <li><a href="chapters/01-introduction.html">Chapter 1: Wrong Title</a></li>\n' | |
| 96 | + ' <li><a href="chapters/02-missing.html">Chapter 2: Missing</a></li>\n' | |
| 97 | + "</ul>\n" | |
| 98 | + ) | |
| 99 | + | |
| 100 | + result = validate_html_toc(index_path) | |
| 101 | + | |
| 102 | + assert result is not None | |
| 103 | + assert result.valid is False | |
| 104 | + assert result.link_count == 2 | |
| 105 | + assert result.missing == ("chapters/02-missing.html -> missing",) | |
| 106 | + assert ( | |
| 107 | + result.mismatched | |
| 108 | + == ( | |
| 109 | + "chapters/01-introduction.html -> Chapter 1: Wrong Title != Chapter 1: Introduction to Fortran", | |
| 110 | + ) | |
| 111 | + ) | |
| 112 | + | |
| 113 | + | |
| 114 | +def test_validate_html_toc_reports_success(tmp_path) -> None: | |
| 115 | + chapters = tmp_path / "chapters" | |
| 116 | + chapters.mkdir() | |
| 117 | + (chapters / "01-introduction.html").write_text( | |
| 118 | + "<h1>Chapter 1: Introduction to Fortran</h1>\n" | |
| 119 | + ) | |
| 120 | + (chapters / "02-setup.html").write_text( | |
| 121 | + "<h1>Chapter 2: Setting Up Your Environment</h1>\n" | |
| 122 | + ) | |
| 123 | + index_path = tmp_path / "index.html" | |
| 124 | + index_path.write_text( | |
| 125 | + '<ul class="chapter-list">\n' | |
| 126 | + ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n' | |
| 127 | + ' <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n' | |
| 128 | + "</ul>\n" | |
| 129 | + ) | |
| 130 | + | |
| 131 | + result = validate_html_toc(index_path) | |
| 132 | + | |
| 133 | + assert result is not None | |
| 134 | + assert result.valid is True | |
| 135 | + assert result.link_count == 2 | |
| 136 | + assert result.missing == () | |
| 137 | + assert result.mismatched == () | |
| 138 | + | |
| 139 | + | |
| 30 | 140 | def test_action_tracker_preserves_loop_description_format() -> None: |
| 31 | 141 | tracker = ActionTracker() |
| 32 | 142 | |
@@ -88,6 +198,88 @@ def test_action_tracker_blocks_repeated_read_without_changes(tmp_path) -> None: | ||
| 88 | 198 | assert str(file_path) in reason |
| 89 | 199 | |
| 90 | 200 | |
| 201 | +def test_action_tracker_blocks_post_validation_html_rereads_until_new_mutation(tmp_path) -> None: | |
| 202 | + tracker = ActionTracker() | |
| 203 | + chapters = tmp_path / "chapters" | |
| 204 | + chapters.mkdir() | |
| 205 | + chapter_path = chapters / "01-introduction.html" | |
| 206 | + chapter_path.write_text("<h1>Chapter 1: Introduction to Fortran</h1>\n") | |
| 207 | + index_path = tmp_path / "index.html" | |
| 208 | + index_path.write_text( | |
| 209 | + '<ul class="chapter-list">\n' | |
| 210 | + ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n' | |
| 211 | + "</ul>\n" | |
| 212 | + ) | |
| 213 | + | |
| 214 | + tracker.note_validated_html_toc(str(index_path)) | |
| 215 | + | |
| 216 | + assert tracker.check_tool_call("read", {"file_path": str(index_path)}) == ( | |
| 217 | + True, | |
| 218 | + "The current index.html already passes the validated chapter-link check; stop rereading index.html or chapters/ and finish the task unless a specific href or title is still unresolved", | |
| 219 | + ) | |
| 220 | + assert tracker.check_tool_call("read", {"file_path": str(chapter_path)}) == ( | |
| 221 | + True, | |
| 222 | + "The current index.html already passes the validated chapter-link check; stop rereading index.html or chapters/ and finish the task unless a specific href or title is still unresolved", | |
| 223 | + ) | |
| 224 | + assert tracker.check_tool_call( | |
| 225 | + "glob", | |
| 226 | + {"path": str(chapters), "pattern": "*.html"}, | |
| 227 | + ) == ( | |
| 228 | + True, | |
| 229 | + "The current index.html already passes the validated chapter-link check; stop rereading index.html or chapters/ and finish the task unless a specific href or title is still unresolved", | |
| 230 | + ) | |
| 231 | + assert tracker.check_tool_call( | |
| 232 | + "bash", | |
| 233 | + {"command": f"cat {index_path}"}, | |
| 234 | + ) == ( | |
| 235 | + True, | |
| 236 | + "The current index.html already passes the validated chapter-link check; stop rereading index.html or chapters/ and finish the task unless a specific href or title is still unresolved", | |
| 237 | + ) | |
| 238 | + | |
| 239 | + tracker.record_tool_call( | |
| 240 | + "edit", | |
| 241 | + { | |
| 242 | + "file_path": str(index_path), | |
| 243 | + "old_string": "Chapter 1", | |
| 244 | + "new_string": "Chapter One", | |
| 245 | + }, | |
| 246 | + ) | |
| 247 | + | |
| 248 | + assert tracker.check_tool_call("read", {"file_path": str(index_path)}) == (False, "") | |
| 249 | + | |
| 250 | + | |
| 251 | +def test_action_tracker_blocks_chapter_rereads_after_verified_inventory(tmp_path) -> None: | |
| 252 | + tracker = ActionTracker() | |
| 253 | + chapters = tmp_path / "chapters" | |
| 254 | + chapters.mkdir() | |
| 255 | + chapter_path = chapters / "01-introduction.html" | |
| 256 | + chapter_path.write_text("<h1>Chapter 1: Introduction to Fortran</h1>\n") | |
| 257 | + index_path = tmp_path / "index.html" | |
| 258 | + index_path.write_text("<ul></ul>\n") | |
| 259 | + | |
| 260 | + tracker.note_verified_html_inventory(str(index_path)) | |
| 261 | + | |
| 262 | + assert tracker.check_tool_call("read", {"file_path": str(index_path)}) == (False, "") | |
| 263 | + assert tracker.check_tool_call("read", {"file_path": str(chapter_path)}) == ( | |
| 264 | + True, | |
| 265 | + "The verified chapter inventory already lists the exact href/title pairs for this directory; update index.html from that inventory instead of rereading chapter files", | |
| 266 | + ) | |
| 267 | + assert tracker.check_tool_call( | |
| 268 | + "glob", | |
| 269 | + {"path": str(chapters), "pattern": "*.html"}, | |
| 270 | + ) == ( | |
| 271 | + True, | |
| 272 | + "The verified chapter inventory already lists the exact href/title pairs for this directory; update index.html from that inventory instead of rereading chapter files", | |
| 273 | + ) | |
| 274 | + assert tracker.check_tool_call( | |
| 275 | + "bash", | |
| 276 | + {"command": f"head -20 {chapter_path}"}, | |
| 277 | + ) == ( | |
| 278 | + True, | |
| 279 | + "The verified chapter inventory already lists the exact href/title pairs for this directory; update index.html from that inventory instead of rereading chapter files", | |
| 280 | + ) | |
| 281 | + | |
| 282 | + | |
| 91 | 283 | def test_action_tracker_allows_one_interleaved_reread_without_changes(tmp_path) -> None: |
| 92 | 284 | tracker = ActionTracker() |
| 93 | 285 | index_path = tmp_path / "index.html" |
@@ -131,6 +323,60 @@ def test_action_tracker_blocks_fourth_interleaved_reread_without_changes(tmp_pat | ||
| 131 | 323 | assert str(index_path) in reason |
| 132 | 324 | |
| 133 | 325 | |
| 326 | +def test_action_tracker_allows_one_target_index_reread_after_chapter_discovery(tmp_path) -> None: | |
| 327 | + tracker = ActionTracker() | |
| 328 | + index_path = tmp_path / "index.html" | |
| 329 | + chapters = tmp_path / "chapters" | |
| 330 | + chapter_a = chapters / "01-introduction.html" | |
| 331 | + chapter_b = chapters / "02-setup.html" | |
| 332 | + chapter_c = chapters / "03-basics.html" | |
| 333 | + | |
| 334 | + tracker.record_tool_call("read", {"file_path": str(index_path)}) | |
| 335 | + tracker.record_tool_call("read", {"file_path": str(chapter_a)}) | |
| 336 | + tracker.record_tool_call("read", {"file_path": str(chapter_b)}) | |
| 337 | + tracker.record_tool_call("read", {"file_path": str(chapter_c)}) | |
| 338 | + | |
| 339 | + is_duplicate, reason = tracker.check_tool_call("read", {"file_path": str(index_path)}) | |
| 340 | + | |
| 341 | + assert is_duplicate is False | |
| 342 | + assert reason == "" | |
| 343 | + | |
| 344 | + | |
| 345 | +def test_action_tracker_blocks_second_target_index_reread_after_chapter_discovery(tmp_path) -> None: | |
| 346 | + tracker = ActionTracker() | |
| 347 | + index_path = tmp_path / "index.html" | |
| 348 | + chapters = tmp_path / "chapters" | |
| 349 | + | |
| 350 | + tracker.record_tool_call("read", {"file_path": str(index_path)}) | |
| 351 | + tracker.record_tool_call("read", {"file_path": str(chapters / "01-introduction.html")}) | |
| 352 | + tracker.record_tool_call("read", {"file_path": str(chapters / "02-setup.html")}) | |
| 353 | + tracker.record_tool_call("read", {"file_path": str(chapters / "03-basics.html")}) | |
| 354 | + tracker.record_tool_call("read", {"file_path": str(index_path)}) | |
| 355 | + | |
| 356 | + is_duplicate, reason = tracker.check_tool_call("read", {"file_path": str(index_path)}) | |
| 357 | + | |
| 358 | + assert is_duplicate is True | |
| 359 | + assert "known file/title evidence" in reason | |
| 360 | + | |
| 361 | + | |
| 362 | +def test_action_tracker_blocks_repeated_chapter_directory_search_once_titles_are_known( | |
| 363 | + tmp_path, | |
| 364 | +) -> None: | |
| 365 | + tracker = ActionTracker() | |
| 366 | + chapters = tmp_path / "chapters" | |
| 367 | + search_args = {"pattern": "*.html", "path": str(chapters)} | |
| 368 | + | |
| 369 | + tracker.record_tool_call("glob", search_args) | |
| 370 | + tracker.record_tool_call("read", {"file_path": str(chapters / "01-introduction.html")}) | |
| 371 | + tracker.record_tool_call("read", {"file_path": str(chapters / "02-setup.html")}) | |
| 372 | + tracker.record_tool_call("read", {"file_path": str(chapters / "03-basics.html")}) | |
| 373 | + | |
| 374 | + is_duplicate, reason = tracker.check_tool_call("glob", search_args) | |
| 375 | + | |
| 376 | + assert is_duplicate is True | |
| 377 | + assert "known filename/title evidence" in reason | |
| 378 | + | |
| 379 | + | |
| 134 | 380 | def test_action_tracker_allows_repeated_read_after_mutation(tmp_path) -> None: |
| 135 | 381 | tracker = ActionTracker() |
| 136 | 382 | file_path = tmp_path / "index.html" |
@@ -177,6 +423,99 @@ def test_pre_action_validator_allows_patch_string_without_hunks() -> None: | ||
| 177 | 423 | assert result == ValidationResult(valid=True) |
| 178 | 424 | |
| 179 | 425 | |
| 426 | +def test_pre_action_validator_blocks_shell_text_rewrite_for_html_target() -> None: | |
| 427 | + validator = PreActionValidator() | |
| 428 | + | |
| 429 | + result = validator.validate( | |
| 430 | + "bash", | |
| 431 | + { | |
| 432 | + "command": ( | |
| 433 | + "cd /tmp/fortran-qwen-recovery-check && " | |
| 434 | + "sed -i '1,3c\\<li>updated</li>' index.html" | |
| 435 | + ) | |
| 436 | + }, | |
| 437 | + ) | |
| 438 | + | |
| 439 | + assert result.valid is False | |
| 440 | + assert result.reason == ( | |
| 441 | + "Shell-based text rewrites are brittle and bypass Loader's safer file tools" | |
| 442 | + ) | |
| 443 | + assert "edit/patch/write" in result.suggestion | |
| 444 | + assert "index.html" in result.suggestion | |
| 445 | + | |
| 446 | + | |
| 447 | +def test_pre_action_validator_allows_non_mutating_sed_probe() -> None: | |
| 448 | + validator = PreActionValidator() | |
| 449 | + | |
| 450 | + result = validator.validate( | |
| 451 | + "bash", | |
| 452 | + {"command": "sed -n '1,20p' index.html"}, | |
| 453 | + ) | |
| 454 | + | |
| 455 | + assert result == ValidationResult(valid=True) | |
| 456 | + | |
| 457 | + | |
| 458 | +def test_pre_action_validator_blocks_index_edit_with_missing_chapter_href(tmp_path) -> None: | |
| 459 | + validator = PreActionValidator() | |
| 460 | + index = tmp_path / "index.html" | |
| 461 | + chapters = tmp_path / "chapters" | |
| 462 | + chapters.mkdir() | |
| 463 | + (chapters / "05-input-output.html").write_text( | |
| 464 | + "<h1>Chapter 5: Input and Output</h1>\n" | |
| 465 | + ) | |
| 466 | + | |
| 467 | + result = validator.validate( | |
| 468 | + "edit", | |
| 469 | + { | |
| 470 | + "file_path": str(index), | |
| 471 | + "old_string": '<li><a href="chapters/05-input-output.html">Chapter 5: Input and Output</a></li>', | |
| 472 | + "new_string": '<li><a href="chapters/05-control-structures.html">Chapter 5: Control Structures</a></li>', | |
| 473 | + }, | |
| 474 | + ) | |
| 475 | + | |
| 476 | + assert result.valid is False | |
| 477 | + assert result.reason == "Edited TOC references chapter files that do not exist" | |
| 478 | + assert "chapters/05-input-output.html = Chapter 5: Input and Output" in result.suggestion | |
| 479 | + | |
| 480 | + | |
| 481 | +def test_pre_action_validator_blocks_index_edit_with_title_mismatch(tmp_path) -> None: | |
| 482 | + validator = PreActionValidator() | |
| 483 | + index = tmp_path / "index.html" | |
| 484 | + chapters = tmp_path / "chapters" | |
| 485 | + chapters.mkdir() | |
| 486 | + (chapters / "12-troubleshooting-tips.html").write_text( | |
| 487 | + "<h1>Chapter 12: Troubleshooting and Tips</h1>\n" | |
| 488 | + ) | |
| 489 | + | |
| 490 | + result = validator.validate( | |
| 491 | + "edit", | |
| 492 | + { | |
| 493 | + "file_path": str(index), | |
| 494 | + "old_string": '<li><a href="chapters/12-troubleshooting-tips.html">Chapter 12: Troubleshooting and Tips</a></li>', | |
| 495 | + "new_string": '<li><a href="chapters/12-troubleshooting-tips.html">Chapter 12: Troubleshooting Tips</a></li>', | |
| 496 | + }, | |
| 497 | + ) | |
| 498 | + | |
| 499 | + assert result.valid is False | |
| 500 | + assert result.reason == "Edited TOC labels do not match the linked chapter titles" | |
| 501 | + assert ( | |
| 502 | + "chapters/12-troubleshooting-tips.html = Chapter 12: Troubleshooting and Tips" | |
| 503 | + in result.suggestion | |
| 504 | + ) | |
| 505 | + | |
| 506 | + | |
| 507 | +def test_format_html_inventory_entry_handles_tmp_alias_paths() -> None: | |
| 508 | + root = Path(tempfile.mkdtemp(dir="/tmp")) | |
| 509 | + chapters = root / "chapters" | |
| 510 | + chapters.mkdir() | |
| 511 | + candidate = chapters / "05-input-output.html" | |
| 512 | + candidate.write_text("<h1>Chapter 5: Input and Output</h1>\n") | |
| 513 | + | |
| 514 | + entry = format_html_inventory_entry(root, candidate.resolve(strict=False)) | |
| 515 | + | |
| 516 | + assert entry == "chapters/05-input-output.html = Chapter 5: Input and Output" | |
| 517 | + | |
| 518 | + | |
| 180 | 519 | def test_runtime_safeguards_wrap_runtime_owned_services() -> None: |
| 181 | 520 | safeguards = RuntimeSafeguards() |
| 182 | 521 | |
tests/test_tool_batch_policies.pymodified@@ -293,6 +293,22 @@ async def test_tool_batch_recovery_controller_includes_known_state_for_missing_f | ||
| 293 | 293 | ), |
| 294 | 294 | tool_results=[], |
| 295 | 295 | ), |
| 296 | + Message( | |
| 297 | + role=Role.ASSISTANT, | |
| 298 | + content="I already inspected the setup chapter.", | |
| 299 | + tool_calls=[ | |
| 300 | + ToolCall( | |
| 301 | + id="read-setup", | |
| 302 | + name="read", | |
| 303 | + arguments={"file_path": "~/Loader/guides/fortran/chapters/02-setup.html"}, | |
| 304 | + ) | |
| 305 | + ], | |
| 306 | + ), | |
| 307 | + Message.tool_result_message( | |
| 308 | + tool_call_id="read-setup", | |
| 309 | + display_content="<h1>Chapter 2: Setting Up Fortran</h1>\n", | |
| 310 | + result_content="<h1>Chapter 2: Setting Up Fortran</h1>\n", | |
| 311 | + ), | |
| 296 | 312 | Message( |
| 297 | 313 | role=Role.TOOL, |
| 298 | 314 | content=( |
@@ -354,6 +370,7 @@ async def test_tool_batch_recovery_controller_includes_known_state_for_missing_f | ||
| 354 | 370 | assert "Prefer edit/write/patch on the target file" in follow_up.content |
| 355 | 371 | assert "04-variables.html" in follow_up.content |
| 356 | 372 | assert "02-basic-syntax.html -> 02-setup.html" in follow_up.content |
| 373 | + assert "02-setup.html = Chapter 2: Setting Up Fortran" in follow_up.content | |
| 357 | 374 | assert "`~/Loader/guides/fortran/index.html`" in follow_up.content |
| 358 | 375 | assert any(event.type == "recovery" for event in events) |
| 359 | 376 | |
@@ -368,20 +385,16 @@ async def test_tool_batch_recovery_controller_suggests_known_sibling_files( | ||
| 368 | 385 | async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification: |
| 369 | 386 | raise AssertionError("Verification should not run here") |
| 370 | 387 | |
| 371 | - messages = [ | |
| 372 | - Message( | |
| 373 | - role=Role.TOOL, | |
| 374 | - content=( | |
| 375 | - "Observation [glob]: Result: " | |
| 376 | - "/private/tmp/fortran-qwen-recovery-check/chapters/01-introduction.html\n" | |
| 377 | - "/private/tmp/fortran-qwen-recovery-check/chapters/02-setup.html\n" | |
| 378 | - "/private/tmp/fortran-qwen-recovery-check/chapters/03-basics.html\n" | |
| 379 | - "/private/tmp/fortran-qwen-recovery-check/chapters/04-variables.html\n" | |
| 380 | - "/private/tmp/fortran-qwen-recovery-check/chapters/05-input-output.html" | |
| 381 | - ), | |
| 382 | - tool_results=[], | |
| 383 | - ), | |
| 384 | - ] | |
| 388 | + chapters = temp_dir / "chapters" | |
| 389 | + chapters.mkdir() | |
| 390 | + (chapters / "04-variables.html").write_text( | |
| 391 | + "<h1>Chapter 4: Variables and Data Types</h1>\n" | |
| 392 | + ) | |
| 393 | + (chapters / "05-input-output.html").write_text( | |
| 394 | + "<h1>Chapter 5: Input and Output</h1>\n" | |
| 395 | + ) | |
| 396 | + | |
| 397 | + messages: list[Message] = [] | |
| 385 | 398 | context = build_context( |
| 386 | 399 | temp_dir=temp_dir, |
| 387 | 400 | messages=messages, |
@@ -392,11 +405,11 @@ async def test_tool_batch_recovery_controller_suggests_known_sibling_files( | ||
| 392 | 405 | tool_call = ToolCall( |
| 393 | 406 | id="read-missing", |
| 394 | 407 | name="read", |
| 395 | - arguments={"file_path": "/tmp/fortran-qwen-recovery-check/chapters/04-data-types.html"}, | |
| 408 | + arguments={"file_path": str(chapters / "04-data-types.html")}, | |
| 396 | 409 | ) |
| 397 | 410 | outcome = tool_outcome( |
| 398 | 411 | tool_call=tool_call, |
| 399 | - output="File not found: /tmp/fortran-qwen-recovery-check/chapters/04-data-types.html", | |
| 412 | + output=f"File not found: {chapters / '04-data-types.html'}", | |
| 400 | 413 | is_error=True, |
| 401 | 414 | ) |
| 402 | 415 | |
@@ -414,9 +427,92 @@ async def test_tool_batch_recovery_controller_suggests_known_sibling_files( | ||
| 414 | 427 | assert follow_up is not None |
| 415 | 428 | assert "## LIKELY FILE CANDIDATES" in follow_up.content |
| 416 | 429 | assert "`04-variables.html`" in follow_up.content |
| 430 | + assert "Chapter 4: Variables and Data Types" in follow_up.content | |
| 417 | 431 | assert "instead of retrying the missing path" in follow_up.content |
| 418 | 432 | |
| 419 | 433 | |
| 434 | +@pytest.mark.asyncio | |
| 435 | +async def test_tool_batch_recovery_controller_includes_current_html_target_excerpt( | |
| 436 | + temp_dir: Path, | |
| 437 | +) -> None: | |
| 438 | + async def assess_confidence(tool_name: str, tool_args: dict, context: str) -> ConfidenceAssessment: | |
| 439 | + raise AssertionError("Confidence should not run here") | |
| 440 | + | |
| 441 | + async def verify_action(tool_name: str, tool_args: dict, result: str, expected: str = "") -> ActionVerification: | |
| 442 | + raise AssertionError("Verification should not run here") | |
| 443 | + | |
| 444 | + chapters = temp_dir / "chapters" | |
| 445 | + chapters.mkdir() | |
| 446 | + (chapters / "01-introduction.html").write_text( | |
| 447 | + "<h1>Chapter 1: Introduction to Fortran</h1>\n" | |
| 448 | + ) | |
| 449 | + (chapters / "02-setup.html").write_text( | |
| 450 | + "<h1>Chapter 2: Setting Up Your Environment</h1>\n" | |
| 451 | + ) | |
| 452 | + index = temp_dir / "index.html" | |
| 453 | + index.write_text( | |
| 454 | + "<h2>Table of Contents</h2>\n" | |
| 455 | + "<ul class=\"chapter-list\">\n" | |
| 456 | + " <li><a href=\"chapters/01-introduction.html\">Chapter 1: Introduction to Fortran</a></li>\n" | |
| 457 | + " <li><a href=\"chapters/02-basic-syntax.html\">Chapter 2: Basic Syntax</a></li>\n" | |
| 458 | + "</ul>\n" | |
| 459 | + ) | |
| 460 | + | |
| 461 | + context = build_context( | |
| 462 | + temp_dir=temp_dir, | |
| 463 | + messages=[], | |
| 464 | + assess_confidence=assess_confidence, | |
| 465 | + verify_action=verify_action, | |
| 466 | + ) | |
| 467 | + controller = ToolBatchRecoveryController(context) | |
| 468 | + tool_call = ToolCall( | |
| 469 | + id="patch-index", | |
| 470 | + name="patch", | |
| 471 | + arguments={ | |
| 472 | + "file_path": str(index), | |
| 473 | + "hunks": [ | |
| 474 | + { | |
| 475 | + "old_start": 1, | |
| 476 | + "old_lines": 1, | |
| 477 | + "new_start": 1, | |
| 478 | + "new_lines": 1, | |
| 479 | + "lines": ["-bad", "+good"], | |
| 480 | + } | |
| 481 | + ], | |
| 482 | + }, | |
| 483 | + ) | |
| 484 | + outcome = tool_outcome( | |
| 485 | + tool_call=tool_call, | |
| 486 | + output="Patch failed: hunk did not apply cleanly", | |
| 487 | + is_error=True, | |
| 488 | + ) | |
| 489 | + | |
| 490 | + events: list[AgentEvent] = [] | |
| 491 | + | |
| 492 | + async def emit(event: AgentEvent) -> None: | |
| 493 | + events.append(event) | |
| 494 | + | |
| 495 | + follow_up = await controller.build_follow_up( | |
| 496 | + tool_call=tool_call, | |
| 497 | + outcome=outcome, | |
| 498 | + emit=emit, | |
| 499 | + ) | |
| 500 | + | |
| 501 | + assert follow_up is not None | |
| 502 | + assert "## CURRENT TARGET EXCERPT" in follow_up.content | |
| 503 | + assert "Verified chapter inventory:" in follow_up.content | |
| 504 | + assert "<ul class=\"chapter-list\">" in follow_up.content | |
| 505 | + assert "chapters/02-setup.html = Chapter 2: Setting Up Your Environment" in follow_up.content | |
| 506 | + assert "Suggested replacement block:" in follow_up.content | |
| 507 | + assert '<li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>' in follow_up.content | |
| 508 | + assert "Exact edit guidance:" in follow_up.content | |
| 509 | + assert "old_string: use the Current TOC block above exactly" in follow_up.content | |
| 510 | + assert "new_string: use the Suggested replacement block above exactly" in follow_up.content | |
| 511 | + assert "Do not rewrite the whole file." in follow_up.content | |
| 512 | + assert "Suggested edit call:" in follow_up.content | |
| 513 | + assert 'old_string="""' in follow_up.content | |
| 514 | + | |
| 515 | + | |
| 420 | 516 | @pytest.mark.asyncio |
| 421 | 517 | async def test_tool_batch_recovery_controller_reuses_context_for_related_missing_files( |
| 422 | 518 | temp_dir: Path, |
tests/test_tool_batches.pymodified@@ -453,12 +453,13 @@ async def test_tool_batch_runner_preserves_recovery_context_across_diagnostic_su | ||
| 453 | 453 | [tool_outcome(tool_call=tool_call, output="01-introduction.html", is_error=False)] |
| 454 | 454 | ) |
| 455 | 455 | |
| 456 | + summary = TurnSummary(final_response="") | |
| 456 | 457 | await runner.execute_batch( |
| 457 | 458 | tool_calls=[tool_call], |
| 458 | 459 | tool_source="assistant", |
| 459 | 460 | pending_tool_calls_seen=set(), |
| 460 | 461 | emit=_noop_emit, |
| 461 | - summary=TurnSummary(final_response=""), | |
| 462 | + summary=summary, | |
| 462 | 463 | dod=create_definition_of_done("Fix the chapter links"), |
| 463 | 464 | executor=executor, # type: ignore[arg-type] |
| 464 | 465 | on_confirmation=None, |
@@ -523,12 +524,13 @@ async def test_tool_batch_runner_clears_recovery_context_after_successful_mutati | ||
| 523 | 524 | [tool_outcome(tool_call=tool_call, output="Patched index.html", is_error=False)] |
| 524 | 525 | ) |
| 525 | 526 | |
| 527 | + summary = TurnSummary(final_response="") | |
| 526 | 528 | await runner.execute_batch( |
| 527 | 529 | tool_calls=[tool_call], |
| 528 | 530 | tool_source="assistant", |
| 529 | 531 | pending_tool_calls_seen=set(), |
| 530 | 532 | emit=_noop_emit, |
| 531 | - summary=TurnSummary(final_response=""), | |
| 533 | + summary=summary, | |
| 532 | 534 | dod=create_definition_of_done("Fix the chapter links"), |
| 533 | 535 | executor=executor, # type: ignore[arg-type] |
| 534 | 536 | on_confirmation=None, |
@@ -570,6 +572,22 @@ async def test_tool_batch_runner_queues_duplicate_observation_nudge( | ||
| 570 | 572 | ), |
| 571 | 573 | tool_results=[], |
| 572 | 574 | ), |
| 575 | + Message( | |
| 576 | + role=Role.ASSISTANT, | |
| 577 | + content="I already inspected the first chapter title.", | |
| 578 | + tool_calls=[ | |
| 579 | + ToolCall( | |
| 580 | + id="read-ch1", | |
| 581 | + name="read", | |
| 582 | + arguments={"file_path": str(temp_dir / 'chapters' / '01-introduction.html')}, | |
| 583 | + ) | |
| 584 | + ], | |
| 585 | + ), | |
| 586 | + Message.tool_result_message( | |
| 587 | + tool_call_id="read-ch1", | |
| 588 | + display_content="<h1>Chapter 1: Introduction to Fortran</h1>\n", | |
| 589 | + result_content="<h1>Chapter 1: Introduction to Fortran</h1>\n", | |
| 590 | + ), | |
| 573 | 591 | Message( |
| 574 | 592 | role=Role.ASSISTANT, |
| 575 | 593 | content="I should update the index now.", |
@@ -623,12 +641,13 @@ async def test_tool_batch_runner_queues_duplicate_observation_nudge( | ||
| 623 | 641 | ] |
| 624 | 642 | ) |
| 625 | 643 | |
| 644 | + summary = TurnSummary(final_response="") | |
| 626 | 645 | await runner.execute_batch( |
| 627 | 646 | tool_calls=[tool_call], |
| 628 | 647 | tool_source="assistant", |
| 629 | 648 | pending_tool_calls_seen=set(), |
| 630 | 649 | emit=_noop_emit, |
| 631 | - summary=TurnSummary(final_response=""), | |
| 650 | + summary=summary, | |
| 632 | 651 | dod=create_definition_of_done("Fix the chapter links"), |
| 633 | 652 | executor=executor, # type: ignore[arg-type] |
| 634 | 653 | on_confirmation=None, |
@@ -639,9 +658,196 @@ async def test_tool_batch_runner_queues_duplicate_observation_nudge( | ||
| 639 | 658 | |
| 640 | 659 | assert len(queued_messages) == 1 |
| 641 | 660 | assert "Reuse the earlier observation instead of repeating it." in queued_messages[0] |
| 661 | + assert "01-introduction.html = Chapter 1: Introduction to Fortran" in queued_messages[0] | |
| 642 | 662 | assert "index.html" in queued_messages[0] |
| 643 | 663 | |
| 644 | 664 | |
| 665 | +@pytest.mark.asyncio | |
| 666 | +async def test_tool_batch_runner_proactively_queues_verified_html_inventory( | |
| 667 | + temp_dir: Path, | |
| 668 | +) -> None: | |
| 669 | + async def assess_confidence( | |
| 670 | + tool_name: str, | |
| 671 | + tool_args: dict, | |
| 672 | + context: str, | |
| 673 | + ) -> ConfidenceAssessment: | |
| 674 | + raise AssertionError("Confidence scoring should be disabled in this scenario") | |
| 675 | + | |
| 676 | + async def verify_action( | |
| 677 | + tool_name: str, | |
| 678 | + tool_args: dict, | |
| 679 | + result: str, | |
| 680 | + expected: str = "", | |
| 681 | + ) -> ActionVerification: | |
| 682 | + raise AssertionError("Verification should not run for this scenario") | |
| 683 | + | |
| 684 | + chapters = temp_dir / "chapters" | |
| 685 | + chapters.mkdir() | |
| 686 | + (chapters / "01-introduction.html").write_text( | |
| 687 | + "<h1>Chapter 1: Introduction to Fortran</h1>\n" | |
| 688 | + ) | |
| 689 | + (chapters / "02-setup.html").write_text( | |
| 690 | + "<h1>Chapter 2: Setting Up Your Environment</h1>\n" | |
| 691 | + ) | |
| 692 | + (temp_dir / "index.html").write_text("<ul></ul>\n") | |
| 693 | + | |
| 694 | + context = build_context( | |
| 695 | + temp_dir=temp_dir, | |
| 696 | + messages=[], | |
| 697 | + safeguards=FakeSafeguards(), | |
| 698 | + assess_confidence=assess_confidence, | |
| 699 | + verify_action=verify_action, | |
| 700 | + auto_recover=False, | |
| 701 | + ) | |
| 702 | + context.session.current_task = ( | |
| 703 | + f"Update {temp_dir / 'index.html'} so the chapter links match the sibling files." | |
| 704 | + ) | |
| 705 | + queued_messages: list[str] = [] | |
| 706 | + context.queue_steering_message_callback = queued_messages.append | |
| 707 | + runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) | |
| 708 | + tool_call = ToolCall( | |
| 709 | + id="glob-1", | |
| 710 | + name="glob", | |
| 711 | + arguments={"path": str(chapters), "pattern": "*.html"}, | |
| 712 | + ) | |
| 713 | + executor = FakeExecutor( | |
| 714 | + [ | |
| 715 | + tool_outcome( | |
| 716 | + tool_call=tool_call, | |
| 717 | + output="\n".join( | |
| 718 | + [ | |
| 719 | + str(chapters / "01-introduction.html"), | |
| 720 | + str(chapters / "02-setup.html"), | |
| 721 | + ] | |
| 722 | + ), | |
| 723 | + is_error=False, | |
| 724 | + ) | |
| 725 | + ] | |
| 726 | + ) | |
| 727 | + | |
| 728 | + summary = TurnSummary(final_response="") | |
| 729 | + await runner.execute_batch( | |
| 730 | + tool_calls=[tool_call], | |
| 731 | + tool_source="assistant", | |
| 732 | + pending_tool_calls_seen=set(), | |
| 733 | + emit=_noop_emit, | |
| 734 | + summary=summary, | |
| 735 | + dod=create_definition_of_done("Fix the chapter links"), | |
| 736 | + executor=executor, # type: ignore[arg-type] | |
| 737 | + on_confirmation=None, | |
| 738 | + on_user_question=None, | |
| 739 | + emit_confirmation=None, | |
| 740 | + consecutive_errors=0, | |
| 741 | + ) | |
| 742 | + | |
| 743 | + assert len(queued_messages) == 1 | |
| 744 | + assert "verified sibling inventory" in queued_messages[0] | |
| 745 | + assert "chapters/01-introduction.html = Chapter 1: Introduction to Fortran" in queued_messages[0] | |
| 746 | + assert str(temp_dir / "index.html") in queued_messages[0] | |
| 747 | + assert len(summary.tool_result_messages) == 1 | |
| 748 | + assert ( | |
| 749 | + "Verified chapter inventory: chapters/01-introduction.html = Chapter 1: Introduction to Fortran" | |
| 750 | + in summary.tool_result_messages[0].content | |
| 751 | + ) | |
| 752 | + | |
| 753 | + | |
| 754 | +@pytest.mark.asyncio | |
| 755 | +async def test_tool_batch_runner_marks_validated_html_toc_completion_after_successful_edit( | |
| 756 | + temp_dir: Path, | |
| 757 | +) -> None: | |
| 758 | + async def assess_confidence( | |
| 759 | + tool_name: str, | |
| 760 | + tool_args: dict, | |
| 761 | + context: str, | |
| 762 | + ) -> ConfidenceAssessment: | |
| 763 | + raise AssertionError("Confidence scoring should be disabled in this scenario") | |
| 764 | + | |
| 765 | + async def verify_action( | |
| 766 | + tool_name: str, | |
| 767 | + tool_args: dict, | |
| 768 | + result: str, | |
| 769 | + expected: str = "", | |
| 770 | + ) -> ActionVerification: | |
| 771 | + raise AssertionError("Verification should not run for this scenario") | |
| 772 | + | |
| 773 | + chapters = temp_dir / "chapters" | |
| 774 | + chapters.mkdir() | |
| 775 | + (chapters / "01-introduction.html").write_text( | |
| 776 | + "<h1>Chapter 1: Introduction to Fortran</h1>\n" | |
| 777 | + ) | |
| 778 | + (chapters / "02-setup.html").write_text( | |
| 779 | + "<h1>Chapter 2: Setting Up Your Environment</h1>\n" | |
| 780 | + ) | |
| 781 | + index_path = temp_dir / "index.html" | |
| 782 | + old_block = ( | |
| 783 | + '<ul class="chapter-list">\n' | |
| 784 | + ' <li><a href="chapters/01-old.html">Chapter 1: Old</a></li>\n' | |
| 785 | + ' <li><a href="chapters/02-old.html">Chapter 2: Old</a></li>\n' | |
| 786 | + "</ul>\n" | |
| 787 | + ) | |
| 788 | + new_block = ( | |
| 789 | + '<ul class="chapter-list">\n' | |
| 790 | + ' <li><a href="chapters/01-introduction.html">Chapter 1: Introduction to Fortran</a></li>\n' | |
| 791 | + ' <li><a href="chapters/02-setup.html">Chapter 2: Setting Up Your Environment</a></li>\n' | |
| 792 | + "</ul>\n" | |
| 793 | + ) | |
| 794 | + index_path.write_text(new_block) | |
| 795 | + | |
| 796 | + context = build_context( | |
| 797 | + temp_dir=temp_dir, | |
| 798 | + messages=[], | |
| 799 | + safeguards=FakeSafeguards(), | |
| 800 | + assess_confidence=assess_confidence, | |
| 801 | + verify_action=verify_action, | |
| 802 | + auto_recover=False, | |
| 803 | + ) | |
| 804 | + queued_messages: list[str] = [] | |
| 805 | + context.queue_steering_message_callback = queued_messages.append | |
| 806 | + runner = ToolBatchRunner(context, DefinitionOfDoneStore(temp_dir)) | |
| 807 | + tool_call = ToolCall( | |
| 808 | + id="edit-1", | |
| 809 | + name="edit", | |
| 810 | + arguments={ | |
| 811 | + "file_path": str(index_path), | |
| 812 | + "old_string": old_block, | |
| 813 | + "new_string": new_block, | |
| 814 | + }, | |
| 815 | + ) | |
| 816 | + executor = FakeExecutor( | |
| 817 | + [ | |
| 818 | + tool_outcome( | |
| 819 | + tool_call=tool_call, | |
| 820 | + output=f"Successfully edited {index_path}", | |
| 821 | + is_error=False, | |
| 822 | + ) | |
| 823 | + ] | |
| 824 | + ) | |
| 825 | + | |
| 826 | + summary = TurnSummary(final_response="") | |
| 827 | + await runner.execute_batch( | |
| 828 | + tool_calls=[tool_call], | |
| 829 | + tool_source="assistant", | |
| 830 | + pending_tool_calls_seen=set(), | |
| 831 | + emit=_noop_emit, | |
| 832 | + summary=summary, | |
| 833 | + dod=create_definition_of_done("Fix the chapter links"), | |
| 834 | + executor=executor, # type: ignore[arg-type] | |
| 835 | + on_confirmation=None, | |
| 836 | + on_user_question=None, | |
| 837 | + emit_confirmation=None, | |
| 838 | + consecutive_errors=0, | |
| 839 | + ) | |
| 840 | + | |
| 841 | + assert any( | |
| 842 | + "Semantic verification preview: validated 2 toc links in index.html" | |
| 843 | + in message.content | |
| 844 | + for message in summary.tool_result_messages | |
| 845 | + ) | |
| 846 | + assert len(queued_messages) == 1 | |
| 847 | + assert "already satisfies the verified chapter-link constraints" in queued_messages[0] | |
| 848 | + assert "Do not reread `index.html` or files in `chapters/`" in queued_messages[0] | |
| 849 | + | |
| 850 | + | |
| 645 | 851 | async def _noop_emit(event: AgentEvent) -> None: |
| 646 | 852 | return None |
| 647 | 853 | |
tests/test_turn_preparation.pymodified@@ -10,6 +10,7 @@ from loader.agent.loop import AgentConfig | ||
| 10 | 10 | from loader.llm.base import CompletionResponse, ToolCall |
| 11 | 11 | from loader.runtime.completion_trace import CompletionTraceEntry |
| 12 | 12 | from loader.runtime.conversation import ConversationRuntime |
| 13 | +from loader.runtime.dod import DefinitionOfDoneStore, create_definition_of_done | |
| 13 | 14 | from loader.runtime.runtime_handle import RuntimeHandle |
| 14 | 15 | from tests.helpers.runtime_harness import ScriptedBackend |
| 15 | 16 | |
@@ -170,3 +171,79 @@ async def test_turn_preparation_can_bootstrap_clarify_handoff( | ||
| 170 | 171 | for event in events |
| 171 | 172 | if event.type == "workflow_mode" and event.workflow_mode |
| 172 | 173 | ] == ["clarify", "execute"] |
| 174 | + | |
| 175 | + | |
| 176 | +@pytest.mark.asyncio | |
| 177 | +async def test_turn_preparation_does_not_resume_latest_dod_from_older_session( | |
| 178 | + temp_dir: Path, | |
| 179 | +) -> None: | |
| 180 | + backend = ScriptedBackend() | |
| 181 | + handle = RuntimeHandle( | |
| 182 | + backend=backend, | |
| 183 | + config=non_streaming_config(), | |
| 184 | + project_root=temp_dir, | |
| 185 | + ) | |
| 186 | + runtime = ConversationRuntime(handle) | |
| 187 | + task = "Update /tmp/fortran/index.html so the chapter list matches the real files." | |
| 188 | + | |
| 189 | + stale_dod = create_definition_of_done(task) | |
| 190 | + stale_dod.status = "fixing" | |
| 191 | + stale_dod.touched_files.append("/tmp/fortran/index.html") | |
| 192 | + stale_dod.mutating_actions.append("edit") | |
| 193 | + stale_path = DefinitionOfDoneStore(temp_dir).save(stale_dod) | |
| 194 | + | |
| 195 | + events = [] | |
| 196 | + | |
| 197 | + async def capture(event) -> None: | |
| 198 | + events.append(event) | |
| 199 | + | |
| 200 | + prepared = await runtime.turn_preparation.prepare( | |
| 201 | + task=task, | |
| 202 | + emit=capture, | |
| 203 | + requested_mode="execute", | |
| 204 | + original_task=None, | |
| 205 | + on_user_question=None, | |
| 206 | + ) | |
| 207 | + | |
| 208 | + assert prepared.definition_of_done.storage_path != str(stale_path) | |
| 209 | + assert prepared.definition_of_done.touched_files == [] | |
| 210 | + assert prepared.definition_of_done.mutating_actions == [] | |
| 211 | + assert prepared.definition_of_done.pending_items == ["Complete the requested work"] | |
| 212 | + | |
| 213 | + | |
| 214 | +@pytest.mark.asyncio | |
| 215 | +async def test_turn_preparation_resumes_active_session_dod( | |
| 216 | + temp_dir: Path, | |
| 217 | +) -> None: | |
| 218 | + backend = ScriptedBackend() | |
| 219 | + handle = RuntimeHandle( | |
| 220 | + backend=backend, | |
| 221 | + config=non_streaming_config(), | |
| 222 | + project_root=temp_dir, | |
| 223 | + ) | |
| 224 | + runtime = ConversationRuntime(handle) | |
| 225 | + task = "Keep repairing the runtime state controller." | |
| 226 | + | |
| 227 | + existing_dod = create_definition_of_done(task) | |
| 228 | + existing_dod.status = "fixing" | |
| 229 | + existing_dod.pending_items.append("Collect verification evidence") | |
| 230 | + existing_dod.touched_files.append(str(temp_dir / "index.html")) | |
| 231 | + existing_path = DefinitionOfDoneStore(temp_dir).save(existing_dod) | |
| 232 | + handle.session.active_dod_path = str(existing_path) | |
| 233 | + | |
| 234 | + events = [] | |
| 235 | + | |
| 236 | + async def capture(event) -> None: | |
| 237 | + events.append(event) | |
| 238 | + | |
| 239 | + prepared = await runtime.turn_preparation.prepare( | |
| 240 | + task=task, | |
| 241 | + emit=capture, | |
| 242 | + requested_mode="execute", | |
| 243 | + original_task=None, | |
| 244 | + on_user_question=None, | |
| 245 | + ) | |
| 246 | + | |
| 247 | + assert prepared.definition_of_done.storage_path == str(existing_path) | |
| 248 | + assert prepared.definition_of_done.touched_files == [str(temp_dir / "index.html")] | |
| 249 | + assert prepared.definition_of_done.status == "fixing" | |
tests/test_workflow.pymodified@@ -205,6 +205,43 @@ def test_extract_verification_commands_from_markdown_splits_code_blocks() -> Non | ||
| 205 | 205 | ] |
| 206 | 206 | |
| 207 | 207 | |
| 208 | +def test_extract_verification_commands_from_markdown_ignores_prose_only_bullets() -> None: | |
| 209 | + markdown = "\n".join( | |
| 210 | + [ | |
| 211 | + "# Verification Plan", | |
| 212 | + "", | |
| 213 | + "## Verification Commands", | |
| 214 | + "- Check that all chapter links in index.html resolve to existing files", | |
| 215 | + "- Validate chapter titles with `python3 scripts/check_titles.py`", | |
| 216 | + "- `test -f index.html`", | |
| 217 | + ] | |
| 218 | + ) | |
| 219 | + | |
| 220 | + assert extract_verification_commands_from_markdown(markdown) == [ | |
| 221 | + "python3 scripts/check_titles.py", | |
| 222 | + "test -f index.html", | |
| 223 | + ] | |
| 224 | + | |
| 225 | + | |
| 226 | +def test_extract_verification_commands_keeps_shell_pipelines_intact() -> None: | |
| 227 | + markdown = "\n".join( | |
| 228 | + [ | |
| 229 | + "# Verification Plan", | |
| 230 | + "", | |
| 231 | + "## Verification Commands", | |
| 232 | + "```bash", | |
| 233 | + "ls -la chapters/", | |
| 234 | + "cat index.html | head -20", | |
| 235 | + "```", | |
| 236 | + ] | |
| 237 | + ) | |
| 238 | + | |
| 239 | + assert extract_verification_commands_from_markdown(markdown) == [ | |
| 240 | + "ls -la chapters/", | |
| 241 | + "cat index.html | head -20", | |
| 242 | + ] | |
| 243 | + | |
| 244 | + | |
| 208 | 245 | def test_workflow_artifact_store_and_bridge_round_trip(tmp_path: Path) -> None: |
| 209 | 246 | store = WorkflowArtifactStore(tmp_path) |
| 210 | 247 | brief = ClarifyBrief.fallback( |