tenseleyflow/loader / 9d7e370

Browse files

Derive guide quality floors

Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA
9d7e3708ac86db0aba9efa59cc8413b342209d80
Parents
aa9c340
Tree
6fb71f0

2 changed files

Status File + -
M src/loader/runtime/dod.py 207 2
M tests/test_dod.py 74 0
src/loader/runtime/dod.py modified
@@ -97,6 +97,18 @@ _MUTATING_FILE_CHANGE_HINTS = (
97
     "developing",
97
     "developing",
98
 )
98
 )
99
 _MINIMUM_SUBSTANTIVE_HTML_GUIDE_PAGES = 4
99
 _MINIMUM_SUBSTANTIVE_HTML_GUIDE_PAGES = 4
100
# Baseline density floors: the defaults of HtmlGuideQualityFloor and the lower
# bound passed as ``minimum`` when a floor is derived from a reference guide.
_MINIMUM_SUBSTANTIVE_HTML_INDEX_CHARS = 400
_MINIMUM_SUBSTANTIVE_HTML_INDEX_BLOCKS = 4
_MINIMUM_SUBSTANTIVE_HTML_CHAPTER_CHARS = 900
_MINIMUM_SUBSTANTIVE_HTML_CHAPTER_BLOCKS = 8

# Share of the reference guide's lower-quartile measurement that a derived
# floor is scaled to.
_REFERENCE_HTML_INDEX_CHAR_FRACTION = 0.35
_REFERENCE_HTML_INDEX_BLOCK_FRACTION = 0.35
_REFERENCE_HTML_CHAPTER_CHAR_FRACTION = 0.50
_REFERENCE_HTML_CHAPTER_BLOCK_FRACTION = 0.45

# Upper bounds applied to the scaled reference measurement.
_REFERENCE_HTML_INDEX_CHAR_CAP = 1400
_REFERENCE_HTML_INDEX_BLOCK_CAP = 12
_REFERENCE_HTML_CHAPTER_CHAR_CAP = 2200
_REFERENCE_HTML_CHAPTER_BLOCK_CAP = 18
100
 
112
 
101
 
113
 
102
 @dataclass
114
 @dataclass
@@ -113,6 +125,16 @@ class VerificationEvidence:
113
     kind: VerificationKind = "runtime"
125
     kind: VerificationKind = "runtime"
114
 
126
 
115
 
127
 
128
@dataclass(frozen=True)
class HtmlGuideQualityFloor:
    """Per-page density thresholds for a generated multi-page HTML guide.

    The ``index_*`` thresholds are applied to pages named ``index.html``;
    the ``chapter_*`` thresholds are applied to every other page. Each
    field defaults to the corresponding module-level baseline constant.
    """

    # Minimum plain-text length (characters) for the index page.
    index_chars: int = _MINIMUM_SUBSTANTIVE_HTML_INDEX_CHARS
    # Minimum number of content-block tags for the index page.
    index_blocks: int = _MINIMUM_SUBSTANTIVE_HTML_INDEX_BLOCKS
    # Minimum plain-text length (characters) for a non-index page.
    chapter_chars: int = _MINIMUM_SUBSTANTIVE_HTML_CHAPTER_CHARS
    # Minimum number of content-block tags for a non-index page.
    chapter_blocks: int = _MINIMUM_SUBSTANTIVE_HTML_CHAPTER_BLOCKS
136
+
137
+
116
 @dataclass
138
 @dataclass
117
 class DefinitionOfDone:
139
 class DefinitionOfDone:
118
     """Single source of truth for task completion state."""
140
     """Single source of truth for task completion state."""
@@ -727,6 +749,11 @@ def _derive_multi_page_html_quality_command(
727
         return None
749
         return None
728
 
750
 
729
     path_literals = ", ".join(repr(str(path)) for path in html_paths)
751
     path_literals = ", ".join(repr(str(path)) for path in html_paths)
752
+    quality_floor = _derive_html_guide_quality_floor(
753
+        task_statement,
754
+        output_paths=html_paths,
755
+        project_root=project_root,
756
+    )
730
     return "\n".join(
757
     return "\n".join(
731
         [
758
         [
732
             "python3 - <<'PY'",
759
             "python3 - <<'PY'",
@@ -734,6 +761,10 @@ def _derive_multi_page_html_quality_command(
734
             "import re",
761
             "import re",
735
             "",
762
             "",
736
             f"paths = [{path_literals}]",
763
             f"paths = [{path_literals}]",
764
+            f"minimum_index_chars = {quality_floor.index_chars}",
765
+            f"minimum_index_blocks = {quality_floor.index_blocks}",
766
+            f"minimum_chapter_chars = {quality_floor.chapter_chars}",
767
+            f"minimum_chapter_blocks = {quality_floor.chapter_blocks}",
737
             "tag_pattern = re.compile(r'<[^>]+>')",
768
             "tag_pattern = re.compile(r'<[^>]+>')",
738
             "content_block_pattern = re.compile(r'<(p|li|pre|code|section|article|table|h2|h3|h4)\\b', re.IGNORECASE)",
769
             "content_block_pattern = re.compile(r'<(p|li|pre|code|section|article|table|h2|h3|h4)\\b', re.IGNORECASE)",
739
             "issues = []",
770
             "issues = []",
@@ -748,8 +779,8 @@ def _derive_multi_page_html_quality_command(
748
             "    plain = re.sub(r'\\s+', ' ', plain).strip()",
779
             "    plain = re.sub(r'\\s+', ' ', plain).strip()",
749
             "    content_blocks = len(content_block_pattern.findall(text))",
780
             "    content_blocks = len(content_block_pattern.findall(text))",
750
             "    has_h1 = bool(re.search(r'<h1\\b', text, re.IGNORECASE))",
781
             "    has_h1 = bool(re.search(r'<h1\\b', text, re.IGNORECASE))",
751
-            "    minimum_chars = 180 if path.name.lower() == 'index.html' else 220",
782
+            "    minimum_chars = minimum_index_chars if path.name.lower() == 'index.html' else minimum_chapter_chars",
752
-            "    minimum_blocks = 2 if path.name.lower() == 'index.html' else 3",
783
+            "    minimum_blocks = minimum_index_blocks if path.name.lower() == 'index.html' else minimum_chapter_blocks",
753
             "    if not has_h1:",
784
             "    if not has_h1:",
754
             "        issues.append(f'{path}: missing <h1>')",
785
             "        issues.append(f'{path}: missing <h1>')",
755
             "    if len(plain) < minimum_chars:",
786
             "    if len(plain) < minimum_chars:",
@@ -770,6 +801,180 @@ def _derive_multi_page_html_quality_command(
770
     )
801
     )
771
 
802
 
772
 
803
 
804
def _derive_html_guide_quality_floor(
    task_statement: str,
    *,
    output_paths: list[Path],
    project_root: Path,
) -> HtmlGuideQualityFloor:
    """Derive content-density floors from reference guides named in the task.

    Every reference HTML page found via the task statement is measured for
    plain-text length (tags stripped, whitespace collapsed) and for the
    number of content-block opening tags. Pages named ``index.html``
    (case-insensitive) feed the index floors; all other pages feed the
    chapter floors. Each floor is the lower quartile of its samples, scaled
    by a fraction and clamped between the baseline minimum and a cap. When
    one page type has no usable samples, the other type's lower quartile is
    used in its place.

    Args:
        task_statement: Free-form task text scanned for reference paths.
        output_paths: Pages the task will generate; used to exclude the
            output tree from the reference set.
        project_root: Base directory for relative path mentions.

    Returns:
        The derived floor, or the default ``HtmlGuideQualityFloor()`` when
        no reference page could be found or read.
    """
    reference_paths = _reference_html_guide_paths_from_task(
        task_statement,
        output_paths=output_paths,
        project_root=project_root,
    )
    if not reference_paths:
        return HtmlGuideQualityFloor()

    index_chars: list[int] = []
    index_blocks: list[int] = []
    chapter_chars: list[int] = []
    chapter_blocks: list[int] = []
    for path in reference_paths:
        try:
            # Decode explicitly and tolerate stray bytes: the default
            # read_text() uses the locale encoding and raises
            # UnicodeDecodeError (a ValueError, not an OSError) on bad input,
            # which would abort the whole derivation for one odd page.
            text = path.read_text(encoding="utf-8", errors="replace")
        except OSError:
            continue
        plain = re.sub(r"\s+", " ", re.sub(r"<[^>]+>", " ", text)).strip()
        blocks = len(
            re.findall(
                r"<(p|li|pre|code|section|article|table|h2|h3|h4)\b",
                text,
                re.IGNORECASE,
            )
        )
        if path.name.lower() == "index.html":
            index_chars.append(len(plain))
            index_blocks.append(blocks)
        else:
            chapter_chars.append(len(plain))
            chapter_blocks.append(blocks)

    # Every reference page was unreadable: fall back to the baseline floor.
    if not chapter_chars and not index_chars:
        return HtmlGuideQualityFloor()

    return HtmlGuideQualityFloor(
        index_chars=_reference_quality_floor(
            _lower_quartile(index_chars) or _lower_quartile(chapter_chars),
            fraction=_REFERENCE_HTML_INDEX_CHAR_FRACTION,
            minimum=_MINIMUM_SUBSTANTIVE_HTML_INDEX_CHARS,
            cap=_REFERENCE_HTML_INDEX_CHAR_CAP,
        ),
        index_blocks=_reference_quality_floor(
            _lower_quartile(index_blocks) or _lower_quartile(chapter_blocks),
            fraction=_REFERENCE_HTML_INDEX_BLOCK_FRACTION,
            minimum=_MINIMUM_SUBSTANTIVE_HTML_INDEX_BLOCKS,
            cap=_REFERENCE_HTML_INDEX_BLOCK_CAP,
        ),
        chapter_chars=_reference_quality_floor(
            _lower_quartile(chapter_chars) or _lower_quartile(index_chars),
            fraction=_REFERENCE_HTML_CHAPTER_CHAR_FRACTION,
            minimum=_MINIMUM_SUBSTANTIVE_HTML_CHAPTER_CHARS,
            cap=_REFERENCE_HTML_CHAPTER_CHAR_CAP,
        ),
        chapter_blocks=_reference_quality_floor(
            _lower_quartile(chapter_blocks) or _lower_quartile(index_blocks),
            fraction=_REFERENCE_HTML_CHAPTER_BLOCK_FRACTION,
            minimum=_MINIMUM_SUBSTANTIVE_HTML_CHAPTER_BLOCKS,
            cap=_REFERENCE_HTML_CHAPTER_BLOCK_CAP,
        ),
    )
871
+
872
+
873
def _reference_html_guide_paths_from_task(
    task_statement: str,
    *,
    output_paths: list[Path],
    project_root: Path,
) -> list[Path]:
    """Return reference HTML pages mentioned in the task, excluding outputs.

    Path mentions are extracted from ``task_statement``; relative ones are
    anchored at ``project_root``. A mention that lies inside an output scope
    root is skipped, and so is any individual page collected from a
    mentioned directory that lies inside an output scope root (this happens
    when the task names a parent directory of the output tree).

    Args:
        task_statement: Free-form task text scanned for path mentions.
        output_paths: Pages the task will generate.
        project_root: Base directory for relative path mentions.

    Returns:
        De-duplicated reference pages in first-seen order.
    """
    output_roots = _html_output_scope_roots(output_paths)

    def _inside_output_scope(target: Path) -> bool:
        return any(_path_is_within_root(target, root) for root in output_roots)

    paths: list[Path] = []
    seen: set[str] = set()
    for raw_path in _extract_task_path_mentions(task_statement):
        path = Path(raw_path).expanduser()
        if not path.is_absolute():
            path = project_root / path
        try:
            resolved = path.resolve(strict=False)
        except (OSError, RuntimeError, ValueError):
            continue
        if _inside_output_scope(resolved):
            continue
        for candidate in _collect_reference_html_paths(resolved):
            # A mentioned directory may contain the output tree; generated
            # pages must never be measured as their own reference.
            if _inside_output_scope(candidate):
                continue
            key = str(candidate)
            if key in seen:
                continue
            seen.add(key)
            paths.append(candidate)
    return paths
899
+
900
+
901
def _html_output_scope_roots(output_paths: list[Path]) -> tuple[Path, ...]:
    """Return the distinct directories that contain the generated guide.

    Each output path is expanded and resolved, and its parent directory is
    taken as the scope root. When that parent is named ``chapters``,
    ``pages`` or ``sections`` (case-insensitive), the directory one level up
    is used instead. Paths that cannot be resolved are ignored. Roots are
    returned in first-seen order.
    """
    nested_page_dirs = {"chapters", "pages", "sections"}
    # dict preserves insertion order, giving ordered de-duplication by key.
    unique_roots: dict[str, Path] = {}
    for output_path in output_paths:
        try:
            absolute = output_path.expanduser().resolve(strict=False)
        except (OSError, RuntimeError, ValueError):
            continue
        scope = absolute.parent
        if scope.name.lower() in nested_page_dirs:
            scope = scope.parent
        unique_roots.setdefault(str(scope), scope)
    return tuple(unique_roots.values())
918
+
919
+
920
def _extract_task_path_mentions(task_statement: str) -> list[str]:
    """Extract path-like literals mentioned in a task statement.

    Candidates are every backtick-quoted span plus every bare token that
    starts with ``~``, ``/``, ``./`` or ``../``. Each candidate is stripped
    of surrounding whitespace, quotes and punctuation, then kept only if
    ``_looks_like_path_literal`` accepts it.

    Returns:
        De-duplicated path strings in first-seen order.
    """
    candidates = re.findall(r"`([^`]+)`", task_statement)
    candidates.extend(
        re.findall(
            # Both dots of the parent-directory prefix are escaped; an
            # unescaped second dot would match ".X/" for any character X.
            r"(?:~|/|\./|\.\./)[A-Za-z0-9_./~+-]+",
            task_statement,
        )
    )

    paths: list[str] = []
    seen: set[str] = set()
    for candidate in candidates:
        cleaned = candidate.strip().strip("`'\",.:;()[]{}")
        if not cleaned or cleaned in seen:
            continue
        if not _looks_like_path_literal(cleaned):
            continue
        seen.add(cleaned)
        paths.append(cleaned)
    return paths
940
+
941
+
942
def _collect_reference_html_paths(path: Path) -> list[Path]:
    """Collect HTML pages from a reference file or directory.

    Args:
        path: A single ``.html``/``.htm`` file, or a directory to search
            recursively for ``*.html`` files.

    Returns:
        ``[path]`` for an HTML file; for a directory, up to 32 distinct
        pages with the top-level ``index.html`` (when present) first and
        the remaining pages in sorted order; an empty list otherwise. If
        the directory walk fails with ``OSError``, only what was gathered
        before the walk is returned.
    """
    if path.is_file() and path.suffix.lower() in {".html", ".htm"}:
        return [path]
    if not path.is_dir():
        return []

    max_pages = 32
    candidates: list[Path] = []
    index = path / "index.html"
    if index.is_file():
        candidates.append(index)
    try:
        candidates.extend(sorted(child for child in path.rglob("*.html") if child.is_file()))
    except OSError:
        return candidates
    # De-duplicate before truncating: the index page also appears in the
    # recursive listing, so truncating first would waste a slot on it.
    return list(dict.fromkeys(candidates))[:max_pages]
957
+
958
+
959
def _lower_quartile(values: list[int]) -> int:
    """Return the element at the lower-quartile position of ``values``.

    The position is ``(len(values) - 1) // 4`` in ascending order, so lists
    of up to four items yield their minimum. An empty list yields ``0``.
    """
    if values:
        ranked = sorted(values)
        # Non-negative for any non-empty list, so no clamping is needed.
        quartile_index = (len(ranked) - 1) // 4
        return ranked[quartile_index]
    return 0
964
+
965
+
966
def _reference_quality_floor(
    value: int,
    *,
    fraction: float,
    minimum: int,
    cap: int,
) -> int:
    """Scale a reference measurement into a floor clamped to ``[minimum, cap]``.

    A non-positive ``value`` means there is nothing to scale, so ``minimum``
    is returned. Otherwise ``value * fraction`` is truncated to an int,
    limited to at most ``cap``, and then raised to at least ``minimum``
    (``minimum`` wins if it exceeds ``cap``).
    """
    if value <= 0:
        return minimum
    scaled = int(value * fraction)
    limited = scaled if scaled < cap else cap
    return limited if limited > minimum else minimum
976
+
977
+
773
 def collect_planned_artifact_targets(
978
 def collect_planned_artifact_targets(
774
     dod: DefinitionOfDone,
979
     dod: DefinitionOfDone,
775
     *,
980
     *,
tests/test_dod.py modified
@@ -1,5 +1,6 @@
1
 """Tests for definition-of-done state and persistence."""
1
 """Tests for definition-of-done state and persistence."""
2
 
2
 
3
+import subprocess
3
 from pathlib import Path
4
 from pathlib import Path
4
 
5
 
5
 from loader.llm.base import ToolCall
6
 from loader.llm.base import ToolCall
@@ -262,6 +263,79 @@ def test_derive_verification_commands_adds_html_guide_quality_check_for_thorough
262
     assert any("HTML guide content quality issues:" in command for command in commands)
263
     assert any("HTML guide content quality issues:" in command for command in commands)
263
 
264
 
264
 
265
 
266
def test_derive_verification_commands_uses_reference_guide_depth_floor(
    tmp_path: Path,
) -> None:
    """A chapter much thinner than the reference guide is flagged as thin."""

    def sectioned_page(title: str, paragraph_chars: int) -> str:
        # Ten <h2>/<p> pairs under one <h1>; only the paragraph length varies.
        sections = "".join(
            f"<h2>Section {section}</h2><p>{'x' * paragraph_chars}</p>"
            for section in range(10)
        )
        return f"<h1>{title}</h1>" + sections

    # Dense reference guide that the task statement points at.
    reference = tmp_path / "reference"
    reference_chapters = reference / "chapters"
    reference_chapters.mkdir(parents=True)
    (reference / "index.html").write_text("<h1>Reference</h1>" + "<p>" + "i" * 1600 + "</p>")
    for number in range(1, 5):
        (reference_chapters / f"0{number}-topic.html").write_text(
            sectioned_page("Reference Chapter", 300)
        )

    # Generated guide: the introduction is deliberately thinner than the rest.
    guide = tmp_path / "guide"
    chapters = guide / "chapters"
    chapters.mkdir(parents=True)
    (guide / "index.html").write_text("<h1>Guide</h1>" + "<p>" + "i" * 1000 + "</p>")
    (chapters / "01-introduction.html").write_text(sectioned_page("Intro", 110))
    for number in range(2, 5):
        (chapters / f"0{number}-topic.html").write_text(sectioned_page("Topic", 220))

    planned_pages = [
        guide / "index.html",
        chapters / "01-introduction.html",
        chapters / "02-topic.html",
        chapters / "03-topic.html",
        chapters / "04-topic.html",
    ]
    plan_lines = ["# Implementation Plan", "", "## File Changes"]
    plan_lines.extend(f"- `{page}`" for page in planned_pages)
    plan_lines.append("")
    implementation_plan = tmp_path / "implementation.md"
    implementation_plan.write_text("\n".join(plan_lines))

    dod = create_definition_of_done(
        f"Create an equally thorough HTML guide modeled on {reference} at {guide}."
    )
    dod.implementation_plan = str(implementation_plan)

    commands = derive_verification_commands(
        dod,
        project_root=tmp_path,
        task_statement=dod.task_statement,
        supplement_existing=True,
    )
    quality_command = next(
        command for command in commands if "HTML guide content quality issues:" in command
    )

    # The derived command is a shell heredoc, so it has to run through a shell.
    result = subprocess.run(
        quality_command,
        shell=True,
        cwd=tmp_path,
        capture_output=True,
        text=True,
        check=False,
    )

    assert result.returncode == 1
    assert "01-introduction.html: thin content" in result.stdout
    assert "expected at least 15" in result.stdout
+
338
+
265
 def test_derive_verification_commands_flags_insufficient_pages_for_broad_thorough_guide(
339
 def test_derive_verification_commands_flags_insufficient_pages_for_broad_thorough_guide(
266
     tmp_path: Path,
340
     tmp_path: Path,
267
 ) -> None:
341
 ) -> None: