tenseleyflow/loader / 35c0bcb

Browse files

Block TOC quality inflation

Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA
35c0bcb767b7d5678001e6a51d97bb28675b0316
Parents
7b1c338
Tree
7165e37

4 changed files

StatusFile+-
M src/loader/runtime/finalization.py 4 0
M src/loader/runtime/safeguard_services.py 93 0
M tests/test_finalization.py 2 0
M tests/test_safeguard_services.py 91 0
src/loader/runtime/finalization.pymodified
@@ -1362,6 +1362,10 @@ def _html_quality_repair_strategy_lines() -> list[str]:
13621362
         "exceed the expected text-character floor; if it says insufficient "
13631363
         "structured content, add enough real sections, lists, code, tables, or "
13641364
         "other content blocks to exceed the block floor.",
1365
+        "- Treat structured content as substantive page body material, not table-of-"
1366
+        "contents inflation: do not add duplicate navigation entries, relabel links "
1367
+        "to existing pages as new chapters, or introduce new missing page links just "
1368
+        "to increase block counts.",
13651369
     ]
13661370
 
13671371
 
src/loader/runtime/safeguard_services.pymodified
@@ -800,6 +800,23 @@ class PreActionValidator:
800800
         if not html_link_scope_result.valid:
801801
             return html_link_scope_result
802802
 
803
+        if Path(file_path).expanduser().exists():
804
+            html_index_result = self._validate_html_index_links(
805
+                str(file_path),
806
+                str(content),
807
+            )
808
+            if not html_index_result.valid:
809
+                return html_index_result
810
+
811
+        html_duplicate_root_links_result = (
812
+            self._validate_html_root_duplicate_local_links(
813
+                str(file_path),
814
+                str(content),
815
+            )
816
+        )
817
+        if not html_duplicate_root_links_result.valid:
818
+            return html_duplicate_root_links_result
819
+
803820
         html_declared_target_result = self._validate_html_declared_target_set(
804821
             str(file_path),
805822
             str(content),
@@ -884,6 +901,15 @@ class PreActionValidator:
884901
         if not html_index_result.valid:
885902
             return html_index_result
886903
 
904
+        html_duplicate_root_links_result = (
905
+            self._validate_html_root_duplicate_local_links(
906
+                str(file_path),
907
+                prospective_content,
908
+            )
909
+        )
910
+        if not html_duplicate_root_links_result.valid:
911
+            return html_duplicate_root_links_result
912
+
887913
         html_declared_target_result = self._validate_html_declared_target_set(
888914
             str(file_path),
889915
             prospective_content,
@@ -1292,6 +1318,53 @@ class PreActionValidator:
12921318
 
12931319
         return ValidationResult(valid=True)
12941320
 
1321
+    def _validate_html_root_duplicate_local_links(
1322
+        self,
1323
+        file_path: str,
1324
+        content: str,
1325
+    ) -> ValidationResult:
1326
+        normalized = Path(file_path).expanduser()
1327
+        if normalized.suffix.lower() != ".html" or normalized.name.lower() != "index.html":
1328
+            return ValidationResult(valid=True)
1329
+
1330
+        root = self._resolve_html_artifact_root(normalized)
1331
+        labels_by_target: dict[str, list[str]] = {}
1332
+        for _href, resolved, label in self._collect_local_html_link_labels(
1333
+            normalized,
1334
+            content,
1335
+        ):
1336
+            relative_target = self._relative_html_target(root, resolved)
1337
+            if relative_target is None:
1338
+                continue
1339
+            labels = labels_by_target.setdefault(relative_target, [])
1340
+            normalized_label = " ".join(label.split())
1341
+            if normalized_label and normalized_label not in labels:
1342
+                labels.append(normalized_label)
1343
+
1344
+        conflicting: list[str] = []
1345
+        for target, labels in labels_by_target.items():
1346
+            if len(labels) < 3:
1347
+                continue
1348
+            conflicting.append(f"{target} ({', '.join(labels[:3])})")
1349
+
1350
+        if not conflicting:
1351
+            return ValidationResult(valid=True)
1352
+
1353
+        preview = "; ".join(conflicting[:2])
1354
+        if len(conflicting) > 2:
1355
+            preview += "; ..."
1356
+        return ValidationResult(
1357
+            valid=False,
1358
+            reason="HTML root page repeats one local page as multiple distinct links",
1359
+            suggestion=(
1360
+                "Do not inflate a root index or table of contents by pointing many "
1361
+                "different entries at the same local page. Expand substantive body "
1362
+                "content in the target files, create any new pages before linking them, "
1363
+                f"or keep one accurate entry per local page. Repeated target(s): {preview}"
1364
+            ),
1365
+            severity="error",
1366
+        )
1367
+
12951368
     def _prospective_edit_content(
12961369
         self,
12971370
         file_path: str,
@@ -1493,6 +1566,26 @@ class PreActionValidator:
14931566
             targets.append((href, resolved))
14941567
         return targets
14951568
 
1569
+    def _collect_local_html_link_labels(
1570
+        self,
1571
+        file_path: Path,
1572
+        content: str,
1573
+    ) -> list[tuple[str, Path, str]]:
1574
+        pattern = re.compile(
1575
+            r"<a\b[^>]*href\s*=\s*[\"']([^\"']+)[\"'][^>]*>(.*?)</a>",
1576
+            re.IGNORECASE | re.DOTALL,
1577
+        )
1578
+        targets: list[tuple[str, Path, str]] = []
1579
+        for href, raw_label in pattern.findall(content):
1580
+            target_text = self._strip_local_href_target(href)
1581
+            if target_text is None or not self._is_local_html_link_target(target_text):
1582
+                continue
1583
+            resolved = (file_path.parent / target_text).resolve(strict=False)
1584
+            label = re.sub(r"<[^>]+>", " ", raw_label)
1585
+            label = re.sub(r"\s+", " ", label).strip()
1586
+            targets.append((href, resolved, label))
1587
+        return targets
1588
+
14961589
     def _collect_local_href_targets(
14971590
         self,
14981591
         file_path: Path,
tests/test_finalization.pymodified
@@ -489,6 +489,8 @@ def test_verification_repair_guidance_keeps_multi_file_quality_worklist(
489489
     assert f"Improve `{chapter_paths[0]}`: thin content" in guidance
490490
     assert f"Improve `{chapter_paths[-1]}`: thin content" in guidance
491491
     assert "add enough concrete prose" in guidance
492
+    assert "not table-of-contents inflation" in guidance
493
+    assert "do not add duplicate navigation entries" in guidance
492494
     assert "do not stop after touching only the first file" in guidance
493495
     assert repair is not None
494496
     assert repair.artifact_path == str(chapter_paths[0].resolve(strict=False))
tests/test_safeguard_services.pymodified
@@ -505,6 +505,97 @@ def test_pre_action_validator_allows_new_root_index_to_seed_child_html_links(
505505
     assert result.valid is True
506506
 
507507
 
508
+def test_pre_action_validator_blocks_existing_root_write_with_far_missing_links(
509
+    tmp_path: Path,
510
+) -> None:
511
+    validator = PreActionValidator()
512
+    guide = tmp_path / "guides" / "nginx"
513
+    chapters = guide / "chapters"
514
+    chapters.mkdir(parents=True)
515
+    for index, name in enumerate(
516
+        [
517
+            "introduction",
518
+            "installation",
519
+            "configuration",
520
+            "virtual-hosts",
521
+            "reverse-proxy",
522
+            "load-balancing",
523
+            "security",
524
+        ],
525
+        start=1,
526
+    ):
527
+        (chapters / f"{index:02d}-{name}.html").write_text("<html></html>\n")
528
+    index = guide / "index.html"
529
+    index.write_text(
530
+        "\n".join(
531
+            f'<li><a href="chapters/{path.name}">Chapter {number}</a></li>'
532
+            for number, path in enumerate(sorted(chapters.glob("*.html")), start=1)
533
+        )
534
+    )
535
+
536
+    result = validator.validate(
537
+        "write",
538
+        {
539
+            "file_path": str(index),
540
+            "content": (
541
+                index.read_text()
542
+                + "\n"
543
+                '<li><a href="chapters/08-performance.html">Chapter 8</a></li>\n'
544
+                '<li><a href="chapters/09-ssl.html">Chapter 9</a></li>\n'
545
+                '<li><a href="chapters/10-monitoring.html">Chapter 10</a></li>\n'
546
+            ),
547
+        },
548
+    )
549
+
550
+    assert result.valid is False
551
+    assert result.reason == "Edited HTML links point to files that do not exist"
552
+    assert "chapters/09-ssl.html" in result.suggestion
553
+
554
+
555
+def test_pre_action_validator_blocks_toc_inflation_with_duplicate_page_links(
556
+    tmp_path: Path,
557
+) -> None:
558
+    validator = PreActionValidator()
559
+    guide = tmp_path / "guides" / "nginx"
560
+    chapters = guide / "chapters"
561
+    chapters.mkdir(parents=True)
562
+    security = chapters / "07-security.html"
563
+    security.write_text("<html></html>\n")
564
+    index = guide / "index.html"
565
+    index.write_text(
566
+        "\n".join(
567
+            [
568
+                "<ul>",
569
+                '<li><a href="chapters/07-security.html">Chapter 7: Security</a></li>',
570
+                "</ul>",
571
+                "",
572
+            ]
573
+        )
574
+    )
575
+
576
+    result = validator.validate(
577
+        "edit",
578
+        {
579
+            "file_path": str(index),
580
+            "old_string": "</ul>",
581
+            "new_string": (
582
+                '<li><a href="chapters/07-security.html">Chapter 8: Performance</a></li>\n'
583
+                '<li><a href="chapters/07-security.html">Chapter 9: SSL</a></li>\n'
584
+                '<li><a href="chapters/07-security.html">Chapter 10: Monitoring</a></li>\n'
585
+                "</ul>"
586
+            ),
587
+        },
588
+    )
589
+
590
+    assert result.valid is False
591
+    assert (
592
+        result.reason
593
+        == "HTML root page repeats one local page as multiple distinct links"
594
+    )
595
+    assert "Do not inflate a root index or table of contents" in result.suggestion
596
+    assert "chapters/07-security.html" in result.suggestion
597
+
598
+
508599
 def test_pre_action_validator_blocks_shell_text_rewrite_for_html_target() -> None:
509600
     validator = PreActionValidator()
510601