Block TOC quality inflation
Authored by
mfwolffe <wolffemf@dukes.jmu.edu>
- SHA
35c0bcb767b7d5678001e6a51d97bb28675b0316- Parents
-
7b1c338 - Tree
7165e37
35c0bcb
35c0bcb767b7d5678001e6a51d97bb28675b03167b1c338
7165e37| Status | File | + | - |
|---|---|---|---|
| M |
src/loader/runtime/finalization.py
|
4 | 0 |
| M |
src/loader/runtime/safeguard_services.py
|
93 | 0 |
| M |
tests/test_finalization.py
|
2 | 0 |
| M |
tests/test_safeguard_services.py
|
91 | 0 |
src/loader/runtime/finalization.pymodified@@ -1362,6 +1362,10 @@ def _html_quality_repair_strategy_lines() -> list[str]: | ||
| 1362 | 1362 | "exceed the expected text-character floor; if it says insufficient " |
| 1363 | 1363 | "structured content, add enough real sections, lists, code, tables, or " |
| 1364 | 1364 | "other content blocks to exceed the block floor.", |
| 1365 | + "- Treat structured content as substantive page body material, not table-of-" | |
| 1366 | + "contents inflation: do not add duplicate navigation entries, relabel links " | |
| 1367 | + "to existing pages as new chapters, or introduce new missing page links just " | |
| 1368 | + "to increase block counts.", | |
| 1365 | 1369 | ] |
| 1366 | 1370 | |
| 1367 | 1371 | |
src/loader/runtime/safeguard_services.pymodified@@ -800,6 +800,23 @@ class PreActionValidator: | ||
| 800 | 800 | if not html_link_scope_result.valid: |
| 801 | 801 | return html_link_scope_result |
| 802 | 802 | |
| 803 | + if Path(file_path).expanduser().exists(): | |
| 804 | + html_index_result = self._validate_html_index_links( | |
| 805 | + str(file_path), | |
| 806 | + str(content), | |
| 807 | + ) | |
| 808 | + if not html_index_result.valid: | |
| 809 | + return html_index_result | |
| 810 | + | |
| 811 | + html_duplicate_root_links_result = ( | |
| 812 | + self._validate_html_root_duplicate_local_links( | |
| 813 | + str(file_path), | |
| 814 | + str(content), | |
| 815 | + ) | |
| 816 | + ) | |
| 817 | + if not html_duplicate_root_links_result.valid: | |
| 818 | + return html_duplicate_root_links_result | |
| 819 | + | |
| 803 | 820 | html_declared_target_result = self._validate_html_declared_target_set( |
| 804 | 821 | str(file_path), |
| 805 | 822 | str(content), |
@@ -884,6 +901,15 @@ class PreActionValidator: | ||
| 884 | 901 | if not html_index_result.valid: |
| 885 | 902 | return html_index_result |
| 886 | 903 | |
| 904 | + html_duplicate_root_links_result = ( | |
| 905 | + self._validate_html_root_duplicate_local_links( | |
| 906 | + str(file_path), | |
| 907 | + prospective_content, | |
| 908 | + ) | |
| 909 | + ) | |
| 910 | + if not html_duplicate_root_links_result.valid: | |
| 911 | + return html_duplicate_root_links_result | |
| 912 | + | |
| 887 | 913 | html_declared_target_result = self._validate_html_declared_target_set( |
| 888 | 914 | str(file_path), |
| 889 | 915 | prospective_content, |
@@ -1292,6 +1318,53 @@ class PreActionValidator: | ||
| 1292 | 1318 | |
| 1293 | 1319 | return ValidationResult(valid=True) |
| 1294 | 1320 | |
| 1321 | + def _validate_html_root_duplicate_local_links( | |
| 1322 | + self, | |
| 1323 | + file_path: str, | |
| 1324 | + content: str, | |
| 1325 | + ) -> ValidationResult: | |
| 1326 | + normalized = Path(file_path).expanduser() | |
| 1327 | + if normalized.suffix.lower() != ".html" or normalized.name.lower() != "index.html": | |
| 1328 | + return ValidationResult(valid=True) | |
| 1329 | + | |
| 1330 | + root = self._resolve_html_artifact_root(normalized) | |
| 1331 | + labels_by_target: dict[str, list[str]] = {} | |
| 1332 | + for _href, resolved, label in self._collect_local_html_link_labels( | |
| 1333 | + normalized, | |
| 1334 | + content, | |
| 1335 | + ): | |
| 1336 | + relative_target = self._relative_html_target(root, resolved) | |
| 1337 | + if relative_target is None: | |
| 1338 | + continue | |
| 1339 | + labels = labels_by_target.setdefault(relative_target, []) | |
| 1340 | + normalized_label = " ".join(label.split()) | |
| 1341 | + if normalized_label and normalized_label not in labels: | |
| 1342 | + labels.append(normalized_label) | |
| 1343 | + | |
| 1344 | + conflicting: list[str] = [] | |
| 1345 | + for target, labels in labels_by_target.items(): | |
| 1346 | + if len(labels) < 3: | |
| 1347 | + continue | |
| 1348 | + conflicting.append(f"{target} ({', '.join(labels[:3])})") | |
| 1349 | + | |
| 1350 | + if not conflicting: | |
| 1351 | + return ValidationResult(valid=True) | |
| 1352 | + | |
| 1353 | + preview = "; ".join(conflicting[:2]) | |
| 1354 | + if len(conflicting) > 2: | |
| 1355 | + preview += "; ..." | |
| 1356 | + return ValidationResult( | |
| 1357 | + valid=False, | |
| 1358 | + reason="HTML root page repeats one local page as multiple distinct links", | |
| 1359 | + suggestion=( | |
| 1360 | + "Do not inflate a root index or table of contents by pointing many " | |
| 1361 | + "different entries at the same local page. Expand substantive body " | |
| 1362 | + "content in the target files, create any new pages before linking them, " | |
| 1363 | + f"or keep one accurate entry per local page. Repeated target(s): {preview}" | |
| 1364 | + ), | |
| 1365 | + severity="error", | |
| 1366 | + ) | |
| 1367 | + | |
| 1295 | 1368 | def _prospective_edit_content( |
| 1296 | 1369 | self, |
| 1297 | 1370 | file_path: str, |
@@ -1493,6 +1566,26 @@ class PreActionValidator: | ||
| 1493 | 1566 | targets.append((href, resolved)) |
| 1494 | 1567 | return targets |
| 1495 | 1568 | |
| 1569 | + def _collect_local_html_link_labels( | |
| 1570 | + self, | |
| 1571 | + file_path: Path, | |
| 1572 | + content: str, | |
| 1573 | + ) -> list[tuple[str, Path, str]]: | |
| 1574 | + pattern = re.compile( | |
| 1575 | + r"<a\b[^>]*href\s*=\s*[\"']([^\"']+)[\"'][^>]*>(.*?)</a>", | |
| 1576 | + re.IGNORECASE | re.DOTALL, | |
| 1577 | + ) | |
| 1578 | + targets: list[tuple[str, Path, str]] = [] | |
| 1579 | + for href, raw_label in pattern.findall(content): | |
| 1580 | + target_text = self._strip_local_href_target(href) | |
| 1581 | + if target_text is None or not self._is_local_html_link_target(target_text): | |
| 1582 | + continue | |
| 1583 | + resolved = (file_path.parent / target_text).resolve(strict=False) | |
| 1584 | + label = re.sub(r"<[^>]+>", " ", raw_label) | |
| 1585 | + label = re.sub(r"\s+", " ", label).strip() | |
| 1586 | + targets.append((href, resolved, label)) | |
| 1587 | + return targets | |
| 1588 | + | |
| 1496 | 1589 | def _collect_local_href_targets( |
| 1497 | 1590 | self, |
| 1498 | 1591 | file_path: Path, |
tests/test_finalization.pymodified@@ -489,6 +489,8 @@ def test_verification_repair_guidance_keeps_multi_file_quality_worklist( | ||
| 489 | 489 | assert f"Improve `{chapter_paths[0]}`: thin content" in guidance |
| 490 | 490 | assert f"Improve `{chapter_paths[-1]}`: thin content" in guidance |
| 491 | 491 | assert "add enough concrete prose" in guidance |
| 492 | + assert "not table-of-contents inflation" in guidance | |
| 493 | + assert "do not add duplicate navigation entries" in guidance | |
| 492 | 494 | assert "do not stop after touching only the first file" in guidance |
| 493 | 495 | assert repair is not None |
| 494 | 496 | assert repair.artifact_path == str(chapter_paths[0].resolve(strict=False)) |
tests/test_safeguard_services.pymodified@@ -505,6 +505,97 @@ def test_pre_action_validator_allows_new_root_index_to_seed_child_html_links( | ||
| 505 | 505 | assert result.valid is True |
| 506 | 506 | |
| 507 | 507 | |
| 508 | +def test_pre_action_validator_blocks_existing_root_write_with_far_missing_links( | |
| 509 | + tmp_path: Path, | |
| 510 | +) -> None: | |
| 511 | + validator = PreActionValidator() | |
| 512 | + guide = tmp_path / "guides" / "nginx" | |
| 513 | + chapters = guide / "chapters" | |
| 514 | + chapters.mkdir(parents=True) | |
| 515 | + for index, name in enumerate( | |
| 516 | + [ | |
| 517 | + "introduction", | |
| 518 | + "installation", | |
| 519 | + "configuration", | |
| 520 | + "virtual-hosts", | |
| 521 | + "reverse-proxy", | |
| 522 | + "load-balancing", | |
| 523 | + "security", | |
| 524 | + ], | |
| 525 | + start=1, | |
| 526 | + ): | |
| 527 | + (chapters / f"{index:02d}-{name}.html").write_text("<html></html>\n") | |
| 528 | + index = guide / "index.html" | |
| 529 | + index.write_text( | |
| 530 | + "\n".join( | |
| 531 | + f'<li><a href="chapters/{path.name}">Chapter {number}</a></li>' | |
| 532 | + for number, path in enumerate(sorted(chapters.glob("*.html")), start=1) | |
| 533 | + ) | |
| 534 | + ) | |
| 535 | + | |
| 536 | + result = validator.validate( | |
| 537 | + "write", | |
| 538 | + { | |
| 539 | + "file_path": str(index), | |
| 540 | + "content": ( | |
| 541 | + index.read_text() | |
| 542 | + + "\n" | |
| 543 | + '<li><a href="chapters/08-performance.html">Chapter 8</a></li>\n' | |
| 544 | + '<li><a href="chapters/09-ssl.html">Chapter 9</a></li>\n' | |
| 545 | + '<li><a href="chapters/10-monitoring.html">Chapter 10</a></li>\n' | |
| 546 | + ), | |
| 547 | + }, | |
| 548 | + ) | |
| 549 | + | |
| 550 | + assert result.valid is False | |
| 551 | + assert result.reason == "Edited HTML links point to files that do not exist" | |
| 552 | + assert "chapters/09-ssl.html" in result.suggestion | |
| 553 | + | |
| 554 | + | |
| 555 | +def test_pre_action_validator_blocks_toc_inflation_with_duplicate_page_links( | |
| 556 | + tmp_path: Path, | |
| 557 | +) -> None: | |
| 558 | + validator = PreActionValidator() | |
| 559 | + guide = tmp_path / "guides" / "nginx" | |
| 560 | + chapters = guide / "chapters" | |
| 561 | + chapters.mkdir(parents=True) | |
| 562 | + security = chapters / "07-security.html" | |
| 563 | + security.write_text("<html></html>\n") | |
| 564 | + index = guide / "index.html" | |
| 565 | + index.write_text( | |
| 566 | + "\n".join( | |
| 567 | + [ | |
| 568 | + "<ul>", | |
| 569 | + '<li><a href="chapters/07-security.html">Chapter 7: Security</a></li>', | |
| 570 | + "</ul>", | |
| 571 | + "", | |
| 572 | + ] | |
| 573 | + ) | |
| 574 | + ) | |
| 575 | + | |
| 576 | + result = validator.validate( | |
| 577 | + "edit", | |
| 578 | + { | |
| 579 | + "file_path": str(index), | |
| 580 | + "old_string": "</ul>", | |
| 581 | + "new_string": ( | |
| 582 | + '<li><a href="chapters/07-security.html">Chapter 8: Performance</a></li>\n' | |
| 583 | + '<li><a href="chapters/07-security.html">Chapter 9: SSL</a></li>\n' | |
| 584 | + '<li><a href="chapters/07-security.html">Chapter 10: Monitoring</a></li>\n' | |
| 585 | + "</ul>" | |
| 586 | + ), | |
| 587 | + }, | |
| 588 | + ) | |
| 589 | + | |
| 590 | + assert result.valid is False | |
| 591 | + assert ( | |
| 592 | + result.reason | |
| 593 | + == "HTML root page repeats one local page as multiple distinct links" | |
| 594 | + ) | |
| 595 | + assert "Do not inflate a root index or table of contents" in result.suggestion | |
| 596 | + assert "chapters/07-security.html" in result.suggestion | |
| 597 | + | |
| 598 | + | |
| 508 | 599 | def test_pre_action_validator_blocks_shell_text_rewrite_for_html_target() -> None: |
| 509 | 600 | validator = PreActionValidator() |
| 510 | 601 | |