tenseleyflow/loader / b3d784b

Browse files

Guard HTML repair structure

Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA
b3d784b5f83a1b941d56f76074ed1bf27c76524e
Parents
515ed7c
Tree
188e229

2 changed files

Status | File | + | -
M src/loader/runtime/safeguard_services.py 154 1
M tests/test_safeguard_services.py 82 0
src/loader/runtime/safeguard_services.py (modified)
@@ -9,7 +9,12 @@ from dataclasses import dataclass
9
 from difflib import get_close_matches
9
 from difflib import get_close_matches
10
 from pathlib import Path
10
 from pathlib import Path
11
 
11
 
12
-from ..tools.fs_safety import coerce_structured_patch_payload
12
+from ..tools.fs_safety import (
13
+    StructuredPatchHunk,
14
+    apply_structured_patch,
15
+    coerce_structured_patch_payload,
16
+    parse_unified_diff_patch,
17
+)
13
 
18
 
14
 TEXT_REWRITE_SUFFIXES = frozenset(
19
 TEXT_REWRITE_SUFFIXES = frozenset(
15
     {
20
     {
@@ -50,6 +55,20 @@ def _html_target_tokens(target: str) -> set[str]:
50
     return {token for token in re.split(r"[^a-z0-9]+", stem) if token}
55
     return {token for token in re.split(r"[^a-z0-9]+", stem) if token}
51
 
56
 
52
 
57
 
58
+_HTML_OPEN_RE = re.compile(r"<html\b", re.IGNORECASE)
59
+_HTML_CLOSE_RE = re.compile(r"</html\s*>", re.IGNORECASE)
60
+_BODY_OPEN_RE = re.compile(r"<body\b", re.IGNORECASE)
61
+_BODY_CLOSE_RE = re.compile(r"</body\s*>", re.IGNORECASE)
62
+
63
+
64
+def _html_text_looks_like_document(text: str) -> bool:
65
+    lowered = str(text or "").lower()
66
+    return any(
67
+        marker in lowered
68
+        for marker in ("<!doctype", "<html", "</html", "<body", "</body")
69
+    )
70
+
71
+
53
 def _ordered_html_target_number(target: str) -> int | None:
72
 def _ordered_html_target_number(target: str) -> int | None:
54
     match = re.match(r"(\d+)[-_]", Path(target).name)
73
     match = re.match(r"(\d+)[-_]", Path(target).name)
55
     if match is None:
74
     if match is None:
@@ -786,6 +805,13 @@ class PreActionValidator:
786
         if not html_placeholder_result.valid:
805
         if not html_placeholder_result.valid:
787
             return html_placeholder_result
806
             return html_placeholder_result
788
 
807
 
808
+        html_document_result = self._validate_html_document_integrity_content(
809
+            str(file_path),
810
+            str(content),
811
+        )
812
+        if not html_document_result.valid:
813
+            return html_document_result
814
+
789
         sensitive_paths = ['/etc/', '/usr/', '/bin/', '/sbin/', '/boot/', '/sys/', '/proc/']
815
         sensitive_paths = ['/etc/', '/usr/', '/bin/', '/sbin/', '/boot/', '/sys/', '/proc/']
790
         for sensitive in sensitive_paths:
816
         for sensitive in sensitive_paths:
791
             if file_path.startswith(sensitive):
817
             if file_path.startswith(sensitive):
@@ -897,6 +923,13 @@ class PreActionValidator:
897
         if not html_placeholder_result.valid:
923
         if not html_placeholder_result.valid:
898
             return html_placeholder_result
924
             return html_placeholder_result
899
 
925
 
926
+        html_document_result = self._validate_html_document_integrity_content(
927
+            str(file_path),
928
+            prospective_content,
929
+        )
930
+        if not html_document_result.valid:
931
+            return html_document_result
932
+
900
         html_index_result = self._validate_html_index_links(
933
         html_index_result = self._validate_html_index_links(
901
             str(file_path),
934
             str(file_path),
902
             prospective_content,
935
             prospective_content,
@@ -975,6 +1008,19 @@ class PreActionValidator:
975
         if not html_placeholder_result.valid:
1008
         if not html_placeholder_result.valid:
976
             return html_placeholder_result
1009
             return html_placeholder_result
977
 
1010
 
1011
+        prospective_content = self._prospective_patch_content(
1012
+            str(file_path),
1013
+            structured_hunks,
1014
+            raw_patch,
1015
+        )
1016
+        if prospective_content is not None:
1017
+            html_document_result = self._validate_html_document_integrity_content(
1018
+                str(file_path),
1019
+                prospective_content,
1020
+            )
1021
+            if not html_document_result.valid:
1022
+                return html_document_result
1023
+
978
         return ValidationResult(valid=True)
1024
         return ValidationResult(valid=True)
979
 
1025
 
980
     def _validate_html_placeholder_content(
1026
     def _validate_html_placeholder_content(
@@ -1045,6 +1091,76 @@ class PreActionValidator:
1045
             "\n".join(added_fragments),
1091
             "\n".join(added_fragments),
1046
         )
1092
         )
1047
 
1093
 
1094
+    def _validate_html_document_integrity_content(
1095
+        self,
1096
+        file_path: str,
1097
+        content: str,
1098
+    ) -> ValidationResult:
1099
+        """Reject HTML mutations that damage an existing document envelope."""
1100
+
1101
+        normalized = Path(file_path).expanduser()
1102
+        if normalized.suffix.lower() not in {".html", ".htm"}:
1103
+            return ValidationResult(valid=True)
1104
+
1105
+        text = str(content or "")
1106
+        if not _html_text_looks_like_document(text):
1107
+            return ValidationResult(valid=True)
1108
+
1109
+        html_open = list(_HTML_OPEN_RE.finditer(text))
1110
+        html_close = list(_HTML_CLOSE_RE.finditer(text))
1111
+        body_open = list(_BODY_OPEN_RE.finditer(text))
1112
+        body_close = list(_BODY_CLOSE_RE.finditer(text))
1113
+        issues: list[str] = []
1114
+
1115
+        if html_close or html_open:
1116
+            if len(html_open) != 1:
1117
+                issues.append(
1118
+                    f"expected exactly one opening <html> tag (found {len(html_open)})"
1119
+                )
1120
+            if len(html_close) != 1:
1121
+                issues.append(
1122
+                    f"expected exactly one closing </html> tag (found {len(html_close)})"
1123
+                )
1124
+        if body_close or body_open:
1125
+            if len(body_open) != 1:
1126
+                issues.append(
1127
+                    f"expected exactly one opening <body> tag (found {len(body_open)})"
1128
+                )
1129
+            if len(body_close) != 1:
1130
+                issues.append(
1131
+                    f"expected exactly one closing </body> tag (found {len(body_close)})"
1132
+                )
1133
+
1134
+        if html_open and html_close and html_open[0].start() > html_close[0].start():
1135
+            issues.append("opening <html> appears after closing </html>")
1136
+        if body_open and body_close and body_open[0].start() > body_close[0].start():
1137
+            issues.append("opening <body> appears after closing </body>")
1138
+        if html_open and body_open and body_open[0].start() < html_open[0].start():
1139
+            issues.append("opening <body> appears before opening <html>")
1140
+        if body_close and html_close and body_close[-1].start() > html_close[-1].start():
1141
+            issues.append("closing </body> appears after closing </html>")
1142
+        if html_close and text[html_close[-1].end() :].strip():
1143
+            issues.append("content appears after closing </html>")
1144
+
1145
+        if not issues:
1146
+            return ValidationResult(valid=True)
1147
+
1148
+        preview = "; ".join(issues[:3])
1149
+        if len(issues) > 3:
1150
+            preview += "; ..."
1151
+        return ValidationResult(
1152
+            valid=False,
1153
+            reason="HTML document structure would be invalid",
1154
+            suggestion=(
1155
+                f"{preview}. Keep the existing closing document tail intact. For "
1156
+                "content expansion, insert substantive body sections before the current "
1157
+                "`</body></html>` tail; for malformed files, write one complete HTML "
1158
+                "document with exactly one `<html>`, `<body>`, `</body>`, and `</html>` "
1159
+                "and no content after `</html>`."
1160
+            ),
1161
+            severity="error",
1162
+        )
1163
+
1048
     def _validate_html_write_local_link_scope(
1164
     def _validate_html_write_local_link_scope(
1049
         self,
1165
         self,
1050
         file_path: str,
1166
         file_path: str,
@@ -1388,6 +1504,43 @@ class PreActionValidator:
1388
             return new_string
1504
             return new_string
1389
         return current.replace(old_string, new_string, 1)
1505
         return current.replace(old_string, new_string, 1)
1390
 
1506
 
1507
+    def _prospective_patch_content(
1508
+        self,
1509
+        file_path: str,
1510
+        hunks: list[dict[str, object] | StructuredPatchHunk],
1511
+        raw_patch: object,
1512
+    ) -> str | None:
1513
+        normalized = Path(file_path).expanduser()
1514
+        if normalized.suffix.lower() not in {".html", ".htm"}:
1515
+            return None
1516
+        try:
1517
+            original_content = normalized.read_text()
1518
+        except (OSError, UnicodeDecodeError):
1519
+            return None
1520
+
1521
+        try:
1522
+            parsed_hunks: list[StructuredPatchHunk]
1523
+            if hunks:
1524
+                original_lines = original_content.splitlines()
1525
+                parsed_hunks = [
1526
+                    hunk
1527
+                    if isinstance(hunk, StructuredPatchHunk)
1528
+                    else StructuredPatchHunk.from_dict_with_original(
1529
+                        hunk,
1530
+                        original_lines=original_lines,
1531
+                    )
1532
+                    for hunk in hunks
1533
+                ]
1534
+            elif isinstance(raw_patch, str) and raw_patch.strip():
1535
+                parsed_hunks = parse_unified_diff_patch(raw_patch)
1536
+            else:
1537
+                return None
1538
+            if not parsed_hunks:
1539
+                return None
1540
+            return apply_structured_patch(original_content, parsed_hunks)
1541
+        except ValueError:
1542
+            return None
1543
+
1391
     def _allows_root_html_graph_seed(
1544
     def _allows_root_html_graph_seed(
1392
         self,
1545
         self,
1393
         file_path: str,
1546
         file_path: str,
tests/test_safeguard_services.py (modified)
@@ -491,6 +491,88 @@ def test_pre_action_validator_blocks_placeholder_html_patch(tmp_path: Path) -> N
491
     assert "coming soon" in result.suggestion
491
     assert "coming soon" in result.suggestion
492
 
492
 
493
 
493
 
494
def test_pre_action_validator_blocks_html_patch_that_removes_closing_body(
    tmp_path: Path,
) -> None:
    """A structured patch that deletes the only ``</body>`` line is rejected."""
    validator = PreActionValidator()
    chapter_dir = tmp_path / "guide" / "chapters"
    chapter_dir.mkdir(parents=True)
    target = chapter_dir / "02-installation.html"
    target.write_text("<html>\n<body>\n<h1>Installation</h1>\n</body>\n</html>\n")

    # Line 4 of the fixture is the closing body tag.
    drop_closing_body = {
        "old_start": 4,
        "old_lines": 1,
        "new_start": 4,
        "new_lines": 0,
        "lines": ["-</body>"],
    }
    payload = {"file_path": str(target), "hunks": [drop_closing_body]}

    outcome = validator.validate("patch", payload)

    assert outcome.valid is False
    assert outcome.reason == "HTML document structure would be invalid"
    assert "expected exactly one closing </body> tag (found 0)" in outcome.suggestion
    expected_hint = (
        "insert substantive body sections before the current `</body></html>` tail"
    )
    assert expected_hint in outcome.suggestion
+
526
+
527
def test_pre_action_validator_blocks_html_edit_that_wraps_tail_in_new_body(
    tmp_path: Path,
) -> None:
    """An edit that introduces a second ``<body>`` opener is rejected."""
    validator = PreActionValidator()
    chapter_dir = tmp_path / "guide" / "chapters"
    chapter_dir.mkdir(parents=True)
    target = chapter_dir / "02-installation.html"
    target.write_text("<html><body><h1>Installation</h1></body></html>")

    # The replacement repeats the body opener in front of the document tail.
    replacement = (
        "<body><h1>Installation</h1>"
        "<p>Short replacement fragment.</p></body></html>"
    )
    payload = {
        "file_path": str(target),
        "old_string": "</body></html>",
        "new_string": replacement,
    }

    outcome = validator.validate("edit", payload)

    assert outcome.valid is False
    assert outcome.reason == "HTML document structure would be invalid"
    assert "expected exactly one opening <body> tag (found 2)" in outcome.suggestion
+
551
+
552
def test_pre_action_validator_allows_html_edit_inserted_before_closing_tail(
    tmp_path: Path,
) -> None:
    """An edit that adds a section ahead of the intact tail is accepted."""
    validator = PreActionValidator()
    chapter_dir = tmp_path / "guide" / "chapters"
    chapter_dir.mkdir(parents=True)
    target = chapter_dir / "02-installation.html"
    target.write_text("<html><body><h1>Installation</h1></body></html>")

    # New content goes in front of the unchanged ``</body></html>`` tail.
    replacement = (
        "<section><h2>Package Managers</h2>"
        "<p>Install Nginx with the package manager for your platform.</p>"
        "</section></body></html>"
    )
    payload = {
        "file_path": str(target),
        "old_string": "</body></html>",
        "new_string": replacement,
    }

    outcome = validator.validate("edit", payload)

    assert outcome.valid is True
+
575
+
494
 def test_pre_action_validator_blocks_missing_local_html_asset_href(tmp_path: Path) -> None:
576
 def test_pre_action_validator_blocks_missing_local_html_asset_href(tmp_path: Path) -> None:
495
     validator = PreActionValidator()
577
     validator = PreActionValidator()
496
     page = tmp_path / "guide" / "chapters" / "07-performance.html"
578
     page = tmp_path / "guide" / "chapters" / "07-performance.html"