@@ -9,7 +9,12 @@ from dataclasses import dataclass |
| 9 | 9 | from difflib import get_close_matches |
| 10 | 10 | from pathlib import Path |
| 11 | 11 | |
| 12 | | -from ..tools.fs_safety import coerce_structured_patch_payload |
| 12 | +from ..tools.fs_safety import ( |
| 13 | + StructuredPatchHunk, |
| 14 | + apply_structured_patch, |
| 15 | + coerce_structured_patch_payload, |
| 16 | + parse_unified_diff_patch, |
| 17 | +) |
| 13 | 18 | |
| 14 | 19 | TEXT_REWRITE_SUFFIXES = frozenset( |
| 15 | 20 | { |
@@ -50,6 +55,20 @@ def _html_target_tokens(target: str) -> set[str]: |
| 50 | 55 | return {token for token in re.split(r"[^a-z0-9]+", stem) if token} |
| 51 | 56 | |
| 52 | 57 | |
| 58 | +_HTML_OPEN_RE = re.compile(r"<html\b", re.IGNORECASE) |
| 59 | +_HTML_CLOSE_RE = re.compile(r"</html\s*>", re.IGNORECASE) |
| 60 | +_BODY_OPEN_RE = re.compile(r"<body\b", re.IGNORECASE) |
| 61 | +_BODY_CLOSE_RE = re.compile(r"</body\s*>", re.IGNORECASE) |
| 62 | + |
| 63 | + |
| 64 | +def _html_text_looks_like_document(text: str) -> bool: |
| 65 | + lowered = str(text or "").lower() |
| 66 | + return any( |
| 67 | + marker in lowered |
| 68 | + for marker in ("<!doctype", "<html", "</html", "<body", "</body") |
| 69 | + ) |
| 70 | + |
| 71 | + |
| 53 | 72 | def _ordered_html_target_number(target: str) -> int | None: |
| 54 | 73 | match = re.match(r"(\d+)[-_]", Path(target).name) |
| 55 | 74 | if match is None: |
@@ -786,6 +805,13 @@ class PreActionValidator: |
| 786 | 805 | if not html_placeholder_result.valid: |
| 787 | 806 | return html_placeholder_result |
| 788 | 807 | |
| 808 | + html_document_result = self._validate_html_document_integrity_content( |
| 809 | + str(file_path), |
| 810 | + str(content), |
| 811 | + ) |
| 812 | + if not html_document_result.valid: |
| 813 | + return html_document_result |
| 814 | + |
| 789 | 815 | sensitive_paths = ['/etc/', '/usr/', '/bin/', '/sbin/', '/boot/', '/sys/', '/proc/'] |
| 790 | 816 | for sensitive in sensitive_paths: |
| 791 | 817 | if file_path.startswith(sensitive): |
@@ -897,6 +923,13 @@ class PreActionValidator: |
| 897 | 923 | if not html_placeholder_result.valid: |
| 898 | 924 | return html_placeholder_result |
| 899 | 925 | |
| 926 | + html_document_result = self._validate_html_document_integrity_content( |
| 927 | + str(file_path), |
| 928 | + prospective_content, |
| 929 | + ) |
| 930 | + if not html_document_result.valid: |
| 931 | + return html_document_result |
| 932 | + |
| 900 | 933 | html_index_result = self._validate_html_index_links( |
| 901 | 934 | str(file_path), |
| 902 | 935 | prospective_content, |
@@ -975,6 +1008,19 @@ class PreActionValidator: |
| 975 | 1008 | if not html_placeholder_result.valid: |
| 976 | 1009 | return html_placeholder_result |
| 977 | 1010 | |
| 1011 | + prospective_content = self._prospective_patch_content( |
| 1012 | + str(file_path), |
| 1013 | + structured_hunks, |
| 1014 | + raw_patch, |
| 1015 | + ) |
| 1016 | + if prospective_content is not None: |
| 1017 | + html_document_result = self._validate_html_document_integrity_content( |
| 1018 | + str(file_path), |
| 1019 | + prospective_content, |
| 1020 | + ) |
| 1021 | + if not html_document_result.valid: |
| 1022 | + return html_document_result |
| 1023 | + |
| 978 | 1024 | return ValidationResult(valid=True) |
| 979 | 1025 | |
| 980 | 1026 | def _validate_html_placeholder_content( |
@@ -1045,6 +1091,76 @@ class PreActionValidator: |
| 1045 | 1091 | "\n".join(added_fragments), |
| 1046 | 1092 | ) |
| 1047 | 1093 | |
def _validate_html_document_integrity_content(
    self,
    file_path: str,
    content: str,
) -> ValidationResult:
    """Reject HTML mutations that damage an existing document envelope.

    The check applies only to ``.html``/``.htm`` targets whose content
    contains at least one document-level marker (doctype, ``html`` or
    ``body`` tag); anything else is accepted unchanged.

    Tag tokens that appear inside HTML comments (including IE conditional
    comments) or inside ``<script>``/``<style>``/``<textarea>`` bodies are
    not markup, so they are ignored when counting and ordering the
    envelope tags.

    Args:
        file_path: Target path; only its suffix is inspected.
        content: The full prospective file content after the mutation.

    Returns:
        A valid result when the envelope is intact, otherwise an
        error-severity result whose suggestion previews up to three issues.
    """

    normalized = Path(file_path).expanduser()
    if normalized.suffix.lower() not in {".html", ".htm"}:
        return ValidationResult(valid=True)

    text = str(content or "")
    if not _html_text_looks_like_document(text):
        return ValidationResult(valid=True)

    def _blank_inert_span(match: "re.Match[str]") -> str:
        # Replace with same-length whitespace so every offset used by the
        # ordering and trailing-content checks below stays valid.
        if match.group("open") is None:
            # Whole comment is inert.
            return " " * len(match.group(0))
        # Keep the element's own tags; only its raw-text body is inert.
        return (
            match.group("open")
            + " " * len(match.group("inner"))
            + match.group("close")
        )

    # A single left-to-right scan so that whichever construct starts first
    # wins (a "<!--" inside a script string must not open a comment).
    scan_text = re.sub(
        r"<!--.*?-->"
        r"|(?P<open><(?P<tag>script|style|textarea)\b[^>]*>)"
        r"(?P<inner>.*?)"
        r"(?P<close></(?P=tag)\s*>)",
        _blank_inert_span,
        text,
        flags=re.IGNORECASE | re.DOTALL,
    )

    html_open = list(_HTML_OPEN_RE.finditer(scan_text))
    html_close = list(_HTML_CLOSE_RE.finditer(scan_text))
    body_open = list(_BODY_OPEN_RE.finditer(scan_text))
    body_close = list(_BODY_CLOSE_RE.finditer(scan_text))
    issues: list[str] = []

    # Counts are enforced per tag pair only when that pair is used at all,
    # so a doctype-only or body-only document is not penalised.
    if html_close or html_open:
        if len(html_open) != 1:
            issues.append(
                f"expected exactly one opening <html> tag (found {len(html_open)})"
            )
        if len(html_close) != 1:
            issues.append(
                f"expected exactly one closing </html> tag (found {len(html_close)})"
            )
    if body_close or body_open:
        if len(body_open) != 1:
            issues.append(
                f"expected exactly one opening <body> tag (found {len(body_open)})"
            )
        if len(body_close) != 1:
            issues.append(
                f"expected exactly one closing </body> tag (found {len(body_close)})"
            )

    # Ordering checks compare first openings and last closings.
    if html_open and html_close and html_open[0].start() > html_close[0].start():
        issues.append("opening <html> appears after closing </html>")
    if body_open and body_close and body_open[0].start() > body_close[0].start():
        issues.append("opening <body> appears after closing </body>")
    if html_open and body_open and body_open[0].start() < html_open[0].start():
        issues.append("opening <body> appears before opening <html>")
    if body_close and html_close and body_close[-1].start() > html_close[-1].start():
        issues.append("closing </body> appears after closing </html>")
    # Comments were blanked above, so a trailing comment is not "content".
    if html_close and scan_text[html_close[-1].end():].strip():
        issues.append("content appears after closing </html>")

    if not issues:
        return ValidationResult(valid=True)

    preview = "; ".join(issues[:3])
    if len(issues) > 3:
        preview += "; ..."
    return ValidationResult(
        valid=False,
        reason="HTML document structure would be invalid",
        suggestion=(
            f"{preview}. Keep the existing closing document tail intact. For "
            "content expansion, insert substantive body sections before the current "
            "`</body></html>` tail; for malformed files, write one complete HTML "
            "document with exactly one `<html>`, `<body>`, `</body>`, and `</html>` "
            "and no content after `</html>`."
        ),
        severity="error",
    )
| 1163 | + |
| 1048 | 1164 | def _validate_html_write_local_link_scope( |
| 1049 | 1165 | self, |
| 1050 | 1166 | file_path: str, |
@@ -1388,6 +1504,43 @@ class PreActionValidator: |
| 1388 | 1504 | return new_string |
| 1389 | 1505 | return current.replace(old_string, new_string, 1) |
| 1390 | 1506 | |
def _prospective_patch_content(
    self,
    file_path: str,
    hunks: list[dict[str, object] | StructuredPatchHunk],
    raw_patch: object,
) -> str | None:
    """Compute the content an HTML file would have after a patch.

    Structured ``hunks`` take precedence; ``raw_patch`` is parsed as a
    unified diff only when no hunks were supplied.

    Returns:
        The result of ``apply_structured_patch`` on the file's current text
        (presumably the patched text -- confirm against the helper), or
        ``None`` when the target is not ``.html``/``.htm``, the file cannot
        be read or decoded, no usable hunks are available, or a helper
        raises ``ValueError``.
    """
    target = Path(file_path).expanduser()
    if target.suffix.lower() not in {".html", ".htm"}:
        return None

    try:
        before = target.read_text()
    except (OSError, UnicodeDecodeError):
        # Missing or unreadable file: nothing to project the patch onto.
        return None

    try:
        if hunks:
            before_lines = before.splitlines()
            resolved: list[StructuredPatchHunk] = []
            for entry in hunks:
                if isinstance(entry, StructuredPatchHunk):
                    resolved.append(entry)
                else:
                    # Dict payloads are resolved against the current lines.
                    resolved.append(
                        StructuredPatchHunk.from_dict_with_original(
                            entry,
                            original_lines=before_lines,
                        )
                    )
        elif isinstance(raw_patch, str) and raw_patch.strip():
            resolved = parse_unified_diff_patch(raw_patch)
        else:
            return None

        return apply_structured_patch(before, resolved) if resolved else None
    except ValueError:
        # Best-effort: an unparsable or inapplicable patch yields no preview.
        return None
| 1543 | + |
| 1391 | 1544 | def _allows_root_html_graph_seed( |
| 1392 | 1545 | self, |
| 1393 | 1546 | file_path: str, |