@@ -9,7 +9,12 @@ from dataclasses import dataclass |
| 9 | from difflib import get_close_matches | 9 | from difflib import get_close_matches |
| 10 | from pathlib import Path | 10 | from pathlib import Path |
| 11 | | 11 | |
| 12 | -from ..tools.fs_safety import coerce_structured_patch_payload | 12 | +from ..tools.fs_safety import ( |
| | 13 | + StructuredPatchHunk, |
| | 14 | + apply_structured_patch, |
| | 15 | + coerce_structured_patch_payload, |
| | 16 | + parse_unified_diff_patch, |
| | 17 | +) |
| 13 | | 18 | |
| 14 | TEXT_REWRITE_SUFFIXES = frozenset( | 19 | TEXT_REWRITE_SUFFIXES = frozenset( |
| 15 | { | 20 | { |
@@ -50,6 +55,20 @@ def _html_target_tokens(target: str) -> set[str]: |
| 50 | return {token for token in re.split(r"[^a-z0-9]+", stem) if token} | 55 | return {token for token in re.split(r"[^a-z0-9]+", stem) if token} |
| 51 | | 56 | |
| 52 | | 57 | |
| | 58 | +_HTML_OPEN_RE = re.compile(r"<html\b", re.IGNORECASE) |
| | 59 | +_HTML_CLOSE_RE = re.compile(r"</html\s*>", re.IGNORECASE) |
| | 60 | +_BODY_OPEN_RE = re.compile(r"<body\b", re.IGNORECASE) |
| | 61 | +_BODY_CLOSE_RE = re.compile(r"</body\s*>", re.IGNORECASE) |
| | 62 | + |
| | 63 | + |
| | 64 | +def _html_text_looks_like_document(text: str) -> bool: |
| | 65 | + lowered = str(text or "").lower() |
| | 66 | + return any( |
| | 67 | + marker in lowered |
| | 68 | + for marker in ("<!doctype", "<html", "</html", "<body", "</body") |
| | 69 | + ) |
| | 70 | + |
| | 71 | + |
| 53 | def _ordered_html_target_number(target: str) -> int | None: | 72 | def _ordered_html_target_number(target: str) -> int | None: |
| 54 | match = re.match(r"(\d+)[-_]", Path(target).name) | 73 | match = re.match(r"(\d+)[-_]", Path(target).name) |
| 55 | if match is None: | 74 | if match is None: |
@@ -786,6 +805,13 @@ class PreActionValidator: |
| 786 | if not html_placeholder_result.valid: | 805 | if not html_placeholder_result.valid: |
| 787 | return html_placeholder_result | 806 | return html_placeholder_result |
| 788 | | 807 | |
| | 808 | + html_document_result = self._validate_html_document_integrity_content( |
| | 809 | + str(file_path), |
| | 810 | + str(content), |
| | 811 | + ) |
| | 812 | + if not html_document_result.valid: |
| | 813 | + return html_document_result |
| | 814 | + |
| 789 | sensitive_paths = ['/etc/', '/usr/', '/bin/', '/sbin/', '/boot/', '/sys/', '/proc/'] | 815 | sensitive_paths = ['/etc/', '/usr/', '/bin/', '/sbin/', '/boot/', '/sys/', '/proc/'] |
| 790 | for sensitive in sensitive_paths: | 816 | for sensitive in sensitive_paths: |
| 791 | if file_path.startswith(sensitive): | 817 | if file_path.startswith(sensitive): |
@@ -897,6 +923,13 @@ class PreActionValidator: |
| 897 | if not html_placeholder_result.valid: | 923 | if not html_placeholder_result.valid: |
| 898 | return html_placeholder_result | 924 | return html_placeholder_result |
| 899 | | 925 | |
| | 926 | + html_document_result = self._validate_html_document_integrity_content( |
| | 927 | + str(file_path), |
| | 928 | + prospective_content, |
| | 929 | + ) |
| | 930 | + if not html_document_result.valid: |
| | 931 | + return html_document_result |
| | 932 | + |
| 900 | html_index_result = self._validate_html_index_links( | 933 | html_index_result = self._validate_html_index_links( |
| 901 | str(file_path), | 934 | str(file_path), |
| 902 | prospective_content, | 935 | prospective_content, |
@@ -975,6 +1008,19 @@ class PreActionValidator: |
| 975 | if not html_placeholder_result.valid: | 1008 | if not html_placeholder_result.valid: |
| 976 | return html_placeholder_result | 1009 | return html_placeholder_result |
| 977 | | 1010 | |
| | 1011 | + prospective_content = self._prospective_patch_content( |
| | 1012 | + str(file_path), |
| | 1013 | + structured_hunks, |
| | 1014 | + raw_patch, |
| | 1015 | + ) |
| | 1016 | + if prospective_content is not None: |
| | 1017 | + html_document_result = self._validate_html_document_integrity_content( |
| | 1018 | + str(file_path), |
| | 1019 | + prospective_content, |
| | 1020 | + ) |
| | 1021 | + if not html_document_result.valid: |
| | 1022 | + return html_document_result |
| | 1023 | + |
| 978 | return ValidationResult(valid=True) | 1024 | return ValidationResult(valid=True) |
| 979 | | 1025 | |
| 980 | def _validate_html_placeholder_content( | 1026 | def _validate_html_placeholder_content( |
@@ -1045,6 +1091,76 @@ class PreActionValidator: |
| 1045 | "\n".join(added_fragments), | 1091 | "\n".join(added_fragments), |
| 1046 | ) | 1092 | ) |
| 1047 | | 1093 | |
def _validate_html_document_integrity_content(
    self,
    file_path: str,
    content: str,
) -> ValidationResult:
    """Reject HTML content whose ``<html>``/``<body>`` envelope is malformed.

    Only ``.html``/``.htm`` targets whose content mentions a doctype,
    ``html`` or ``body`` marker are inspected; everything else is accepted.
    HTML comments and the bodies of ``<script>``/``<style>`` elements are
    blanked before tags are counted, because tag-looking text there is not
    markup and a comment after ``</html>`` is legal HTML.

    Args:
        file_path: Path of the file being written; only its suffix is used.
        content: Full prospective text of the file.

    Returns:
        ``ValidationResult(valid=True)`` when no structural problem is found,
        otherwise an error-severity result whose suggestion lists up to
        three of the detected problems.
    """

    normalized = Path(file_path).expanduser()
    if normalized.suffix.lower() not in {".html", ".htm"}:
        return ValidationResult(valid=True)

    text = str(content or "")
    if not _html_text_looks_like_document(text):
        return ValidationResult(valid=True)

    # Single left-to-right pass so that whichever construct starts first wins:
    # a "<!--" inside a script string cannot swallow later real markup, and a
    # "<script>" mentioned inside a comment is not treated as an element.
    # Script/style tags themselves are kept so that an element placed after
    # ``</html>`` is still reported as trailing content.
    text = re.sub(
        r"<!--.*?-->"
        r"|(?P<open><(?P<tag>script|style)\b[^>]*>).*?(?P<close></(?P=tag)\s*>)",
        lambda match: (
            " "
            if match.group("open") is None
            else match.group("open") + match.group("close")
        ),
        text,
        flags=re.IGNORECASE | re.DOTALL,
    )

    html_open = list(_HTML_OPEN_RE.finditer(text))
    html_close = list(_HTML_CLOSE_RE.finditer(text))
    body_open = list(_BODY_OPEN_RE.finditer(text))
    body_close = list(_BODY_CLOSE_RE.finditer(text))
    issues: list[str] = []

    # Tag counts: once either half of a pair is present, both halves must
    # appear exactly once.
    if html_close or html_open:
        if len(html_open) != 1:
            issues.append(
                f"expected exactly one opening <html> tag (found {len(html_open)})"
            )
        if len(html_close) != 1:
            issues.append(
                f"expected exactly one closing </html> tag (found {len(html_close)})"
            )
    if body_close or body_open:
        if len(body_open) != 1:
            issues.append(
                f"expected exactly one opening <body> tag (found {len(body_open)})"
            )
        if len(body_close) != 1:
            issues.append(
                f"expected exactly one closing </body> tag (found {len(body_close)})"
            )

    # Relative ordering of the envelope tags.
    if html_open and html_close and html_open[0].start() > html_close[0].start():
        issues.append("opening <html> appears after closing </html>")
    if body_open and body_close and body_open[0].start() > body_close[0].start():
        issues.append("opening <body> appears after closing </body>")
    if html_open and body_open and body_open[0].start() < html_open[0].start():
        issues.append("opening <body> appears before opening <html>")
    if body_close and html_close and body_close[-1].start() > html_close[-1].start():
        issues.append("closing </body> appears after closing </html>")
    # Anything other than whitespace (comments were blanked above) after the
    # last </html> counts as trailing content.
    if html_close and text[html_close[-1].end() :].strip():
        issues.append("content appears after closing </html>")

    if not issues:
        return ValidationResult(valid=True)

    # Keep the message short: show at most three problems.
    preview = "; ".join(issues[:3])
    if len(issues) > 3:
        preview += "; ..."
    return ValidationResult(
        valid=False,
        reason="HTML document structure would be invalid",
        suggestion=(
            f"{preview}. Keep the existing closing document tail intact. For "
            "content expansion, insert substantive body sections before the current "
            "`</body></html>` tail; for malformed files, write one complete HTML "
            "document with exactly one `<html>`, `<body>`, `</body>`, and `</html>` "
            "and no content after `</html>`."
        ),
        severity="error",
    )
| | 1163 | + |
| 1048 | def _validate_html_write_local_link_scope( | 1164 | def _validate_html_write_local_link_scope( |
| 1049 | self, | 1165 | self, |
| 1050 | file_path: str, | 1166 | file_path: str, |
@@ -1388,6 +1504,43 @@ class PreActionValidator: |
| 1388 | return new_string | 1504 | return new_string |
| 1389 | return current.replace(old_string, new_string, 1) | 1505 | return current.replace(old_string, new_string, 1) |
| 1390 | | 1506 | |
def _prospective_patch_content(
    self,
    file_path: str,
    hunks: list[dict[str, object] | StructuredPatchHunk],
    raw_patch: object,
) -> str | None:
    """Return the text an HTML file would hold after the patch is applied.

    Structured ``hunks`` take precedence; ``raw_patch`` is parsed as a
    unified diff only when no hunks were supplied and it is a non-blank
    string.

    Returns:
        The patched text, or ``None`` when the target is not ``.html``/``.htm``,
        cannot be read as text, no usable hunks are available, or a
        ``ValueError`` is raised while building or applying them.
    """
    target = Path(file_path).expanduser()
    if target.suffix.lower() not in {".html", ".htm"}:
        return None

    try:
        current_text = target.read_text()
    except (OSError, UnicodeDecodeError):
        return None

    try:
        if hunks:
            current_lines = current_text.splitlines()
            resolved: list[StructuredPatchHunk] = []
            for entry in hunks:
                # Dict payloads are converted against the file's current lines.
                if not isinstance(entry, StructuredPatchHunk):
                    entry = StructuredPatchHunk.from_dict_with_original(
                        entry,
                        original_lines=current_lines,
                    )
                resolved.append(entry)
        elif isinstance(raw_patch, str) and raw_patch.strip():
            resolved = parse_unified_diff_patch(raw_patch)
        else:
            return None

        if resolved:
            return apply_structured_patch(current_text, resolved)
        return None
    except ValueError:
        return None
| | 1543 | + |
| 1391 | def _allows_root_html_graph_seed( | 1544 | def _allows_root_html_graph_seed( |
| 1392 | self, | 1545 | self, |
| 1393 | file_path: str, | 1546 | file_path: str, |