`b3d784b`

Guard HTML repair structure

Authored by mfwolffe <wolffemf@dukes.jmu.edu> 1 week ago

SHA: b3d784b5f83a1b941d56f76074ed1bf27c76524e
Parents: 515ed7c
Tree: 188e229

2 changed files

Status	File	+	-
M	`src/loader/runtime/safeguard_services.py`	154	1
M	`tests/test_safeguard_services.py`	82	0

src/loader/runtime/safeguard_services.py — modified

  from difflib import get_close_matches
  from pathlib import Path
--from ..tools.fs_safety import coerce_structured_patch_payload
++from ..tools.fs_safety import (
++    StructuredPatchHunk,
++    apply_structured_patch,
++    coerce_structured_patch_payload,
++    parse_unified_diff_patch,
++)
  TEXT_REWRITE_SUFFIXES = frozenset(
+     {
      return {token for token in re.split(r"[^a-z0-9]+", stem) if token}
++_HTML_OPEN_RE = re.compile(r"<html\b", re.IGNORECASE)
++_HTML_CLOSE_RE = re.compile(r"</html\s*>", re.IGNORECASE)
++_BODY_OPEN_RE = re.compile(r"<body\b", re.IGNORECASE)
++_BODY_CLOSE_RE = re.compile(r"</body\s*>", re.IGNORECASE)
++
++
++def _html_text_looks_like_document(text: str) -> bool:
++    lowered = str(text or "").lower()
++    return any(
++        marker in lowered
++        for marker in ("<!doctype", "<html", "</html", "<body", "</body")
++    )
++
++
  def _ordered_html_target_number(target: str) -> int | None:
      match = re.match(r"(\d+)[-_]", Path(target).name)
      if match is None:
          if not html_placeholder_result.valid:
              return html_placeholder_result
++        html_document_result = self._validate_html_document_integrity_content(
++            str(file_path),
++            str(content),
++        )
++        if not html_document_result.valid:
++            return html_document_result
++
          sensitive_paths = ['/etc/', '/usr/', '/bin/', '/sbin/', '/boot/', '/sys/', '/proc/']
          for sensitive in sensitive_paths:
              if file_path.startswith(sensitive):
          if not html_placeholder_result.valid:
              return html_placeholder_result
++        html_document_result = self._validate_html_document_integrity_content(
++            str(file_path),
++            prospective_content,
++        )
++        if not html_document_result.valid:
++            return html_document_result
++
          html_index_result = self._validate_html_index_links(
              str(file_path),
              prospective_content,
          if not html_placeholder_result.valid:
              return html_placeholder_result
++        prospective_content = self._prospective_patch_content(
++            str(file_path),
++            structured_hunks,
++            raw_patch,
++        )
++        if prospective_content is not None:
++            html_document_result = self._validate_html_document_integrity_content(
++                str(file_path),
++                prospective_content,
++            )
++            if not html_document_result.valid:
++                return html_document_result
++
          return ValidationResult(valid=True)
      def _validate_html_placeholder_content(
              "\n".join(added_fragments),
+         )
++    def _validate_html_document_integrity_content(
++        self,
++        file_path: str,
++        content: str,
++    ) -> ValidationResult:
++        """Reject HTML mutations that damage an existing document envelope."""
++
++        normalized = Path(file_path).expanduser()
++        if normalized.suffix.lower() not in {".html", ".htm"}:
++            return ValidationResult(valid=True)
++
++        text = str(content or "")
++        if not _html_text_looks_like_document(text):
++            return ValidationResult(valid=True)
++
++        html_open = list(_HTML_OPEN_RE.finditer(text))
++        html_close = list(_HTML_CLOSE_RE.finditer(text))
++        body_open = list(_BODY_OPEN_RE.finditer(text))
++        body_close = list(_BODY_CLOSE_RE.finditer(text))
++        issues: list[str] = []
++
++        if html_close or html_open:
++            if len(html_open) != 1:
++                issues.append(
++                    f"expected exactly one opening <html> tag (found {len(html_open)})"
++                )
++            if len(html_close) != 1:
++                issues.append(
++                    f"expected exactly one closing </html> tag (found {len(html_close)})"
++                )
++        if body_close or body_open:
++            if len(body_open) != 1:
++                issues.append(
++                    f"expected exactly one opening <body> tag (found {len(body_open)})"
++                )
++            if len(body_close) != 1:
++                issues.append(
++                    f"expected exactly one closing </body> tag (found {len(body_close)})"
++                )
++
++        if html_open and html_close and html_open[0].start() > html_close[0].start():
++            issues.append("opening <html> appears after closing </html>")
++        if body_open and body_close and body_open[0].start() > body_close[0].start():
++            issues.append("opening <body> appears after closing </body>")
++        if html_open and body_open and body_open[0].start() < html_open[0].start():
++            issues.append("opening <body> appears before opening <html>")
++        if body_close and html_close and body_close[-1].start() > html_close[-1].start():
++            issues.append("closing </body> appears after closing </html>")
++        if html_close and text[html_close[-1].end() :].strip():
++            issues.append("content appears after closing </html>")
++
++        if not issues:
++            return ValidationResult(valid=True)
++
++        preview = "; ".join(issues[:3])
++        if len(issues) > 3:
++            preview += "; ..."
++        return ValidationResult(
++            valid=False,
++            reason="HTML document structure would be invalid",
++            suggestion=(
++                f"{preview}. Keep the existing closing document tail intact. For "
++                "content expansion, insert substantive body sections before the current "
++                "`</body></html>` tail; for malformed files, write one complete HTML "
++                "document with exactly one `<html>`, `<body>`, `</body>`, and `</html>` "
++                "and no content after `</html>`."
++            ),
++            severity="error",
++        )
++
      def _validate_html_write_local_link_scope(
          self,
          file_path: str,
              return new_string
          return current.replace(old_string, new_string, 1)
++    def _prospective_patch_content(
++        self,
++        file_path: str,
++        hunks: list[dict[str, object] | StructuredPatchHunk],
++        raw_patch: object,
++    ) -> str | None:
++        normalized = Path(file_path).expanduser()
++        if normalized.suffix.lower() not in {".html", ".htm"}:
++            return None
++        try:
++            original_content = normalized.read_text()
++        except (OSError, UnicodeDecodeError):
++            return None
++
++        try:
++            parsed_hunks: list[StructuredPatchHunk]
++            if hunks:
++                original_lines = original_content.splitlines()
++                parsed_hunks = [
++                    hunk
++                    if isinstance(hunk, StructuredPatchHunk)
++                    else StructuredPatchHunk.from_dict_with_original(
++                        hunk,
++                        original_lines=original_lines,
++                    )
++                    for hunk in hunks
++                ]
++            elif isinstance(raw_patch, str) and raw_patch.strip():
++                parsed_hunks = parse_unified_diff_patch(raw_patch)
++            else:
++                return None
++            if not parsed_hunks:
++                return None
++            return apply_structured_patch(original_content, parsed_hunks)
++        except ValueError:
++            return None
++
      def _allows_root_html_graph_seed(
          self,
          file_path: str,

tests/test_safeguard_services.py — modified

      assert "coming soon" in result.suggestion
++def test_pre_action_validator_blocks_html_patch_that_removes_closing_body(
++    tmp_path: Path,
++) -> None:
++    validator = PreActionValidator()
++    page = tmp_path / "guide" / "chapters" / "02-installation.html"
++    page.parent.mkdir(parents=True)
++    page.write_text("<html>\n<body>\n<h1>Installation</h1>\n</body>\n</html>\n")
++
++    result = validator.validate(
++        "patch",
++        {
++            "file_path": str(page),
++            "hunks": [
++                {
++                    "old_start": 4,
++                    "old_lines": 1,
++                    "new_start": 4,
++                    "new_lines": 0,
++                    "lines": ["-</body>"],
++                }
++            ],
++        },
++    )
++
++    assert result.valid is False
++    assert result.reason == "HTML document structure would be invalid"
++    assert "expected exactly one closing </body> tag (found 0)" in result.suggestion
++    assert (
++        "insert substantive body sections before the current `</body></html>` tail"
++        in result.suggestion
++    )
++
++
++def test_pre_action_validator_blocks_html_edit_that_wraps_tail_in_new_body(
++    tmp_path: Path,
++) -> None:
++    validator = PreActionValidator()
++    page = tmp_path / "guide" / "chapters" / "02-installation.html"
++    page.parent.mkdir(parents=True)
++    page.write_text("<html><body><h1>Installation</h1></body></html>")
++
++    result = validator.validate(
++        "edit",
++        {
++            "file_path": str(page),
++            "old_string": "</body></html>",
++            "new_string": (
++                "<body><h1>Installation</h1>"
++                "<p>Short replacement fragment.</p></body></html>"
++            ),
++        },
++    )
++
++    assert result.valid is False
++    assert result.reason == "HTML document structure would be invalid"
++    assert "expected exactly one opening <body> tag (found 2)" in result.suggestion
++
++
++def test_pre_action_validator_allows_html_edit_inserted_before_closing_tail(
++    tmp_path: Path,
++) -> None:
++    validator = PreActionValidator()
++    page = tmp_path / "guide" / "chapters" / "02-installation.html"
++    page.parent.mkdir(parents=True)
++    page.write_text("<html><body><h1>Installation</h1></body></html>")
++
++    result = validator.validate(
++        "edit",
++        {
++            "file_path": str(page),
++            "old_string": "</body></html>",
++            "new_string": (
++                "<section><h2>Package Managers</h2>"
++                "<p>Install Nginx with the package manager for your platform.</p>"
++                "</section></body></html>"
++            ),
++        },
++    )
++
++    assert result.valid is True
++
++
  def test_pre_action_validator_blocks_missing_local_html_asset_href(tmp_path: Path) -> None:
      validator = PreActionValidator()
      page = tmp_path / "guide" / "chapters" / "07-performance.html"