`b3d784b`

Guard HTML repair structure

Authored by mfwolffe <wolffemf@dukes.jmu.edu> 1 week ago

SHA: b3d784b5f83a1b941d56f76074ed1bf27c76524e
Parents: 515ed7c
Tree: 188e229

2 changed files

Status	File	+	-
M	`src/loader/runtime/safeguard_services.py`	154	1
M	`tests/test_safeguard_services.py`	82	0

src/loader/runtime/safeguard_services.py (modified)

  from difflib import get_close_matches
  from pathlib import Path
 -from ..tools.fs_safety import coerce_structured_patch_payload
 +from ..tools.fs_safety import (
 +    StructuredPatchHunk,
 +    apply_structured_patch,
 +    coerce_structured_patch_payload,
 +    parse_unified_diff_patch,
 +)
  TEXT_REWRITE_SUFFIXES = frozenset(
+     {
      return {token for token in re.split(r"[^a-z0-9]+", stem) if token}
 +_HTML_OPEN_RE = re.compile(r"<html\b", re.IGNORECASE)
 +_HTML_CLOSE_RE = re.compile(r"</html\s*>", re.IGNORECASE)
 +_BODY_OPEN_RE = re.compile(r"<body\b", re.IGNORECASE)
 +_BODY_CLOSE_RE = re.compile(r"</body\s*>", re.IGNORECASE)
++
++
 +def _html_text_looks_like_document(text: str) -> bool:
 +    lowered = str(text or "").lower()
 +    return any(
 +        marker in lowered
 +        for marker in ("<!doctype", "<html", "</html", "<body", "</body")
 +    )
++
++
  def _ordered_html_target_number(target: str) -> int | None:
      match = re.match(r"(\d+)[-_]", Path(target).name)
      if match is None:
          if not html_placeholder_result.valid:
              return html_placeholder_result
 +        html_document_result = self._validate_html_document_integrity_content(
 +            str(file_path),
 +            str(content),
 +        )
 +        if not html_document_result.valid:
 +            return html_document_result
++
          sensitive_paths = ['/etc/', '/usr/', '/bin/', '/sbin/', '/boot/', '/sys/', '/proc/']
          for sensitive in sensitive_paths:
              if file_path.startswith(sensitive):
          if not html_placeholder_result.valid:
              return html_placeholder_result
 +        html_document_result = self._validate_html_document_integrity_content(
 +            str(file_path),
 +            prospective_content,
 +        )
 +        if not html_document_result.valid:
 +            return html_document_result
++
          html_index_result = self._validate_html_index_links(
              str(file_path),
              prospective_content,
          if not html_placeholder_result.valid:
              return html_placeholder_result
 +        prospective_content = self._prospective_patch_content(
 +            str(file_path),
 +            structured_hunks,
 +            raw_patch,
 +        )
 +        if prospective_content is not None:
 +            html_document_result = self._validate_html_document_integrity_content(
 +                str(file_path),
 +                prospective_content,
 +            )
 +            if not html_document_result.valid:
 +                return html_document_result
++
          return ValidationResult(valid=True)
      def _validate_html_placeholder_content(
              "\n".join(added_fragments),
+         )
 +    def _validate_html_document_integrity_content(
 +        self,
 +        file_path: str,
 +        content: str,
 +    ) -> ValidationResult:
 +        """Reject HTML mutations that damage an existing document envelope."""
++
 +        normalized = Path(file_path).expanduser()
 +        if normalized.suffix.lower() not in {".html", ".htm"}:
 +            return ValidationResult(valid=True)
++
 +        text = str(content or "")
 +        if not _html_text_looks_like_document(text):
 +            return ValidationResult(valid=True)
++
 +        html_open = list(_HTML_OPEN_RE.finditer(text))
 +        html_close = list(_HTML_CLOSE_RE.finditer(text))
 +        body_open = list(_BODY_OPEN_RE.finditer(text))
 +        body_close = list(_BODY_CLOSE_RE.finditer(text))
 +        issues: list[str] = []
++
 +        if html_close or html_open:
 +            if len(html_open) != 1:
 +                issues.append(
 +                    f"expected exactly one opening <html> tag (found {len(html_open)})"
 +                )
 +            if len(html_close) != 1:
 +                issues.append(
 +                    f"expected exactly one closing </html> tag (found {len(html_close)})"
 +                )
 +        if body_close or body_open:
 +            if len(body_open) != 1:
 +                issues.append(
 +                    f"expected exactly one opening <body> tag (found {len(body_open)})"
 +                )
 +            if len(body_close) != 1:
 +                issues.append(
 +                    f"expected exactly one closing </body> tag (found {len(body_close)})"
 +                )
++
 +        if html_open and html_close and html_open[0].start() > html_close[0].start():
 +            issues.append("opening <html> appears after closing </html>")
 +        if body_open and body_close and body_open[0].start() > body_close[0].start():
 +            issues.append("opening <body> appears after closing </body>")
 +        if html_open and body_open and body_open[0].start() < html_open[0].start():
 +            issues.append("opening <body> appears before opening <html>")
 +        if body_close and html_close and body_close[-1].start() > html_close[-1].start():
 +            issues.append("closing </body> appears after closing </html>")
 +        if html_close and text[html_close[-1].end() :].strip():
 +            issues.append("content appears after closing </html>")
++
 +        if not issues:
 +            return ValidationResult(valid=True)
++
 +        preview = "; ".join(issues[:3])
 +        if len(issues) > 3:
 +            preview += "; ..."
 +        return ValidationResult(
 +            valid=False,
 +            reason="HTML document structure would be invalid",
 +            suggestion=(
 +                f"{preview}. Keep the existing closing document tail intact. For "
 +                "content expansion, insert substantive body sections before the current "
 +                "`</body></html>` tail; for malformed files, write one complete HTML "
 +                "document with exactly one `<html>`, `<body>`, `</body>`, and `</html>` "
 +                "and no content after `</html>`."
 +            ),
 +            severity="error",
 +        )
++
      def _validate_html_write_local_link_scope(
          self,
          file_path: str,
              return new_string
          return current.replace(old_string, new_string, 1)
 +    def _prospective_patch_content(
 +        self,
 +        file_path: str,
 +        hunks: list[dict[str, object] | StructuredPatchHunk],
 +        raw_patch: object,
 +    ) -> str | None:
 +        normalized = Path(file_path).expanduser()
 +        if normalized.suffix.lower() not in {".html", ".htm"}:
 +            return None
 +        try:
 +            original_content = normalized.read_text()
 +        except (OSError, UnicodeDecodeError):
 +            return None
++
 +        try:
 +            parsed_hunks: list[StructuredPatchHunk]
 +            if hunks:
 +                original_lines = original_content.splitlines()
 +                parsed_hunks = [
 +                    hunk
 +                    if isinstance(hunk, StructuredPatchHunk)
 +                    else StructuredPatchHunk.from_dict_with_original(
 +                        hunk,
 +                        original_lines=original_lines,
 +                    )
 +                    for hunk in hunks
 +                ]
 +            elif isinstance(raw_patch, str) and raw_patch.strip():
 +                parsed_hunks = parse_unified_diff_patch(raw_patch)
 +            else:
 +                return None
 +            if not parsed_hunks:
 +                return None
 +            return apply_structured_patch(original_content, parsed_hunks)
 +        except ValueError:
 +            return None
++
      def _allows_root_html_graph_seed(
          self,
          file_path: str,

tests/test_safeguard_services.py (modified)

      assert "coming soon" in result.suggestion
 +def test_pre_action_validator_blocks_html_patch_that_removes_closing_body(
 +    tmp_path: Path,
 +) -> None:
 +    validator = PreActionValidator()
 +    page = tmp_path / "guide" / "chapters" / "02-installation.html"
 +    page.parent.mkdir(parents=True)
 +    page.write_text("<html>\n<body>\n<h1>Installation</h1>\n</body>\n</html>\n")
++
 +    result = validator.validate(
 +        "patch",
 +        {
 +            "file_path": str(page),
 +            "hunks": [
 +                {
 +                    "old_start": 4,
 +                    "old_lines": 1,
 +                    "new_start": 4,
 +                    "new_lines": 0,
 +                    "lines": ["-</body>"],
 +                }
 +            ],
 +        },
 +    )
++
 +    assert result.valid is False
 +    assert result.reason == "HTML document structure would be invalid"
 +    assert "expected exactly one closing </body> tag (found 0)" in result.suggestion
 +    assert (
 +        "insert substantive body sections before the current `</body></html>` tail"
 +        in result.suggestion
 +    )
++
++
 +def test_pre_action_validator_blocks_html_edit_that_wraps_tail_in_new_body(
 +    tmp_path: Path,
 +) -> None:
 +    validator = PreActionValidator()
 +    page = tmp_path / "guide" / "chapters" / "02-installation.html"
 +    page.parent.mkdir(parents=True)
 +    page.write_text("<html><body><h1>Installation</h1></body></html>")
++
 +    result = validator.validate(
 +        "edit",
 +        {
 +            "file_path": str(page),
 +            "old_string": "</body></html>",
 +            "new_string": (
 +                "<body><h1>Installation</h1>"
 +                "<p>Short replacement fragment.</p></body></html>"
 +            ),
 +        },
 +    )
++
 +    assert result.valid is False
 +    assert result.reason == "HTML document structure would be invalid"
 +    assert "expected exactly one opening <body> tag (found 2)" in result.suggestion
++
++
 +def test_pre_action_validator_allows_html_edit_inserted_before_closing_tail(
 +    tmp_path: Path,
 +) -> None:
 +    validator = PreActionValidator()
 +    page = tmp_path / "guide" / "chapters" / "02-installation.html"
 +    page.parent.mkdir(parents=True)
 +    page.write_text("<html><body><h1>Installation</h1></body></html>")
++
 +    result = validator.validate(
 +        "edit",
 +        {
 +            "file_path": str(page),
 +            "old_string": "</body></html>",
 +            "new_string": (
 +                "<section><h2>Package Managers</h2>"
 +                "<p>Install Nginx with the package manager for your platform.</p>"
 +                "</section></body></html>"
 +            ),
 +        },
 +    )
++
 +    assert result.valid is True
++
++
  def test_pre_action_validator_blocks_missing_local_html_asset_href(tmp_path: Path) -> None:
      validator = PreActionValidator()
      page = tmp_path / "guide" / "chapters" / "07-performance.html"