// SPDX-License-Identifier: AGPL-3.0-or-later
package markdown
import (
"regexp"
"github.com/microcosm-cc/bluemonday"
)
// sanitizer is the package-private bluemonday policy. Built once;
// safe for concurrent use.
//
// Threat model:
//
// - Goldmark with HTML rendering off omits user-injected raw
// HTML at render time, so the sanitizer is *defense in depth* —
// anything that reaches it should already be Goldmark-emitted
// HTML or our own AST-extension output.
// - The strict scheme allowlist (`http`, `https`, `mailto`) blocks
// `javascript:`, `data:`, `vbscript:` URIs entirely.
// - We diverge from GitHub by *not* allowing `data:image/...`. If
// a user wants an inline image, repo-relative paths work via
// /raw/. Documented in docs/markdown.md.
var sanitizer = func() *bluemonday.Policy {
p := bluemonday.UGCPolicy()
// Headings keep their auto-generated id so anchor links work.
p.AllowAttrs("id").OnElements("h1", "h2", "h3", "h4", "h5", "h6")
p.AllowAttrs("align").Matching(reAlign).OnElements("p", "div", "h1", "h2", "h3", "h4", "h5", "h6")
// Code-block class allowlist for Chroma (`language-foo`). The
// SpaceSeparatedTokens matcher is bluemonday-built-in; we
// further constrain via the regex below so only `language-*` and
// chroma's own ancillary classes pass.
p.AllowAttrs("class").Matching(reCodeClass).OnElements("code", "pre", "span")
// GFM task lists: Goldmark emits `` inside `