Go · 3399 bytes Raw Blame History
1 // SPDX-License-Identifier: AGPL-3.0-or-later
2
3 package markdown
4
5 import (
6 "regexp"
7
8 "github.com/microcosm-cc/bluemonday"
9 )
10
11 // sanitizer is the package-private bluemonday policy. Built once;
12 // safe for concurrent use.
13 //
14 // Threat model:
15 //
16 // - Goldmark with HTML rendering off escapes user-injected raw
17 // HTML at parse time, so the sanitizer is *defense in depth* —
18 // anything that reaches it should already be Goldmark-emitted
19 // HTML or our own AST-extension output.
20 // - The strict scheme allowlist (`http`, `https`, `mailto`) blocks
21 // `javascript:`, `data:`, `vbscript:` URIs entirely.
22 // - We diverge from GitHub by *not* allowing `data:image/...`. If
23 // a user wants an inline image, repo-relative paths work via
24 // /raw/. Documented in docs/markdown.md.
25 var sanitizer = func() *bluemonday.Policy {
26 p := bluemonday.UGCPolicy()
27
28 // Headings keep their auto-generated id so anchor links work.
29 p.AllowAttrs("id").OnElements("h1", "h2", "h3", "h4", "h5", "h6")
30 p.AllowAttrs("align").Matching(reAlign).OnElements("p", "div", "h1", "h2", "h3", "h4", "h5", "h6")
31
32 // Code-block class allowlist for Chroma (`language-foo`). The
33 // SpaceSeparatedTokens matcher is bluemonday-built-in; we
34 // further constrain via the regex below so only `language-*` and
35 // chroma's own ancillary classes pass.
36 p.AllowAttrs("class").Matching(reCodeClass).OnElements("code", "pre", "span")
37
38 // GFM task lists: Goldmark emits `<input checked="" disabled=""
39 // type="checkbox" />` inside `<li>`. UGCPolicy doesn't allow
40 // input by default; whitelist the disabled-checkbox shape only.
41 // `type` is matched against "checkbox"; `disabled` and `checked`
42 // are HTML boolean attrs (value commonly empty), so we don't
43 // constrain the value — presence is the signal.
44 p.AllowAttrs("type").Matching(regexp.MustCompile(`^checkbox$`)).OnElements("input")
45 p.AllowAttrs("disabled", "checked").OnElements("input")
46
47 // Folded sections + keyboard markers. Common in READMEs.
48 p.AllowElements("details", "summary", "kbd", "sup", "sub")
49
50 // Mention / ref / commit anchors emitted by our extensions carry
51 // these classes for styling. The base UGC policy already allows
52 // <a> with href + rel; we just need the class allowlist above to
53 // cover `shithub-mention`, `shithub-ref`, `shithub-commit`.
54
55 // Hard-restrict URL schemes. UGCPolicy already restricts schemes
56 // on <a>, but we tighten further: drop ftp, drop data:, leave
57 // only http(s) + mailto.
58 p.AllowURLSchemes("http", "https", "mailto")
59 // Image schemes — UGC allows http/https only by default; we keep
60 // that. No data: anywhere.
61 p.AllowImages()
62
63 // rel="noopener noreferrer" auto-added when target="_blank" is
64 // set. We only set target via opts on autolinks; let bluemonday
65 // keep its default rel-handling.
66 p.RequireNoFollowOnLinks(true)
67 return p
68 }()
69
var (
	// reCodeClass matches a class attribute value consisting of one or
	// more whitespace-separated tokens, each of which is `language-*`,
	// `chroma`, `chroma-*`, `nl`, `ln`, `line` or `hl`. The pattern is
	// assembled from a single token definition so the "first token"
	// and "following tokens" parts cannot drift apart.
	reCodeClass = func() *regexp.Regexp {
		const token = `(?:language-[A-Za-z0-9_+\-]+|chroma|chroma-[a-zA-Z]+|nl|ln|line|hl)`
		return regexp.MustCompile(`^` + token + `(?:\s+` + token + `)*$`)
	}()

	// reAlign matches exactly one of the lowercase alignment keywords
	// left, center or right.
	reAlign = regexp.MustCompile(`^(?:left|center|right)$`)
)
74
75 // sanitizeBytes is the hot-path entry the Render pipeline uses. The
76 // bluemonday Policy is built at package init via the var initializer
77 // above and is itself goroutine-safe — no need for sync.Once gymnastics.
78 func sanitizeBytes(in []byte) []byte {
79 return sanitizer.SanitizeBytes(in)
80 }
81