Go · 3360 bytes Raw Blame History
1 // SPDX-License-Identifier: AGPL-3.0-or-later
2
3 package markdown
4
5 import (
6 "regexp"
7 "sync"
8
9 "github.com/microcosm-cc/bluemonday"
10 )
11
12 // sanitizer is the package-private bluemonday policy. Built once;
13 // safe for concurrent use.
14 //
15 // Threat model:
16 //
17 // - Goldmark with HTML rendering off escapes user-injected raw
18 // HTML at parse time, so the sanitizer is *defense in depth* —
19 // anything that reaches it should already be Goldmark-emitted
20 // HTML or our own AST-extension output.
21 // - The strict scheme allowlist (`http`, `https`, `mailto`) blocks
22 // `javascript:`, `data:`, `vbscript:` URIs entirely.
23 // - We diverge from GitHub by *not* allowing `data:image/...`. If
24 // a user wants an inline image, repo-relative paths work via
25 // /raw/. Documented in docs/markdown.md.
26 var sanitizer = func() *bluemonday.Policy {
27 p := bluemonday.UGCPolicy()
28
29 // Headings keep their auto-generated id so anchor links work.
30 p.AllowAttrs("id").OnElements("h1", "h2", "h3", "h4", "h5", "h6")
31
32 // Code-block class allowlist for Chroma (`language-foo`). The
33 // SpaceSeparatedTokens matcher is bluemonday-built-in; we
34 // further constrain via the regex below so only `language-*` and
35 // chroma's own ancillary classes pass.
36 p.AllowAttrs("class").Matching(reCodeClass).OnElements("code", "pre", "span")
37
38 // GFM task lists: Goldmark emits `<input checked="" disabled=""
39 // type="checkbox" />` inside `<li>`. UGCPolicy doesn't allow
40 // input by default; whitelist the disabled-checkbox shape only.
41 // `type` is matched against "checkbox"; `disabled` and `checked`
42 // are HTML boolean attrs (value commonly empty), so we don't
43 // constrain the value — presence is the signal.
44 p.AllowAttrs("type").Matching(regexp.MustCompile(`^checkbox$`)).OnElements("input")
45 p.AllowAttrs("disabled", "checked").OnElements("input")
46
47 // Folded sections + keyboard markers. Common in READMEs.
48 p.AllowElements("details", "summary", "kbd", "sup", "sub")
49
50 // Mention / ref / commit anchors emitted by our extensions carry
51 // these classes for styling. The base UGC policy already allows
52 // <a> with href + rel; we just need the class allowlist above to
53 // cover `shithub-mention`, `shithub-ref`, `shithub-commit`.
54
55 // Hard-restrict URL schemes. UGCPolicy already restricts schemes
56 // on <a>, but we tighten further: drop ftp, drop data:, leave
57 // only http(s) + mailto.
58 p.AllowURLSchemes("http", "https", "mailto")
59 // Image schemes — UGC allows http/https only by default; we keep
60 // that. No data: anywhere.
61 p.AllowImages()
62
63 // rel="noopener noreferrer" auto-added when target="_blank" is
64 // set. We only set target via opts on autolinks; let bluemonday
65 // keep its default rel-handling.
66 p.RequireNoFollowOnLinks(true)
67 return p
68 }()
69
// reCodeClass validates a complete `class` attribute value. The value
// must consist of one or more whitespace-separated tokens, each of which
// is `language-<name>`, `chroma`, `chroma-<letters>`, `nl`, `ln`, `line`
// or `hl`. The pattern is anchored at both ends, so a single unknown
// token (or leading/trailing whitespace) rejects the whole value.
var reCodeClass = func() *regexp.Regexp {
	// One permitted class token; repeated below for the "more tokens" tail.
	const token = `(?:language-[A-Za-z0-9_+\-]+|chroma|chroma-[a-zA-Z]+|nl|ln|line|hl)`
	return regexp.MustCompile(`^` + token + `(?:\s+` + token + `)*$`)
}()
71
72 // sanitizeBytes is the hot-path entry the Render pipeline uses. The
73 // bluemonday Policy is itself goroutine-safe; the once.Do here keeps
74 // the regex compilation cost off the first request path.
75 var sanitizerOnce sync.Once
76
77 func sanitizeBytes(in []byte) []byte {
78 sanitizerOnce.Do(func() {
79 // Touch the package-level sanitizer to ensure it's built.
80 _ = sanitizer
81 })
82 return sanitizer.SanitizeBytes(in)
83 }
84