tenseleyflow/shithub / be5f355

Browse files

S25: canonical internal/markdown package (Render + sanitizer + extensions + XSS suite)

Authored by mfwolffe <wolffemf@dukes.jmu.edu>
SHA
be5f355484e1713f4e50f46efcfe4d37730018b5
Parents
c26ddd4
Tree
ee2750a

8 changed files

StatusFile+-
A internal/markdown/extensions/emoji.go 175 0
A internal/markdown/extensions/extensions.go 355 0
A internal/markdown/markdown.go 73 0
A internal/markdown/markdown_test.go 336 0
A internal/markdown/opts.go 62 0
A internal/markdown/render.go 130 0
A internal/markdown/sanitize.go 83 0
A internal/markdown/version.go 19 0
internal/markdown/extensions/emoji.goadded
@@ -0,0 +1,175 @@
1
+// SPDX-License-Identifier: AGPL-3.0-or-later
2
+
3
+package extensions
4
+
5
+// emojiMap is a small curated set of emoji shortcodes mapped to
6
+// their unicode rune sequences. Sourced from the gemoji project's
7
+// stable subset; we deliberately avoid trending/political shortcodes.
8
+//
9
+// Adding a shortcode: append below + bump `markdown.Version`. Bytes
10
+// are direct UTF-8 (no escape sequences) so the file's bytes are
11
+// the rendered output.
12
+var emojiMap = map[string]string{
13
+	// Common reactions
14
+	"+1":              "👍",
15
+	"-1":              "👎",
16
+	"thumbsup":        "👍",
17
+	"thumbsdown":      "👎",
18
+	"smile":           "😄",
19
+	"laughing":        "😆",
20
+	"joy":             "😂",
21
+	"heart":           "❤️",
22
+	"heart_eyes":      "😍",
23
+	"tada":            "🎉",
24
+	"rocket":          "🚀",
25
+	"fire":            "🔥",
26
+	"sparkles":        "✨",
27
+	"eyes":            "👀",
28
+	"thinking":        "🤔",
29
+	"thinking_face":   "🤔",
30
+	"wave":            "👋",
31
+	"clap":            "👏",
32
+	"pray":            "🙏",
33
+	"100":             "💯",
34
+	"check":           "✅",
35
+	"x":               "❌",
36
+	"warning":         "⚠️",
37
+	"bug":             "🐛",
38
+	"sparkle":         "✨",
39
+	"star":            "⭐",
40
+	"hammer":          "🔨",
41
+	"wrench":          "🔧",
42
+	"package":         "📦",
43
+	"books":           "📚",
44
+	"book":            "📖",
45
+	"memo":            "📝",
46
+	"pencil":          "✏️",
47
+	"shipit":          "🚢",
48
+	"ship":            "🚢",
49
+	"lock":            "🔒",
50
+	"unlock":          "🔓",
51
+	"key":             "🔑",
52
+	"link":            "🔗",
53
+	"speech_balloon":  "💬",
54
+	"thought_balloon": "💭",
55
+	"computer":        "💻",
56
+	"keyboard":        "⌨️",
57
+	"floppy_disk":     "💾",
58
+	"cd":              "💿",
59
+	"dvd":             "📀",
60
+	"clipboard":       "📋",
61
+	"chart":           "📈",
62
+	"bar_chart":       "📊",
63
+	"calendar":        "📅",
64
+	"date":            "📆",
65
+	"hourglass":       "⌛",
66
+	"alarm_clock":     "⏰",
67
+	"clock1":          "🕐",
68
+	"bell":            "🔔",
69
+	"no_bell":         "🔕",
70
+	"loudspeaker":     "📢",
71
+	"mega":            "📣",
72
+	"mailbox":         "📫",
73
+	"envelope":        "✉️",
74
+	"postbox":         "📮",
75
+	"package_2":       "📮",
76
+	"mag":             "🔍",
77
+	"telescope":       "🔭",
78
+	"microscope":      "🔬",
79
+	"hammer_and_wrench": "🛠️",
80
+	"gear":              "⚙️",
81
+	"toolbox":           "🧰",
82
+	"nut_and_bolt":      "🔩",
83
+	"satellite":         "📡",
84
+	"globe":             "🌍",
85
+	"earth_americas":    "🌎",
86
+	"earth_asia":        "🌏",
87
+	"new":               "🆕",
88
+	"free":              "🆓",
89
+	"abc":               "🔤",
90
+	"abcd":              "🔡",
91
+	"capital_abcd":      "🔠",
92
+	"information_source": "ℹ️",
93
+	"interrobang":        "⁉️",
94
+	"question":           "❓",
95
+	"grey_question":      "❔",
96
+	"exclamation":        "❗",
97
+	"grey_exclamation":   "❕",
98
+	"o":                  "⭕",
99
+	"x_circle":           "❌",
100
+	"white_check_mark":   "✅",
101
+	"ballot_box_with_check": "☑️",
102
+	"heavy_check_mark":      "✔️",
103
+	"heavy_multiplication_x": "✖️",
104
+	"heavy_plus_sign":        "➕",
105
+	"heavy_minus_sign":       "➖",
106
+	"heavy_division_sign":    "➗",
107
+	"recycle":                "♻️",
108
+	"infinity":               "♾️",
109
+	"trophy":                 "🏆",
110
+	"medal":                  "🏅",
111
+	"first_place":            "🥇",
112
+	"second_place":           "🥈",
113
+	"third_place":            "🥉",
114
+	"crown":                  "👑",
115
+	"gem":                    "💎",
116
+	"art":                    "🎨",
117
+	"musical_note":           "🎵",
118
+	"musical_score":           "🎼",
119
+	"sound":                  "🔊",
120
+	"mute":                   "🔇",
121
+	"video_camera":           "📹",
122
+	"camera":                 "📷",
123
+	"camera_flash":           "📸",
124
+	"film_strip":             "🎞️",
125
+	"clapper":                "🎬",
126
+	"microphone":             "🎤",
127
+	"headphones":             "🎧",
128
+	"radio":                  "📻",
129
+	"tv":                     "📺",
130
+	"phone":                  "📞",
131
+	"telephone":              "☎️",
132
+	"iphone":                 "📱",
133
+	"calling":                "📲",
134
+	"battery":                "🔋",
135
+	"electric_plug":          "🔌",
136
+	"bulb":                   "💡",
137
+	"flashlight":             "🔦",
138
+	"candle":                 "🕯️",
139
+	"sun":                    "☀️",
140
+	"sunny":                  "☀️",
141
+	"moon":                   "🌙",
142
+	"star_2":                 "🌟",
143
+	"stars":                  "🌠",
144
+	"cloud":                  "☁️",
145
+	"snowflake":              "❄️",
146
+	"zap":                    "⚡",
147
+	"umbrella":               "☂️",
148
+	"rainbow":                "🌈",
149
+	"droplet":                "💧",
150
+	"ocean":                  "🌊",
151
+	"snowman":                "☃️",
152
+	"comet":                  "☄️",
153
+	"v":                      "✌️",
154
+	"point_up":               "☝️",
155
+	"point_down":             "👇",
156
+	"point_left":             "👈",
157
+	"point_right":            "👉",
158
+	"raised_hand":            "✋",
159
+	"open_hands":             "👐",
160
+	"muscle":                 "💪",
161
+	"writing_hand":           "✍️",
162
+	"selfie":                 "🤳",
163
+	"facepunch":              "👊",
164
+	"fist":                   "✊",
165
+	"poo":                    "💩",
166
+	"hankey":                 "💩",
167
+	"shit":                   "💩",
168
+}
169
+
170
+// lookupEmoji returns the unicode replacement for a shortcode,
171
+// or ("", false) if the code isn't in our curated set.
172
+func lookupEmoji(name string) (string, bool) {
173
+	v, ok := emojiMap[name]
174
+	return v, ok
175
+}
internal/markdown/extensions/extensions.goadded
@@ -0,0 +1,355 @@
1
+// SPDX-License-Identifier: AGPL-3.0-or-later
2
+
3
+// Package extensions hosts the AST transformer that adds shithub-
4
+// specific inline patterns (`@user`, `#N`, `owner/repo#N`, commit
5
+// SHAs, emoji shortcodes) to Goldmark's parsed text without ever
6
+// touching the contents of code blocks or inline code.
7
+//
8
+// Approach: a single `parser.ASTTransformer` walks the document
9
+// after parsing, visiting only `*ast.Text` nodes whose ancestors
10
+// are NOT code/codespan/autolink/link nodes. Each visited text node
11
+// is run through one combined regex; matches are replaced with
12
+// `*ast.Link` (mention/ref/commit) or `*ast.String` (emoji) nodes,
13
+// with the surrounding text preserved as `*ast.String` segments.
14
+//
15
+// Why an ASTTransformer instead of inline parsers: inline parsers
16
+// run during the main parse pass and need a `Trigger()` byte set
17
+// plus careful interaction with Goldmark's existing inline
18
+// disambiguation. The transformer approach is simpler, well-trodden
19
+// in other Go markdown stacks, and produces equivalent output for
20
+// every input we care about.
21
+package extensions
22
+
23
+import (
24
+	"bytes"
25
+	"context"
26
+	"regexp"
27
+	"strconv"
28
+	"strings"
29
+
30
+	"github.com/yuin/goldmark"
31
+	"github.com/yuin/goldmark/ast"
32
+	"github.com/yuin/goldmark/parser"
33
+	"github.com/yuin/goldmark/text"
34
+	"github.com/yuin/goldmark/util"
35
+)
36
+
37
+// Resolvers wires the transformer against the runtime. The fields
38
+// are independent so the parent package can decide which flavors
39
+// to enable. nil-resolver means "render this kind as plain text"
40
+// (no link, no error).
41
+//
42
+// All resolvers MUST be visibility-aware. The transformer does not
43
+// re-check visibility — it trusts the resolver's `ok` to gate
44
+// existence.
45
+type Resolvers struct {
46
+	User func(ctx context.Context, username string) (href string, ok bool)
47
+	// Issue covers both same-repo (#N when ownerHint == "") and
48
+	// cross-repo (owner/repo#N).
49
+	Issue func(ctx context.Context, ownerHint, repoHint string, number int64, viewerUserID int64) (href string, ok bool)
50
+	// Commit is invoked only when RepoOwner+RepoName are both set
51
+	// (a same-repo render) and the matched token is a 7-40 char
52
+	// lowercase hex string at a word boundary.
53
+	Commit func(ctx context.Context, repoOwner, repoName, shaPrefix string) (href, fullSHA string, ok bool)
54
+}
55
+
56
+// Options is the per-render config consumed by the transformer.
57
+type Options struct {
58
+	Ctx          context.Context
59
+	RepoOwner    string
60
+	RepoName     string
61
+	ViewerUserID int64
62
+	Resolvers    Resolvers
63
+	// Refs and Mentions accumulate resolved references for the caller.
64
+	// Pointers so the transformer can append.
65
+	Refs     *[]Ref
66
+	Mentions *[]Mention
67
+}
68
+
69
+// Ref / Mention mirror the parent-package types; we redeclare to
70
+// avoid an import cycle.
71
+type Ref struct {
72
+	Kind    string
73
+	Owner   string
74
+	Repo    string
75
+	Number  int64
76
+	FullSHA string
77
+	Href    string
78
+}
79
+
80
+type Mention struct {
81
+	Username string
82
+	Href     string
83
+}
84
+
85
+// reCombined matches every pattern in one pass. Order in the
86
+// alternation is by how they appear in source after parsing — left
87
+// to right. Capture groups:
88
+//
89
+//	(?:^|[^\w/])     leading boundary (consumed but reattached as text)
90
+//	#1, #2, #3       cross-repo: owner, repo, number
91
+//	#4               same-repo: number
92
+//	#5               mention: username
93
+//	#6               commit prefix
94
+//	#7               emoji name
95
+var reCombined = regexp.MustCompile(`` +
96
+	// cross-repo: alice/proj#3
97
+	`([A-Za-z0-9][A-Za-z0-9._-]*)/([A-Za-z0-9][A-Za-z0-9._-]*)#([0-9]{1,9})\b` +
98
+	// or same-repo: #3 — must have non-word non-/ boundary on the left
99
+	`|(?:^|[^\w/])#([0-9]{1,9})\b` +
100
+	// or mention: @alice — must have non-word boundary on the left
101
+	`|(?:^|[^\w])@([A-Za-z0-9][A-Za-z0-9_-]{0,38})\b` +
102
+	// or commit SHA: 7–40 lowercase hex, word-boundary on both sides
103
+	`|(?:^|[^\w/])([0-9a-f]{7,40})\b` +
104
+	// or emoji shortcode: :smile:
105
+	`|:([a-z0-9_+\-]+):`,
106
+)
107
+
108
+// Extension is a goldmark.Extender that registers the AST transformer.
109
+type Extension struct{ Opts *Options }
110
+
111
+// New constructs the extender with the given options.
112
+func New(opts *Options) goldmark.Extender { return &Extension{Opts: opts} }
113
+
114
+// Extend implements goldmark.Extender.
115
+func (e *Extension) Extend(m goldmark.Markdown) {
116
+	m.Parser().AddOptions(parser.WithASTTransformers(
117
+		util.Prioritized(&transformer{opts: e.Opts}, 999),
118
+	))
119
+}
120
+
121
+type transformer struct{ opts *Options }
122
+
123
+// Transform walks the document and replaces matched text segments.
124
+func (t *transformer) Transform(doc *ast.Document, reader text.Reader, _ parser.Context) {
125
+	if t.opts == nil {
126
+		return
127
+	}
128
+	source := reader.Source()
129
+	_ = ast.Walk(doc, func(n ast.Node, entering bool) (ast.WalkStatus, error) {
130
+		if !entering {
131
+			return ast.WalkContinue, nil
132
+		}
133
+		// Skip subtrees that should never be linkified.
134
+		switch n.(type) {
135
+		case *ast.CodeSpan, *ast.AutoLink, *ast.Link, *ast.Image,
136
+			*ast.FencedCodeBlock, *ast.CodeBlock, *ast.RawHTML, *ast.HTMLBlock:
137
+			return ast.WalkSkipChildren, nil
138
+		}
139
+		txt, ok := n.(*ast.Text)
140
+		if !ok {
141
+			return ast.WalkContinue, nil
142
+		}
143
+		t.replaceText(txt, source)
144
+		return ast.WalkContinue, nil
145
+	})
146
+}
147
+
148
+// replaceText finds matches in the segment of `txt` and inserts
149
+// new sibling nodes (string runs + links) before the original text;
150
+// the original text is removed once everything's stitched in.
151
+func (t *transformer) replaceText(txt *ast.Text, source []byte) {
152
+	body := txt.Segment.Value(source)
153
+	matches := reCombined.FindAllSubmatchIndex(body, -1)
154
+	if len(matches) == 0 {
155
+		return
156
+	}
157
+	parent := txt.Parent()
158
+	if parent == nil {
159
+		return
160
+	}
161
+
162
+	cursor := 0
163
+	for _, m := range matches {
164
+		matchStart, matchEnd := m[0], m[1]
165
+
166
+		// Determine which alternation captured + where the visible
167
+		// content starts (excluding the regex-consumed boundary
168
+		// char, if any).
169
+		var (
170
+			isCrossRepo = m[2] >= 0
171
+			isSameRepo  = m[8] >= 0
172
+			isMention   = m[10] >= 0
173
+			isCommit    = m[12] >= 0
174
+			isEmoji     = m[14] >= 0
175
+		)
176
+		var contentStart int
177
+		switch {
178
+		case isCrossRepo:
179
+			contentStart = m[2]
180
+		case isSameRepo:
181
+			contentStart = m[8] - 1 // include `#`
182
+		case isMention:
183
+			contentStart = m[10] - 1 // include `@`
184
+		case isCommit:
185
+			contentStart = m[12]
186
+		case isEmoji:
187
+			contentStart = m[14] - 1 // include leading `:`
188
+		}
189
+
190
+		// Emit (a) any text between the previous cursor and the
191
+		// match start, then (b) the consumed-but-not-content
192
+		// boundary char (when contentStart > matchStart). Both into
193
+		// the parent before the original text node.
194
+		if matchStart > cursor {
195
+			t.insertText(parent, txt, body[cursor:matchStart])
196
+		}
197
+		if contentStart > matchStart {
198
+			t.insertText(parent, txt, body[matchStart:contentStart])
199
+		}
200
+
201
+		// Now emit the resolved (or fallback-plain) match content.
202
+		display := body[contentStart:matchEnd]
203
+		switch {
204
+		case isCrossRepo:
205
+			owner := string(body[m[2]:m[3]])
206
+			repo := string(body[m[4]:m[5]])
207
+			numStr := string(body[m[6]:m[7]])
208
+			if !t.appendIssueLink(parent, txt, owner, repo, numStr, display) {
209
+				t.insertText(parent, txt, display)
210
+			}
211
+		case isSameRepo:
212
+			numStr := string(body[m[8]:m[9]])
213
+			if !t.appendIssueLink(parent, txt, "", "", numStr, display) {
214
+				t.insertText(parent, txt, display)
215
+			}
216
+		case isMention:
217
+			name := string(body[m[10]:m[11]])
218
+			if !t.appendMentionLink(parent, txt, name, display) {
219
+				t.insertText(parent, txt, display)
220
+			}
221
+		case isCommit:
222
+			sha := string(body[m[12]:m[13]])
223
+			if !t.appendCommitLink(parent, txt, sha, display) {
224
+				t.insertText(parent, txt, display)
225
+			}
226
+		case isEmoji:
227
+			name := string(body[m[14]:m[15]])
228
+			if uni, ok := lookupEmoji(name); ok {
229
+				t.insertText(parent, txt, []byte(uni))
230
+			} else {
231
+				t.insertText(parent, txt, display)
232
+			}
233
+		}
234
+		cursor = matchEnd
235
+	}
236
+	// Trailing text after the last match.
237
+	if cursor < len(body) {
238
+		t.insertText(parent, txt, body[cursor:])
239
+	}
240
+	parent.RemoveChild(parent, txt)
241
+}
242
+
243
+// insertText appends a string node before the original text node
244
+// (which is removed at the end of replaceText).
245
+func (t *transformer) insertText(parent, before ast.Node, b []byte) {
246
+	if len(b) == 0 {
247
+		return
248
+	}
249
+	s := ast.NewString(append([]byte(nil), b...))
250
+	parent.InsertBefore(parent, before, s)
251
+}
252
+
253
+// appendIssueLink resolves an issue/PR ref and inserts a Link node.
254
+// `display` is the visible text the user typed (e.g. "#42" or
255
+// "alice/proj#5"). Returns false when the resolver declines (in
256
+// which case the caller renders the display text as plain text —
257
+// no link, no existence leak).
258
+func (t *transformer) appendIssueLink(parent, before ast.Node, owner, repo, numStr string, display []byte) bool {
259
+	if t.opts.Resolvers.Issue == nil {
260
+		return false
261
+	}
262
+	num, err := strconv.ParseInt(numStr, 10, 64)
263
+	if err != nil {
264
+		return false
265
+	}
266
+	href, ok := t.opts.Resolvers.Issue(t.opts.Ctx, owner, repo, num, t.opts.ViewerUserID)
267
+	if !ok {
268
+		return false
269
+	}
270
+	link := ast.NewLink()
271
+	link.Destination = []byte(href)
272
+	link.AppendChild(link, ast.NewString(append([]byte(nil), display...)))
273
+	parent.InsertBefore(parent, before, link)
274
+
275
+	if t.opts.Refs != nil {
276
+		*t.opts.Refs = append(*t.opts.Refs, Ref{
277
+			Kind:   "issue",
278
+			Owner:  owner,
279
+			Repo:   repo,
280
+			Number: num,
281
+			Href:   href,
282
+		})
283
+	}
284
+	return true
285
+}
286
+
287
+// appendMentionLink resolves a @username and inserts a Link node.
288
+func (t *transformer) appendMentionLink(parent, before ast.Node, username string, display []byte) bool {
289
+	if t.opts.Resolvers.User == nil {
290
+		return false
291
+	}
292
+	href, ok := t.opts.Resolvers.User(t.opts.Ctx, username)
293
+	if !ok {
294
+		return false
295
+	}
296
+	link := ast.NewLink()
297
+	link.Destination = []byte(href)
298
+	link.AppendChild(link, ast.NewString(append([]byte(nil), display...)))
299
+	parent.InsertBefore(parent, before, link)
300
+	if t.opts.Mentions != nil {
301
+		*t.opts.Mentions = append(*t.opts.Mentions, Mention{
302
+			Username: username,
303
+			Href:     href,
304
+		})
305
+	}
306
+	return true
307
+}
308
+
309
+// appendCommitLink resolves a commit SHA prefix in the current repo.
310
+func (t *transformer) appendCommitLink(parent, before ast.Node, shaPrefix string, display []byte) bool {
311
+	if t.opts.Resolvers.Commit == nil || t.opts.RepoOwner == "" || t.opts.RepoName == "" {
312
+		return false
313
+	}
314
+	href, full, ok := t.opts.Resolvers.Commit(t.opts.Ctx, t.opts.RepoOwner, t.opts.RepoName, shaPrefix)
315
+	if !ok {
316
+		return false
317
+	}
318
+	link := ast.NewLink()
319
+	link.Destination = []byte(href)
320
+	// Display the SHA as <code>; preserve the user's typed length.
321
+	codeText := append([]byte(nil), display...)
322
+	codeText = bytes.TrimSpace(codeText)
323
+	link.AppendChild(link, ast.NewCodeSpan())
324
+	cs := link.LastChild().(*ast.CodeSpan)
325
+	cs.AppendChild(cs, ast.NewString(codeText))
326
+	parent.InsertBefore(parent, before, link)
327
+
328
+	if t.opts.Refs != nil {
329
+		*t.opts.Refs = append(*t.opts.Refs, Ref{
330
+			Kind:    "commit",
331
+			FullSHA: full,
332
+			Href:    href,
333
+		})
334
+	}
335
+	return true
336
+}
337
+
338
+// trimLeadingNonWord drops the leading boundary char(s) — used when
339
+// a same-repo or mention token is rendered as plain text fallback.
340
+func trimLeadingNonWord(b []byte) []byte {
341
+	for len(b) > 0 && !isWordByte(b[0]) {
342
+		b = b[1:]
343
+	}
344
+	return b
345
+}
346
+
347
// isWordByte reports whether c is an ASCII letter, an ASCII digit, or
// an underscore.
func isWordByte(c byte) bool {
	switch {
	case 'a' <= c && c <= 'z',
		'A' <= c && c <= 'Z',
		'0' <= c && c <= '9':
		return true
	}
	return c == '_'
}
350
+
351
+// silence unused-import warnings in a stripped build.
352
+var (
353
+	_ = strings.Builder{}
354
+	_ = trimLeadingNonWord
355
+)
internal/markdown/markdown.goadded
@@ -0,0 +1,73 @@
1
+// SPDX-License-Identifier: AGPL-3.0-or-later
2
+
3
+// Package markdown is shithub's canonical markdown rendering pipeline.
4
+//
5
+// One entry point: `Render(ctx, source, opts) (html, refs, mentions, err)`.
6
+// Every comment, issue/PR body, README, and any future surface that
7
+// takes user-authored markdown flows through here. No other code in
8
+// the tree imports goldmark or bluemonday directly; a lint guard at
9
+// `scripts/lint-markdown-boundary.sh` enforces that boundary.
10
+//
11
+// What's supported (CommonMark + a curated GFM set):
12
+//
13
+//   - Headings, paragraphs, lists, blockquotes, code blocks (fenced + indented).
14
+//   - GFM tables, strikethrough, autolinks, task lists.
15
+//   - `@username` mentions — resolved via Options.Resolvers.User.
16
+//   - `#N` and `owner/repo#N` references — resolved via
17
+//     Options.Resolvers.Issue, gated by viewer visibility.
18
+//   - Emoji shortcodes (`:smile:`) — curated set in `extensions/emoji.go`.
19
+//   - `<details>` / `<summary>`, `<kbd>`, `<sup>`, `<sub>`,
20
+//     task-list checkboxes (output by Goldmark with `disabled`).
21
+//   - Code-block class allowlist: `language-*` (Chroma uses these).
22
+//
23
+// What's deliberately not supported:
24
+//
25
+//   - Raw HTML beyond the strict allowlist (we do NOT match GitHub's
26
+//     loose HTML acceptance — we err safer).
27
+//   - `data:` URIs (any flavor; documented in docs/markdown.md).
28
+//   - `javascript:` / `vbscript:` URLs (rejected by sanitizer).
29
+//   - GFM Footnotes (deferred).
30
+//   - Math (KaTeX), Mermaid, embedded media (post-MVP).
31
+//
32
+// Pipeline-version contract:
33
+//
34
+//   - `Version` (in version.go) stamps every render.
35
+//   - Callers store the version alongside cached HTML.
36
+//   - On read, callers compare the stored version to `Version`; if
37
+//     they differ, the cache is stale and the caller re-renders. We
38
+//     never run a one-shot "re-render every comment" job — lazy.
39
+//
40
+// Performance budget:
41
+//
42
+//   - 50 KiB body, full extensions: <30 ms p99 on MVP hardware.
43
+//   - Inputs above MaxRenderInputBytes are rejected up-front; callers
44
+//     enforce the matching cap at the API layer.
45
+package markdown
46
+
47
+// MaxRenderInputBytes caps the input body. Comments / bodies are
48
+// rejected at the API layer at 64 KiB or 256 KiB depending on
49
+// surface; this is the renderer's defensive fallback.
50
+const MaxRenderInputBytes = 1 << 20 // 1 MiB
51
+
52
+// Ref is one resolved reference produced during rendering. The
53
+// caller uses these for downstream notification fan-out (S29) and
54
+// for the issue_references index (S21).
55
+type Ref struct {
56
+	// Kind is "issue" or "commit" for now.
57
+	Kind string
58
+	// Same-repo refs leave Owner/Repo empty.
59
+	Owner string
60
+	Repo  string
61
+	// Number is set for issue refs; FullSHA for commit refs.
62
+	Number  int64
63
+	FullSHA string
64
+	// Href is the resolved URL the renderer wrote into the document.
65
+	Href string
66
+}
67
+
68
+// Mention is one resolved @username mention. S29's fan-out consumes
69
+// the deduplicated list returned by Render.
70
+type Mention struct {
71
+	Username string
72
+	Href     string
73
+}
internal/markdown/markdown_test.goadded
@@ -0,0 +1,336 @@
1
+// SPDX-License-Identifier: AGPL-3.0-or-later
2
+
3
+package markdown
4
+
5
+import (
6
+	"context"
7
+	"strings"
8
+	"testing"
9
+)
10
+
11
+// TestRender_HostileInputs is the XSS-vector cheatsheet. Every
12
+// fixture is a markdown body that *attempts* to inject executable
13
+// JS through a different vector. The pass condition: the rendered
14
+// HTML contains no `<script` tag, no `javascript:` URL, no event
15
+// handler attribute (`on*`), and no `data:` URI.
16
+//
17
+// Add new vectors here when a CVE / advisory lands in goldmark or
18
+// bluemonday — they're cheap to keep.
19
+func TestRender_HostileInputs(t *testing.T) {
20
+	t.Parallel()
21
+	vectors := []string{
22
+		// Direct script tag.
23
+		`<script>alert(1)</script>`,
24
+		`<SCRIPT>alert(1)</SCRIPT>`,
25
+		`<script src="//evil.com/x.js"></script>`,
26
+		// Inline event handlers.
27
+		`<img src="x" onerror="alert(1)">`,
28
+		`<img src=x onerror=alert(1)>`,
29
+		`<a onmouseover="alert(1)">x</a>`,
30
+		`<body onload="alert(1)">`,
31
+		// Style with expressions.
32
+		`<style>body{background:url("javascript:alert(1)")}</style>`,
33
+		`<div style="background:url(javascript:alert(1))">x</div>`,
34
+		// javascript: links.
35
+		`[click](javascript:alert(1))`,
36
+		`<a href="javascript:alert(1)">x</a>`,
37
+		`<a href="JaVaScRiPt:alert(1)">x</a>`,
38
+		`[click](JAVASCRIPT:alert(1))`,
39
+		// data: URIs (we disallow even data:image).
40
+		`<img src="data:image/svg+xml;base64,PHN2Zz4=">`,
41
+		`[x](data:text/html,<script>alert(1)</script>)`,
42
+		// vbscript:.
43
+		`<a href="vbscript:msgbox(1)">x</a>`,
44
+		// SVG-embedded scripts.
45
+		`<svg><script>alert(1)</script></svg>`,
46
+		`<svg onload="alert(1)"></svg>`,
47
+		// iframes.
48
+		`<iframe src="//evil.com"></iframe>`,
49
+		`<iframe srcdoc="<script>alert(1)</script>"></iframe>`,
50
+		// HTML in markdown link text doesn't escape sanitizer.
51
+		`[<script>alert(1)</script>](https://example.com)`,
52
+		// Mutation XSS via mismatched quotes.
53
+		`<a href="x"onmouseover="alert(1)">x</a>`,
54
+		// Encoded payloads.
55
+		`<a href="&#x6A;avascript:alert(1)">x</a>`,
56
+		`<a href="&#106;avascript:alert(1)">x</a>`,
57
+		// Backticked code-like content shouldn't escape.
58
+		"`<script>alert(1)</script>`",
59
+		// Embedded in autolinks.
60
+		`<javascript:alert(1)>`,
61
+		// Object/embed.
62
+		`<object data="x.swf"></object>`,
63
+		`<embed src="x.swf">`,
64
+		// Form/button with formaction.
65
+		`<form><button formaction="javascript:alert(1)">x</button></form>`,
66
+		// Meta refresh.
67
+		`<meta http-equiv="refresh" content="0; url=javascript:alert(1)">`,
68
+		// Base href hijack.
69
+		`<base href="javascript:">`,
70
+		// MathML / annotation.
71
+		`<math><annotation-xml encoding="text/html"><script>alert(1)</script></annotation-xml></math>`,
72
+		// CSS expression (legacy IE).
73
+		`<div style="width: expression(alert(1))">x</div>`,
74
+		// Nested fenced code with a script.
75
+		"```\n<script>alert(1)</script>\n```",
76
+		// Markdown link href with newlines.
77
+		"[x](\njavascript:alert(1))",
78
+		// Image with javascript:.
79
+		`![x](javascript:alert(1))`,
80
+		// HTML entities in URI.
81
+		`[x](java&#0000115;cript:alert(1))`,
82
+		// Hex / decimal entities in href attribute.
83
+		`<a href="javasc&#x72;ipt:alert(1)">x</a>`,
84
+		// Tab/newline obfuscation.
85
+		"<a href=\"java\tscript:alert(1)\">x</a>",
86
+		"<a href=\"java\nscript:alert(1)\">x</a>",
87
+		// Polyglot HTML+SVG.
88
+		`<svg/onload=alert(1)>`,
89
+		// Anchor with target=_blank but no rel (we want rel auto-set).
90
+		`<a href="https://evil.com" target="_blank">x</a>`,
91
+	}
92
+	for i, src := range vectors {
93
+		out, _, _, err := Render(context.Background(), []byte(src), Options{})
94
+		if err != nil {
95
+			t.Fatalf("vector %d render error: %v", i, err)
96
+		}
97
+		// Lower-case for case-insensitive substring search. We
98
+		// distinguish "executable surface" from "harmless text".
99
+		// Plain-text "javascript:" in prose is safe; "javascript:"
100
+		// inside href/src is an XSS — guard the latter shape only.
101
+		s := strings.ToLower(string(out))
102
+		for _, bad := range []string{
103
+			"<script", "</script>",
104
+			`href="javascript:`, `href='javascript:`,
105
+			`src="javascript:`, `src='javascript:`,
106
+			`href="vbscript:`, `src="vbscript:`,
107
+			`href="data:`, `src="data:text`, `src="data:image`,
108
+			" onerror=", " onload=", " onclick=", " onmouseover=",
109
+			"<iframe", "<object", "<embed",
110
+			"<style", "<base ", "<meta ",
111
+			"<annotation-xml", "expression(",
112
+		} {
113
+			if strings.Contains(s, bad) {
114
+				t.Errorf("vector %d (%q): rendered HTML contains %q\nout=%q", i, src, bad, out)
115
+			}
116
+		}
117
+	}
118
+}
119
+
120
+// TestRender_AllowsSafeHTML ensures the strict policy doesn't strip
121
+// `<details>`, `<summary>`, `<kbd>`, `<sup>`, `<sub>`, task-list
122
+// checkboxes, language-* class on code blocks, or auto-heading IDs.
123
+func TestRender_AllowsSafeHTML(t *testing.T) {
124
+	t.Parallel()
125
+	cases := []struct {
126
+		name      string
127
+		src       string
128
+		mustContain []string
129
+	}{
130
+		{
131
+			"details + summary",
132
+			"<details><summary>click</summary>secret</details>",
133
+			[]string{"<details>", "<summary>", "click", "secret"},
134
+		},
135
+		{
136
+			"kbd",
137
+			"press <kbd>Ctrl</kbd>+<kbd>C</kbd>",
138
+			[]string{"<kbd>Ctrl</kbd>", "<kbd>C</kbd>"},
139
+		},
140
+		{
141
+			"sup/sub",
142
+			"x<sup>2</sup> + y<sub>i</sub>",
143
+			[]string{"<sup>2</sup>", "<sub>i</sub>"},
144
+		},
145
+		{
146
+			"task list",
147
+			"- [x] done\n- [ ] not yet\n",
148
+			[]string{"<input", "checkbox", "disabled"},
149
+		},
150
+		{
151
+			"fenced code with language",
152
+			"```go\nfmt.Println(\"hi\")\n```",
153
+			[]string{`class="language-go"`},
154
+		},
155
+		{
156
+			"heading anchor id",
157
+			"# Hello world",
158
+			[]string{`id="hello-world"`},
159
+		},
160
+		{
161
+			"GFM table",
162
+			"| a | b |\n|---|---|\n| 1 | 2 |\n",
163
+			[]string{"<table>", "<th>a</th>", "<td>1</td>"},
164
+		},
165
+		{
166
+			"strikethrough",
167
+			"~~obsolete~~",
168
+			[]string{"<del>obsolete</del>"},
169
+		},
170
+		{
171
+			"autolink",
172
+			"https://example.com",
173
+			[]string{`href="https://example.com"`},
174
+		},
175
+	}
176
+	for _, c := range cases {
177
+		c := c
178
+		t.Run(c.name, func(t *testing.T) {
179
+			t.Parallel()
180
+			out, _, _, err := Render(context.Background(), []byte(c.src), Options{})
181
+			if err != nil {
182
+				t.Fatalf("render: %v", err)
183
+			}
184
+			s := string(out)
185
+			for _, want := range c.mustContain {
186
+				if !strings.Contains(s, want) {
187
+					t.Errorf("expected %q in output, got %q", want, s)
188
+				}
189
+			}
190
+		})
191
+	}
192
+}
193
+
194
+// TestRender_MentionResolution checks that @user resolves when the
195
+// resolver returns ok and stays plain text otherwise.
196
+func TestRender_MentionResolution(t *testing.T) {
197
+	t.Parallel()
198
+	resolver := func(_ context.Context, name string) (string, bool) {
199
+		if name == "alice" {
200
+			return "/alice", true
201
+		}
202
+		return "", false
203
+	}
204
+	out, _, mentions, err := Render(context.Background(), []byte("hi @alice and @bob"), Options{
205
+		Resolvers: Resolvers{User: resolver},
206
+	})
207
+	if err != nil {
208
+		t.Fatalf("render: %v", err)
209
+	}
210
+	s := string(out)
211
+	if !strings.Contains(s, `href="/alice"`) {
212
+		t.Errorf("expected @alice link, got %q", s)
213
+	}
214
+	if strings.Contains(s, `href="/bob"`) {
215
+		t.Errorf("@bob should not link, got %q", s)
216
+	}
217
+	if len(mentions) != 1 || mentions[0].Username != "alice" {
218
+		t.Errorf("expected 1 mention (alice), got %v", mentions)
219
+	}
220
+}
221
+
222
+// TestRender_IssueRefResolution checks both same-repo and cross-repo
223
+// refs, and that an unresolvable ref renders as plain text (no link).
224
+func TestRender_IssueRefResolution(t *testing.T) {
225
+	t.Parallel()
226
+	resolver := func(_ context.Context, owner, name string, num int64, _ int64) (string, bool) {
227
+		// Same-repo refs leave owner+name empty.
228
+		if owner == "" && name == "" && num == 7 {
229
+			return "/o/r/issues/7", true
230
+		}
231
+		if owner == "alice" && name == "proj" && num == 3 {
232
+			return "/alice/proj/issues/3", true
233
+		}
234
+		return "", false
235
+	}
236
+	out, refs, _, err := Render(context.Background(), []byte("see #7 and alice/proj#3, but not bob/x#9"), Options{
237
+		Resolvers: Resolvers{Issue: resolver},
238
+	})
239
+	if err != nil {
240
+		t.Fatalf("render: %v", err)
241
+	}
242
+	s := string(out)
243
+	if !strings.Contains(s, `href="/o/r/issues/7"`) {
244
+		t.Errorf("expected #7 link, got %q", s)
245
+	}
246
+	if !strings.Contains(s, `href="/alice/proj/issues/3"`) {
247
+		t.Errorf("expected alice/proj#3 link, got %q", s)
248
+	}
249
+	if strings.Contains(s, `href="/bob/x/issues/9"`) {
250
+		t.Errorf("bob/x#9 should not link, got %q", s)
251
+	}
252
+	if len(refs) != 2 {
253
+		t.Errorf("expected 2 refs, got %v", refs)
254
+	}
255
+}
256
+
257
+// TestRender_RefsInsideCodeAreInert confirms that #N inside inline
258
+// code or fenced code stays as text.
259
+func TestRender_RefsInsideCodeAreInert(t *testing.T) {
260
+	t.Parallel()
261
+	resolver := func(_ context.Context, owner, name string, num int64, _ int64) (string, bool) {
262
+		return "/should/not/appear", true
263
+	}
264
+	src := "Inline `#7` and:\n\n```\nblock #7 here\n```"
265
+	out, refs, _, err := Render(context.Background(), []byte(src), Options{
266
+		Resolvers: Resolvers{Issue: resolver},
267
+	})
268
+	if err != nil {
269
+		t.Fatalf("render: %v", err)
270
+	}
271
+	if strings.Contains(string(out), "/should/not/appear") {
272
+		t.Errorf("ref leaked into code block: %q", out)
273
+	}
274
+	if len(refs) != 0 {
275
+		t.Errorf("expected 0 refs inside code, got %v", refs)
276
+	}
277
+}
278
+
279
+// TestRender_EmojiShortcodes checks the curated set works.
280
+func TestRender_EmojiShortcodes(t *testing.T) {
281
+	t.Parallel()
282
+	out, _, _, err := Render(context.Background(), []byte("ship it :rocket: :+1: :notrealemoji:"), Options{})
283
+	if err != nil {
284
+		t.Fatalf("render: %v", err)
285
+	}
286
+	s := string(out)
287
+	if !strings.Contains(s, "🚀") {
288
+		t.Errorf("expected rocket emoji in output, got %q", s)
289
+	}
290
+	if !strings.Contains(s, "👍") {
291
+		t.Errorf("expected +1 emoji in output, got %q", s)
292
+	}
293
+	if !strings.Contains(s, ":notrealemoji:") {
294
+		t.Errorf("unknown shortcode should pass through, got %q", s)
295
+	}
296
+}
297
+
298
+// TestRender_InputTooLarge enforces the renderer's defensive cap.
299
+func TestRender_InputTooLarge(t *testing.T) {
300
+	t.Parallel()
301
+	big := make([]byte, MaxRenderInputBytes+1)
302
+	for i := range big {
303
+		big[i] = 'x'
304
+	}
305
+	if _, _, _, err := Render(context.Background(), big, Options{}); err == nil {
306
+		t.Errorf("expected ErrInputTooLarge")
307
+	}
308
+}
309
+
310
+// TestRender_SoftBreakAsBR controls the comment-vs-readme newline
311
+// handling.
312
+func TestRender_SoftBreakAsBR(t *testing.T) {
313
+	t.Parallel()
314
+	src := "line one\nline two\n"
315
+	br, _, _, _ := Render(context.Background(), []byte(src), Options{SoftBreakAsBR: true})
316
+	noBR, _, _, _ := Render(context.Background(), []byte(src), Options{SoftBreakAsBR: false})
317
+	if !strings.Contains(string(br), "<br") {
318
+		t.Errorf("SoftBreakAsBR=true: expected <br>, got %q", br)
319
+	}
320
+	if strings.Contains(string(noBR), "<br") {
321
+		t.Errorf("SoftBreakAsBR=false: should not contain <br>, got %q", noBR)
322
+	}
323
+}
324
+
325
+// TestRender_BackCompatRenderHTML keeps the old shim working so the
326
+// interim S17/S21/S22 callers don't need rewrite during S25.
327
+func TestRender_BackCompatRenderHTML(t *testing.T) {
328
+	t.Parallel()
329
+	html, err := RenderHTML([]byte("**bold** text"))
330
+	if err != nil {
331
+		t.Fatalf("RenderHTML: %v", err)
332
+	}
333
+	if !strings.Contains(html, "<strong>bold</strong>") {
334
+		t.Errorf("expected bold, got %q", html)
335
+	}
336
+}
internal/markdown/opts.goadded
@@ -0,0 +1,62 @@
1
+// SPDX-License-Identifier: AGPL-3.0-or-later
2
+
3
+package markdown
4
+
5
+import "context"
6
+
7
// RepoContext binds a render to a single repository so that `#N`
// references can be resolved against it. The binding is optional:
// with a nil *RepoContext, `#N` is rendered as plain text (the
// "repo-context-less rendering" case in the spec).
type RepoContext struct {
	OwnerUsername string
	RepoName      string
	RepoID        int64
}
15
+
16
// Resolvers wires the package to the rest of the runtime without
// importing usersdb / issuesdb directly (that would create an import
// cycle with packages that themselves render markdown). Each resolver
// is optional; a nil resolver means "render as plain text" for that
// flavor of reference.
//
// All resolvers MUST be visibility-aware and return ok=false when the
// resource isn't visible to the viewer: returning a link to a hidden
// resource leaks its existence. Note that only Issue is handed the
// viewer (Options.ViewerUserID) as an argument. User and Commit have
// no viewer parameter, so an implementation that needs the viewer has
// to capture it when the resolver is built (e.g. a per-request
// closure).
type Resolvers struct {
	// User: @username → "/username" if the user exists, isn't
	// suspended, and (post-S30) is org-team-visible to the viewer.
	User func(ctx context.Context, username string) (href string, ok bool)
	// Issue: (owner, name, number) → "/owner/repo/issues/N".
	// Returns ok=false when the repo isn't visible to viewerUserID or
	// the number wasn't allocated yet.
	Issue func(ctx context.Context, owner, name string, number int64, viewerUserID int64) (href string, ok bool)
	// Commit: short-or-full SHA inside the current repo context →
	// "/owner/repo/commit/<full_sha>", together with the expanded
	// SHA. Only invoked when Options.Repo != nil.
	Commit func(ctx context.Context, repoOwner, repoName, shaPrefix string) (href, fullSHA string, ok bool)
}
37
+
38
+// Options tunes a single Render call. Zero-value Options is valid
39
+// and yields a safe, generic render with no reference resolution.
40
+type Options struct {
41
+	// Repo is the binding for same-repo `#N` resolution. nil means
42
+	// "no repo context"; `#N` renders as plain text.
43
+	Repo *RepoContext
44
+
45
+	// ViewerUserID gates cross-repo `#N` and `@user` resolution.
46
+	// 0 = anonymous viewer.
47
+	ViewerUserID int64
48
+
49
+	// SoftBreakAsBR controls newline-as-<br> rendering:
50
+	//   true  — comment / issue / PR body shape (matches GitHub UI)
51
+	//   false — README / structured-doc shape (preserve markdown semantics)
52
+	SoftBreakAsBR bool
53
+
54
+	// LinkTargetBlank, when true, sets target="_blank" rel="noopener
55
+	// noreferrer" on autolinks + reference links. Default off; READMEs
56
+	// keep links in-page so the user doesn't lose context.
57
+	LinkTargetBlank bool
58
+
59
+	// Resolvers wires reference resolution. nil resolvers in the
60
+	// struct mean "render that flavor as plain text".
61
+	Resolvers Resolvers
62
+}
internal/markdown/render.goadded
@@ -0,0 +1,130 @@
1
+// SPDX-License-Identifier: AGPL-3.0-or-later
2
+
3
+package markdown
4
+
5
+import (
6
+	"bytes"
7
+	"context"
8
+	"errors"
9
+
10
+	"github.com/yuin/goldmark"
11
+	"github.com/yuin/goldmark/extension"
12
+	"github.com/yuin/goldmark/parser"
13
+	"github.com/yuin/goldmark/renderer"
14
+	gmhtml "github.com/yuin/goldmark/renderer/html"
15
+
16
+	"github.com/tenseleyFlow/shithub/internal/markdown/extensions"
17
+)
18
+
19
// ErrInputTooLarge is the sentinel Render returns for a source longer
// than MaxRenderInputBytes. The API layer is expected to reject such
// input before it ever reaches Render; this check is only the
// renderer's defensive fallback.
var ErrInputTooLarge = errors.New("markdown: source exceeds MaxRenderInputBytes")
23
+
24
+// Render is the canonical markdown entry point. The output bytes
25
+// have already been Goldmark-rendered AND bluemonday-sanitized;
26
+// callers can wrap them as `template.HTML` and inject into a
27
+// template directly.
28
+//
29
+// `refs` and `mentions` carry the resolved cross-reference/mention
30
+// state for downstream consumers (S29 notification fan-out, S21
31
+// issue_references index). The order matches first-occurrence in
32
+// source.
33
+//
34
+// A nil ctx is fine; resolvers receive context.Background() in
35
+// that case.
36
+func Render(ctx context.Context, src []byte, opts Options) (rendered []byte, refs []Ref, mentions []Mention, err error) {
37
+	if len(src) == 0 {
38
+		return nil, nil, nil, nil
39
+	}
40
+	if len(src) > MaxRenderInputBytes {
41
+		return nil, nil, nil, ErrInputTooLarge
42
+	}
43
+	if ctx == nil {
44
+		ctx = context.Background()
45
+	}
46
+
47
+	// Build a fresh Goldmark for each render so the per-call
48
+	// extension Options can plug in without races. The build is
49
+	// cheap (~µs) and removes any cross-render contamination of
50
+	// the AST transformer state.
51
+	xRefs := []extensions.Ref{}
52
+	xMentions := []extensions.Mention{}
53
+	xOpts := &extensions.Options{
54
+		Ctx:          ctx,
55
+		ViewerUserID: opts.ViewerUserID,
56
+		Refs:         &xRefs,
57
+		Mentions:     &xMentions,
58
+		Resolvers: extensions.Resolvers{
59
+			User:   opts.Resolvers.User,
60
+			Issue:  opts.Resolvers.Issue,
61
+			Commit: opts.Resolvers.Commit,
62
+		},
63
+	}
64
+	if opts.Repo != nil {
65
+		xOpts.RepoOwner = opts.Repo.OwnerUsername
66
+		xOpts.RepoName = opts.Repo.RepoName
67
+	}
68
+
69
+	// gmhtml.WithUnsafe lets raw HTML through the parser → bluemonday
70
+	// scrubs at the sanitizer pass. Without this, every <details>,
71
+	// <kbd>, <sup>, etc. that users type would be HTML-escaped before
72
+	// the sanitizer ever sees them. The strict policy in sanitize.go
73
+	// is the security boundary.
74
+	htmlOpts := []renderer.Option{gmhtml.WithXHTML(), gmhtml.WithUnsafe()}
75
+	if opts.SoftBreakAsBR {
76
+		htmlOpts = append(htmlOpts, gmhtml.WithHardWraps())
77
+	}
78
+
79
+	gm := goldmark.New(
80
+		goldmark.WithExtensions(
81
+			extension.GFM, // tables, strikethrough, autolinks, task list
82
+			extensions.New(xOpts),
83
+		),
84
+		goldmark.WithParserOptions(parser.WithAutoHeadingID()),
85
+		goldmark.WithRendererOptions(htmlOpts...),
86
+	)
87
+
88
+	var buf bytes.Buffer
89
+	if err := gm.Convert(src, &buf); err != nil {
90
+		return nil, nil, nil, err
91
+	}
92
+	clean := sanitizeBytes(buf.Bytes())
93
+
94
+	// Convert extension-local types into the public types.
95
+	if len(xRefs) > 0 {
96
+		refs = make([]Ref, len(xRefs))
97
+		for i, r := range xRefs {
98
+			refs[i] = Ref{
99
+				Kind:    r.Kind,
100
+				Owner:   r.Owner,
101
+				Repo:    r.Repo,
102
+				Number:  r.Number,
103
+				FullSHA: r.FullSHA,
104
+				Href:    r.Href,
105
+			}
106
+		}
107
+	}
108
+	if len(xMentions) > 0 {
109
+		mentions = make([]Mention, len(xMentions))
110
+		for i, m := range xMentions {
111
+			mentions[i] = Mention{Username: m.Username, Href: m.Href}
112
+		}
113
+	}
114
+	return clean, refs, mentions, nil
115
+}
116
+
117
+// RenderHTML is the back-compat shim for callers that don't need
118
+// resolved refs/mentions. SoftBreakAsBR defaults to true (matches
119
+// the comment-style legacy behavior of the S17 helper this replaces).
120
+//
121
+// The signature matches `internal/repos/markdown.RenderHTML` so
122
+// existing callers can swap the import path without code changes.
123
+// New callers should prefer Render directly.
124
+func RenderHTML(src []byte) (string, error) {
125
+	out, _, _, err := Render(context.Background(), src, Options{SoftBreakAsBR: true})
126
+	if err != nil {
127
+		return "", err
128
+	}
129
+	return string(out), nil
130
+}
internal/markdown/sanitize.goadded
@@ -0,0 +1,83 @@
1
+// SPDX-License-Identifier: AGPL-3.0-or-later
2
+
3
+package markdown
4
+
5
+import (
6
+	"regexp"
7
+	"sync"
8
+
9
+	"github.com/microcosm-cc/bluemonday"
10
+)
11
+
12
+// sanitizer is the package-private bluemonday policy. Built once;
13
+// safe for concurrent use.
14
+//
15
+// Threat model:
16
+//
17
+//   - Goldmark with HTML rendering off escapes user-injected raw
18
+//     HTML at parse time, so the sanitizer is *defense in depth* —
19
+//     anything that reaches it should already be Goldmark-emitted
20
+//     HTML or our own AST-extension output.
21
+//   - The strict scheme allowlist (`http`, `https`, `mailto`) blocks
22
+//     `javascript:`, `data:`, `vbscript:` URIs entirely.
23
+//   - We diverge from GitHub by *not* allowing `data:image/...`. If
24
+//     a user wants an inline image, repo-relative paths work via
25
+//     /raw/. Documented in docs/markdown.md.
26
+var sanitizer = func() *bluemonday.Policy {
27
+	p := bluemonday.UGCPolicy()
28
+
29
+	// Headings keep their auto-generated id so anchor links work.
30
+	p.AllowAttrs("id").OnElements("h1", "h2", "h3", "h4", "h5", "h6")
31
+
32
+	// Code-block class allowlist for Chroma (`language-foo`). The
33
+	// SpaceSeparatedTokens matcher is bluemonday-built-in; we
34
+	// further constrain via the regex below so only `language-*` and
35
+	// chroma's own ancillary classes pass.
36
+	p.AllowAttrs("class").Matching(reCodeClass).OnElements("code", "pre", "span")
37
+
38
+	// GFM task lists: Goldmark emits `<input checked="" disabled=""
39
+	// type="checkbox" />` inside `<li>`. UGCPolicy doesn't allow
40
+	// input by default; whitelist the disabled-checkbox shape only.
41
+	// `type` is matched against "checkbox"; `disabled` and `checked`
42
+	// are HTML boolean attrs (value commonly empty), so we don't
43
+	// constrain the value — presence is the signal.
44
+	p.AllowAttrs("type").Matching(regexp.MustCompile(`^checkbox$`)).OnElements("input")
45
+	p.AllowAttrs("disabled", "checked").OnElements("input")
46
+
47
+	// Folded sections + keyboard markers. Common in READMEs.
48
+	p.AllowElements("details", "summary", "kbd", "sup", "sub")
49
+
50
+	// Mention / ref / commit anchors emitted by our extensions carry
51
+	// these classes for styling. The base UGC policy already allows
52
+	// <a> with href + rel; we just need the class allowlist above to
53
+	// cover `shithub-mention`, `shithub-ref`, `shithub-commit`.
54
+
55
+	// Hard-restrict URL schemes. UGCPolicy already restricts schemes
56
+	// on <a>, but we tighten further: drop ftp, drop data:, leave
57
+	// only http(s) + mailto.
58
+	p.AllowURLSchemes("http", "https", "mailto")
59
+	// Image schemes — UGC allows http/https only by default; we keep
60
+	// that. No data: anywhere.
61
+	p.AllowImages()
62
+
63
+	// rel="noopener noreferrer" auto-added when target="_blank" is
64
+	// set. We only set target via opts on autolinks; let bluemonday
65
+	// keep its default rel-handling.
66
+	p.RequireNoFollowOnLinks(true)
67
+	return p
68
+}()
69
+
70
// codeClassToken is the pattern for a single class name permitted on
// code/pre/span: a `language-*` tag or one of chroma's own classes.
const codeClassToken = `(?:language-[A-Za-z0-9_+\-]+|chroma|chroma-[a-zA-Z]+|nl|ln|line|hl)`

// reCodeClass matches a complete class attribute value made of one or
// more whitespace-separated codeClassToken names and nothing else.
var reCodeClass = regexp.MustCompile(`^` + codeClassToken + `(?:\s+` + codeClassToken + `)*$`)
71
+
72
+// sanitizeBytes is the hot-path entry the Render pipeline uses. The
73
+// bluemonday Policy is itself goroutine-safe; the once.Do here keeps
74
+// the regex compilation cost off the first request path.
75
+var sanitizerOnce sync.Once
76
+
77
+func sanitizeBytes(in []byte) []byte {
78
+	sanitizerOnce.Do(func() {
79
+		// Touch the package-level sanitizer to ensure it's built.
80
+		_ = sanitizer
81
+	})
82
+	return sanitizer.SanitizeBytes(in)
83
+}
internal/markdown/version.goadded
@@ -0,0 +1,19 @@
1
+// SPDX-License-Identifier: AGPL-3.0-or-later
2
+
3
+package markdown
4
+
5
// Version is the canonical pipeline-version stamp. It is written to
// `body_html_cached`-style columns whenever a comment / issue body /
// PR body is rendered, and bumping it invalidates cached HTML lazily
// on read: callers compare the stored `md_pipeline_version` with
// Version and re-render on mismatch.
//
// Bump when:
//   - the sanitizer policy changes (a tag, attribute or scheme is
//     allowed or disallowed);
//   - an AST extension is added or rendered output changes;
//   - a Goldmark / bluemonday major-version upgrade causes output
//     drift.
//
// Do not bump for:
//   - bug fixes that leave rendered HTML unchanged for every input;
//   - performance-only changes.
const Version int32 = 1