Rust · 11383 bytes Raw Blame History
1 //! Mach-O string table reader.
2 //!
3 //! The string table is a contiguous blob of null-terminated bytes pointed at
4 //! by `LC_SYMTAB.stroff / strsize`. Every symbol's `strx` is an offset into
5 //! this blob; the name runs from `strx` up to (not including) the next null.
6 //!
7 //! Assemblers regularly reuse suffix bytes — two symbols whose names share a
8 //! common tail can point at the same sequence. afs-as does this explicitly
9 //! via its suffix-dedup sort. The walker here handles any strx that lands
10 //! between nulls, not just those aligned to the start of a name.
11
12 use std::collections::HashMap;
13
14 use crate::macho::reader::ReadError;
15
16 #[derive(Debug, Clone, PartialEq, Eq)]
17 pub struct StringTable {
18 raw: Vec<u8>,
19 }
20
21 impl StringTable {
22 /// Lift the string table out of a file image, owning a copy for lifetime
23 /// independence from the source buffer. Both `stroff` and `strsize` come
24 /// from `LC_SYMTAB`.
25 pub fn from_file(file_bytes: &[u8], stroff: u32, strsize: u32) -> Result<Self, ReadError> {
26 let start = stroff as usize;
27 let end = start
28 .checked_add(strsize as usize)
29 .ok_or(ReadError::Truncated {
30 need: usize::MAX,
31 have: file_bytes.len(),
32 context: "string table (stroff + strsize overflows)",
33 })?;
34 if end > file_bytes.len() {
35 return Err(ReadError::Truncated {
36 need: end,
37 have: file_bytes.len(),
38 context: "string table",
39 });
40 }
41 Ok(StringTable {
42 raw: file_bytes[start..end].to_vec(),
43 })
44 }
45
46 /// Construct directly from owned bytes — handy for tests and for the
47 /// writer path (Sprint 14 builds a fresh table before emission).
48 pub fn from_bytes(raw: Vec<u8>) -> Self {
49 StringTable { raw }
50 }
51
52 /// Raw underlying bytes; useful for byte-level round-trip tests.
53 pub fn as_bytes(&self) -> &[u8] {
54 &self.raw
55 }
56
57 pub fn len(&self) -> usize {
58 self.raw.len()
59 }
60
61 pub fn is_empty(&self) -> bool {
62 self.raw.is_empty()
63 }
64
65 /// Look up the null-terminated string at offset `strx`. Non-UTF-8 names
66 /// return `ReadError::BadCmdsize` (re-used as the structural-error kind)
67 /// with a contextual `reason`; callers never see lossy replacement bytes.
68 ///
69 /// `strx = 0` returns the empty string (strtabs start with a null).
70 pub fn get(&self, strx: u32) -> Result<&str, ReadError> {
71 let start = strx as usize;
72 if start >= self.raw.len() {
73 return Err(ReadError::BadCmdsize {
74 cmd: 0,
75 cmdsize: 0,
76 at_offset: start,
77 reason: "strx out of bounds",
78 });
79 }
80 let end = start
81 + self.raw[start..]
82 .iter()
83 .position(|&b| b == 0)
84 .ok_or(ReadError::BadCmdsize {
85 cmd: 0,
86 cmdsize: 0,
87 at_offset: start,
88 reason: "unterminated string (no null byte before end)",
89 })?;
90 std::str::from_utf8(&self.raw[start..end]).map_err(|_| ReadError::BadCmdsize {
91 cmd: 0,
92 cmdsize: 0,
93 at_offset: start,
94 reason: "non-UTF-8 bytes in symbol name",
95 })
96 }
97 }
98
99 #[derive(Debug, Clone, Default, PartialEq, Eq)]
100 pub struct StringTableBuilder {
101 roots: Vec<RootString>,
102 offsets: HashMap<String, u32>,
103 }
104
105 #[derive(Debug, Clone, PartialEq, Eq)]
106 struct RootString {
107 name: String,
108 offset: u32,
109 }
110
111 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
112 struct BorrowedRootString<'a> {
113 name: &'a str,
114 offset: u32,
115 }
116
117 impl StringTableBuilder {
118 pub fn new() -> Self {
119 Self::default()
120 }
121
122 pub fn insert(&mut self, name: &str) {
123 self.offsets.entry(name.to_string()).or_insert(0);
124 }
125
126 pub fn build_with_name_offsets<'a, I>(names: I) -> (Vec<u8>, Vec<u32>)
127 where
128 I: IntoIterator<Item = &'a str>,
129 {
130 let mut entries: Vec<_> = names
131 .into_iter()
132 .enumerate()
133 .map(|(index, name)| (name, index))
134 .collect();
135 let mut offsets = vec![0; entries.len()];
136 entries.sort_by(|(lhs, lhs_index), (rhs, rhs_index)| {
137 reverse_suffix_order(lhs, rhs).then_with(|| lhs_index.cmp(rhs_index))
138 });
139
140 let mut raw = vec![0u8];
141 let mut roots = Vec::new();
142 for (name, index) in entries {
143 if let Some(offset) = find_borrowed_suffix_offset(&roots, name) {
144 offsets[index] = offset;
145 continue;
146 }
147
148 let offset = raw.len() as u32;
149 raw.extend_from_slice(name.as_bytes());
150 raw.push(0);
151 roots.push(BorrowedRootString { name, offset });
152 offsets[index] = offset;
153 }
154
155 while !raw.len().is_multiple_of(8) {
156 raw.push(0);
157 }
158 (raw, offsets)
159 }
160
161 pub fn finish(mut self) -> (Vec<u8>, HashMap<String, u32>) {
162 let mut names: Vec<String> = self.offsets.keys().cloned().collect();
163 names.sort_by(|lhs, rhs| reverse_suffix_order(lhs, rhs));
164
165 let mut raw = vec![0u8];
166 for name in names {
167 if let Some(offset) = self.find_suffix_offset(&name) {
168 self.offsets.insert(name, offset);
169 continue;
170 }
171
172 let offset = raw.len() as u32;
173 raw.extend_from_slice(name.as_bytes());
174 raw.push(0);
175 self.roots.push(RootString {
176 name: name.clone(),
177 offset,
178 });
179 self.offsets.insert(name, offset);
180 }
181
182 while !raw.len().is_multiple_of(8) {
183 raw.push(0);
184 }
185 (raw, self.offsets)
186 }
187
188 fn find_suffix_offset(&self, name: &str) -> Option<u32> {
189 if name.is_empty() {
190 return Some(0);
191 }
192 let insert_at = self
193 .roots
194 .partition_point(|root| reverse_suffix_order(&root.name, name).is_lt());
195 let existing = self.roots.get(insert_at.checked_sub(1)?)?;
196 (existing.name.len() >= name.len() && existing.name.ends_with(name))
197 .then(|| existing.offset + (existing.name.len() - name.len()) as u32)
198 }
199 }
200
201 fn find_borrowed_suffix_offset(roots: &[BorrowedRootString<'_>], name: &str) -> Option<u32> {
202 if name.is_empty() {
203 return Some(0);
204 }
205 let insert_at = roots.partition_point(|root| reverse_suffix_order(root.name, name).is_lt());
206 let existing = roots.get(insert_at.checked_sub(1)?)?;
207 (existing.name.len() >= name.len() && existing.name.ends_with(name))
208 .then(|| existing.offset + (existing.name.len() - name.len()) as u32)
209 }
210
211 fn reverse_suffix_order(lhs: &str, rhs: &str) -> std::cmp::Ordering {
212 let mut lhs_rev = lhs.bytes().rev();
213 let mut rhs_rev = rhs.bytes().rev();
214 loop {
215 match (lhs_rev.next(), rhs_rev.next()) {
216 (Some(a), Some(b)) => match a.cmp(&b) {
217 std::cmp::Ordering::Equal => continue,
218 other => return other,
219 },
220 (Some(_), None) => return std::cmp::Ordering::Less,
221 (None, Some(_)) => return std::cmp::Ordering::Greater,
222 (None, None) => return lhs.cmp(rhs),
223 }
224 }
225 }
226
227 #[cfg(test)]
228 mod tests {
229 use super::*;
230
231 fn tbl(bytes: &[u8]) -> StringTable {
232 StringTable::from_bytes(bytes.to_vec())
233 }
234
235 #[test]
236 fn empty_strx_yields_empty_string() {
237 let t = tbl(b"\0");
238 assert_eq!(t.get(0).unwrap(), "");
239 }
240
241 #[test]
242 fn resolves_simple_name() {
243 let t = tbl(b"\0_main\0_helper\0");
244 assert_eq!(t.get(1).unwrap(), "_main");
245 assert_eq!(t.get(7).unwrap(), "_helper");
246 }
247
248 #[test]
249 fn suffix_dedup_overlap() {
250 // strtab contains "_afs_array_sum\0" at offset 1; a symbol named just
251 // "array_sum" (9 chars) points mid-string at offset 6.
252 let mut raw = vec![0u8];
253 raw.extend_from_slice(b"_afs_array_sum\0");
254 let t = tbl(&raw);
255 assert_eq!(t.get(1).unwrap(), "_afs_array_sum");
256 assert_eq!(t.get(6).unwrap(), "array_sum");
257 assert_eq!(t.get(10).unwrap(), "y_sum");
258 }
259
260 #[test]
261 fn out_of_bounds_strx_errors() {
262 let t = tbl(b"\0a\0");
263 let err = t.get(100).unwrap_err();
264 assert!(
265 matches!(err, ReadError::BadCmdsize { reason, .. } if reason.contains("out of bounds"))
266 );
267 }
268
269 #[test]
270 fn unterminated_string_errors() {
271 let t = tbl(b"\0abcdef"); // no trailing null
272 let err = t.get(1).unwrap_err();
273 assert!(
274 matches!(err, ReadError::BadCmdsize { reason, .. } if reason.contains("unterminated"))
275 );
276 }
277
278 #[test]
279 fn non_utf8_bytes_error() {
280 let t = tbl(&[0, 0xff, 0xfe, 0]);
281 let err = t.get(1).unwrap_err();
282 assert!(matches!(err, ReadError::BadCmdsize { reason, .. } if reason.contains("UTF-8")));
283 }
284
285 #[test]
286 fn from_file_bounds_check() {
287 let file = vec![0u8; 32];
288 // stroff + strsize spills off the end.
289 let err = StringTable::from_file(&file, 30, 10).unwrap_err();
290 assert!(matches!(err, ReadError::Truncated { .. }));
291 }
292
293 #[test]
294 fn from_file_copies_range() {
295 let mut file = vec![0u8; 8];
296 file.extend_from_slice(b"\0_main\0");
297 let t = StringTable::from_file(&file, 8, 7).unwrap();
298 assert_eq!(t.as_bytes(), b"\0_main\0");
299 assert_eq!(t.get(1).unwrap(), "_main");
300 }
301
302 #[test]
303 fn builder_dedups_suffix_names() {
304 let mut builder = StringTableBuilder::new();
305 builder.insert("_array_sum");
306 builder.insert("_afs_array_sum");
307
308 let (bytes, offsets) = builder.finish();
309 let table = StringTable::from_bytes(bytes);
310 let afs = offsets["_afs_array_sum"];
311 let array = offsets["_array_sum"];
312
313 assert_eq!(table.get(afs).unwrap(), "_afs_array_sum");
314 assert_eq!(table.get(array).unwrap(), "_array_sum");
315 assert_eq!(array, afs + 4);
316 assert_eq!(table.as_bytes().len() % 8, 0);
317 }
318
319 #[test]
320 fn builder_ignores_same_last_byte_non_suffix_names() {
321 let mut builder = StringTableBuilder::new();
322 builder.insert("_alpha");
323 builder.insert("_beta");
324
325 let (bytes, offsets) = builder.finish();
326 let table = StringTable::from_bytes(bytes);
327
328 assert_eq!(table.get(offsets["_alpha"]).unwrap(), "_alpha");
329 assert_eq!(table.get(offsets["_beta"]).unwrap(), "_beta");
330 }
331
332 #[test]
333 fn builder_returns_offsets_in_input_order_without_cloning_keys() {
334 let names = ["_helper", "_afs_helper", "_helper", ""];
335 let (bytes, offsets) = StringTableBuilder::build_with_name_offsets(names);
336 let table = StringTable::from_bytes(bytes);
337
338 assert_eq!(table.get(offsets[0]).unwrap(), "_helper");
339 assert_eq!(table.get(offsets[1]).unwrap(), "_afs_helper");
340 assert_eq!(table.get(offsets[2]).unwrap(), "_helper");
341 assert_eq!(table.get(offsets[3]).unwrap(), "");
342 assert_eq!(offsets[0], offsets[2]);
343 assert_eq!(offsets[0], offsets[1] + 4);
344 assert_eq!(table.as_bytes().len() % 8, 0);
345 }
346 }
347