| 1 | //! Mach-O string table reader. |
| 2 | //! |
| 3 | //! The string table is a contiguous blob of null-terminated bytes pointed at |
| 4 | //! by `LC_SYMTAB.stroff / strsize`. Every symbol's `strx` is an offset into |
| 5 | //! this blob; the name runs from `strx` up to (not including) the next null. |
| 6 | //! |
| 7 | //! Assemblers regularly reuse suffix bytes — two symbols whose names share a |
| 8 | //! common tail can point at the same sequence. afs-as does this explicitly |
| 9 | //! via its suffix-dedup sort. The walker here handles any strx that lands |
| 10 | //! between nulls, not just those aligned to the start of a name. |
| 11 | |
| 12 | use std::collections::HashMap; |
| 13 | |
| 14 | use crate::macho::reader::ReadError; |
| 15 | |
| 16 | #[derive(Debug, Clone, PartialEq, Eq)] |
| 17 | pub struct StringTable { |
| 18 | raw: Vec<u8>, |
| 19 | } |
| 20 | |
| 21 | impl StringTable { |
| 22 | /// Lift the string table out of a file image, owning a copy for lifetime |
| 23 | /// independence from the source buffer. Both `stroff` and `strsize` come |
| 24 | /// from `LC_SYMTAB`. |
| 25 | pub fn from_file(file_bytes: &[u8], stroff: u32, strsize: u32) -> Result<Self, ReadError> { |
| 26 | let start = stroff as usize; |
| 27 | let end = start |
| 28 | .checked_add(strsize as usize) |
| 29 | .ok_or(ReadError::Truncated { |
| 30 | need: usize::MAX, |
| 31 | have: file_bytes.len(), |
| 32 | context: "string table (stroff + strsize overflows)", |
| 33 | })?; |
| 34 | if end > file_bytes.len() { |
| 35 | return Err(ReadError::Truncated { |
| 36 | need: end, |
| 37 | have: file_bytes.len(), |
| 38 | context: "string table", |
| 39 | }); |
| 40 | } |
| 41 | Ok(StringTable { |
| 42 | raw: file_bytes[start..end].to_vec(), |
| 43 | }) |
| 44 | } |
| 45 | |
| 46 | /// Construct directly from owned bytes — handy for tests and for the |
| 47 | /// writer path (Sprint 14 builds a fresh table before emission). |
| 48 | pub fn from_bytes(raw: Vec<u8>) -> Self { |
| 49 | StringTable { raw } |
| 50 | } |
| 51 | |
| 52 | /// Raw underlying bytes; useful for byte-level round-trip tests. |
| 53 | pub fn as_bytes(&self) -> &[u8] { |
| 54 | &self.raw |
| 55 | } |
| 56 | |
| 57 | pub fn len(&self) -> usize { |
| 58 | self.raw.len() |
| 59 | } |
| 60 | |
| 61 | pub fn is_empty(&self) -> bool { |
| 62 | self.raw.is_empty() |
| 63 | } |
| 64 | |
| 65 | /// Look up the null-terminated string at offset `strx`. Non-UTF-8 names |
| 66 | /// return `ReadError::BadCmdsize` (re-used as the structural-error kind) |
| 67 | /// with a contextual `reason`; callers never see lossy replacement bytes. |
| 68 | /// |
| 69 | /// `strx = 0` returns the empty string (strtabs start with a null). |
| 70 | pub fn get(&self, strx: u32) -> Result<&str, ReadError> { |
| 71 | let start = strx as usize; |
| 72 | if start >= self.raw.len() { |
| 73 | return Err(ReadError::BadCmdsize { |
| 74 | cmd: 0, |
| 75 | cmdsize: 0, |
| 76 | at_offset: start, |
| 77 | reason: "strx out of bounds", |
| 78 | }); |
| 79 | } |
| 80 | let end = start |
| 81 | + self.raw[start..] |
| 82 | .iter() |
| 83 | .position(|&b| b == 0) |
| 84 | .ok_or(ReadError::BadCmdsize { |
| 85 | cmd: 0, |
| 86 | cmdsize: 0, |
| 87 | at_offset: start, |
| 88 | reason: "unterminated string (no null byte before end)", |
| 89 | })?; |
| 90 | std::str::from_utf8(&self.raw[start..end]).map_err(|_| ReadError::BadCmdsize { |
| 91 | cmd: 0, |
| 92 | cmdsize: 0, |
| 93 | at_offset: start, |
| 94 | reason: "non-UTF-8 bytes in symbol name", |
| 95 | }) |
| 96 | } |
| 97 | } |
| 98 | |
| 99 | #[derive(Debug, Clone, Default, PartialEq, Eq)] |
| 100 | pub struct StringTableBuilder { |
| 101 | roots: Vec<RootString>, |
| 102 | offsets: HashMap<String, u32>, |
| 103 | } |
| 104 | |
| 105 | #[derive(Debug, Clone, PartialEq, Eq)] |
| 106 | struct RootString { |
| 107 | name: String, |
| 108 | offset: u32, |
| 109 | } |
| 110 | |
| 111 | #[derive(Debug, Clone, Copy, PartialEq, Eq)] |
| 112 | struct BorrowedRootString<'a> { |
| 113 | name: &'a str, |
| 114 | offset: u32, |
| 115 | } |
| 116 | |
| 117 | impl StringTableBuilder { |
| 118 | pub fn new() -> Self { |
| 119 | Self::default() |
| 120 | } |
| 121 | |
| 122 | pub fn insert(&mut self, name: &str) { |
| 123 | self.offsets.entry(name.to_string()).or_insert(0); |
| 124 | } |
| 125 | |
| 126 | pub fn build_with_name_offsets<'a, I>(names: I) -> (Vec<u8>, Vec<u32>) |
| 127 | where |
| 128 | I: IntoIterator<Item = &'a str>, |
| 129 | { |
| 130 | let mut entries: Vec<_> = names |
| 131 | .into_iter() |
| 132 | .enumerate() |
| 133 | .map(|(index, name)| (name, index)) |
| 134 | .collect(); |
| 135 | let mut offsets = vec![0; entries.len()]; |
| 136 | entries.sort_by(|(lhs, lhs_index), (rhs, rhs_index)| { |
| 137 | reverse_suffix_order(lhs, rhs).then_with(|| lhs_index.cmp(rhs_index)) |
| 138 | }); |
| 139 | |
| 140 | let mut raw = vec![0u8]; |
| 141 | let mut roots = Vec::new(); |
| 142 | for (name, index) in entries { |
| 143 | if let Some(offset) = find_borrowed_suffix_offset(&roots, name) { |
| 144 | offsets[index] = offset; |
| 145 | continue; |
| 146 | } |
| 147 | |
| 148 | let offset = raw.len() as u32; |
| 149 | raw.extend_from_slice(name.as_bytes()); |
| 150 | raw.push(0); |
| 151 | roots.push(BorrowedRootString { name, offset }); |
| 152 | offsets[index] = offset; |
| 153 | } |
| 154 | |
| 155 | while !raw.len().is_multiple_of(8) { |
| 156 | raw.push(0); |
| 157 | } |
| 158 | (raw, offsets) |
| 159 | } |
| 160 | |
| 161 | pub fn finish(mut self) -> (Vec<u8>, HashMap<String, u32>) { |
| 162 | let mut names: Vec<String> = self.offsets.keys().cloned().collect(); |
| 163 | names.sort_by(|lhs, rhs| reverse_suffix_order(lhs, rhs)); |
| 164 | |
| 165 | let mut raw = vec![0u8]; |
| 166 | for name in names { |
| 167 | if let Some(offset) = self.find_suffix_offset(&name) { |
| 168 | self.offsets.insert(name, offset); |
| 169 | continue; |
| 170 | } |
| 171 | |
| 172 | let offset = raw.len() as u32; |
| 173 | raw.extend_from_slice(name.as_bytes()); |
| 174 | raw.push(0); |
| 175 | self.roots.push(RootString { |
| 176 | name: name.clone(), |
| 177 | offset, |
| 178 | }); |
| 179 | self.offsets.insert(name, offset); |
| 180 | } |
| 181 | |
| 182 | while !raw.len().is_multiple_of(8) { |
| 183 | raw.push(0); |
| 184 | } |
| 185 | (raw, self.offsets) |
| 186 | } |
| 187 | |
| 188 | fn find_suffix_offset(&self, name: &str) -> Option<u32> { |
| 189 | if name.is_empty() { |
| 190 | return Some(0); |
| 191 | } |
| 192 | let insert_at = self |
| 193 | .roots |
| 194 | .partition_point(|root| reverse_suffix_order(&root.name, name).is_lt()); |
| 195 | let existing = self.roots.get(insert_at.checked_sub(1)?)?; |
| 196 | (existing.name.len() >= name.len() && existing.name.ends_with(name)) |
| 197 | .then(|| existing.offset + (existing.name.len() - name.len()) as u32) |
| 198 | } |
| 199 | } |
| 200 | |
| 201 | fn find_borrowed_suffix_offset(roots: &[BorrowedRootString<'_>], name: &str) -> Option<u32> { |
| 202 | if name.is_empty() { |
| 203 | return Some(0); |
| 204 | } |
| 205 | let insert_at = roots.partition_point(|root| reverse_suffix_order(root.name, name).is_lt()); |
| 206 | let existing = roots.get(insert_at.checked_sub(1)?)?; |
| 207 | (existing.name.len() >= name.len() && existing.name.ends_with(name)) |
| 208 | .then(|| existing.offset + (existing.name.len() - name.len()) as u32) |
| 209 | } |
| 210 | |
| 211 | fn reverse_suffix_order(lhs: &str, rhs: &str) -> std::cmp::Ordering { |
| 212 | let mut lhs_rev = lhs.bytes().rev(); |
| 213 | let mut rhs_rev = rhs.bytes().rev(); |
| 214 | loop { |
| 215 | match (lhs_rev.next(), rhs_rev.next()) { |
| 216 | (Some(a), Some(b)) => match a.cmp(&b) { |
| 217 | std::cmp::Ordering::Equal => continue, |
| 218 | other => return other, |
| 219 | }, |
| 220 | (Some(_), None) => return std::cmp::Ordering::Less, |
| 221 | (None, Some(_)) => return std::cmp::Ordering::Greater, |
| 222 | (None, None) => return lhs.cmp(rhs), |
| 223 | } |
| 224 | } |
| 225 | } |
| 226 | |
| 227 | #[cfg(test)] |
| 228 | mod tests { |
| 229 | use super::*; |
| 230 | |
| 231 | fn tbl(bytes: &[u8]) -> StringTable { |
| 232 | StringTable::from_bytes(bytes.to_vec()) |
| 233 | } |
| 234 | |
| 235 | #[test] |
| 236 | fn empty_strx_yields_empty_string() { |
| 237 | let t = tbl(b"\0"); |
| 238 | assert_eq!(t.get(0).unwrap(), ""); |
| 239 | } |
| 240 | |
| 241 | #[test] |
| 242 | fn resolves_simple_name() { |
| 243 | let t = tbl(b"\0_main\0_helper\0"); |
| 244 | assert_eq!(t.get(1).unwrap(), "_main"); |
| 245 | assert_eq!(t.get(7).unwrap(), "_helper"); |
| 246 | } |
| 247 | |
| 248 | #[test] |
| 249 | fn suffix_dedup_overlap() { |
| 250 | // strtab contains "_afs_array_sum\0" at offset 1; a symbol named just |
| 251 | // "array_sum" (9 chars) points mid-string at offset 6. |
| 252 | let mut raw = vec![0u8]; |
| 253 | raw.extend_from_slice(b"_afs_array_sum\0"); |
| 254 | let t = tbl(&raw); |
| 255 | assert_eq!(t.get(1).unwrap(), "_afs_array_sum"); |
| 256 | assert_eq!(t.get(6).unwrap(), "array_sum"); |
| 257 | assert_eq!(t.get(10).unwrap(), "y_sum"); |
| 258 | } |
| 259 | |
| 260 | #[test] |
| 261 | fn out_of_bounds_strx_errors() { |
| 262 | let t = tbl(b"\0a\0"); |
| 263 | let err = t.get(100).unwrap_err(); |
| 264 | assert!( |
| 265 | matches!(err, ReadError::BadCmdsize { reason, .. } if reason.contains("out of bounds")) |
| 266 | ); |
| 267 | } |
| 268 | |
| 269 | #[test] |
| 270 | fn unterminated_string_errors() { |
| 271 | let t = tbl(b"\0abcdef"); // no trailing null |
| 272 | let err = t.get(1).unwrap_err(); |
| 273 | assert!( |
| 274 | matches!(err, ReadError::BadCmdsize { reason, .. } if reason.contains("unterminated")) |
| 275 | ); |
| 276 | } |
| 277 | |
| 278 | #[test] |
| 279 | fn non_utf8_bytes_error() { |
| 280 | let t = tbl(&[0, 0xff, 0xfe, 0]); |
| 281 | let err = t.get(1).unwrap_err(); |
| 282 | assert!(matches!(err, ReadError::BadCmdsize { reason, .. } if reason.contains("UTF-8"))); |
| 283 | } |
| 284 | |
| 285 | #[test] |
| 286 | fn from_file_bounds_check() { |
| 287 | let file = vec![0u8; 32]; |
| 288 | // stroff + strsize spills off the end. |
| 289 | let err = StringTable::from_file(&file, 30, 10).unwrap_err(); |
| 290 | assert!(matches!(err, ReadError::Truncated { .. })); |
| 291 | } |
| 292 | |
| 293 | #[test] |
| 294 | fn from_file_copies_range() { |
| 295 | let mut file = vec![0u8; 8]; |
| 296 | file.extend_from_slice(b"\0_main\0"); |
| 297 | let t = StringTable::from_file(&file, 8, 7).unwrap(); |
| 298 | assert_eq!(t.as_bytes(), b"\0_main\0"); |
| 299 | assert_eq!(t.get(1).unwrap(), "_main"); |
| 300 | } |
| 301 | |
| 302 | #[test] |
| 303 | fn builder_dedups_suffix_names() { |
| 304 | let mut builder = StringTableBuilder::new(); |
| 305 | builder.insert("_array_sum"); |
| 306 | builder.insert("_afs_array_sum"); |
| 307 | |
| 308 | let (bytes, offsets) = builder.finish(); |
| 309 | let table = StringTable::from_bytes(bytes); |
| 310 | let afs = offsets["_afs_array_sum"]; |
| 311 | let array = offsets["_array_sum"]; |
| 312 | |
| 313 | assert_eq!(table.get(afs).unwrap(), "_afs_array_sum"); |
| 314 | assert_eq!(table.get(array).unwrap(), "_array_sum"); |
| 315 | assert_eq!(array, afs + 4); |
| 316 | assert_eq!(table.as_bytes().len() % 8, 0); |
| 317 | } |
| 318 | |
| 319 | #[test] |
| 320 | fn builder_ignores_same_last_byte_non_suffix_names() { |
| 321 | let mut builder = StringTableBuilder::new(); |
| 322 | builder.insert("_alpha"); |
| 323 | builder.insert("_beta"); |
| 324 | |
| 325 | let (bytes, offsets) = builder.finish(); |
| 326 | let table = StringTable::from_bytes(bytes); |
| 327 | |
| 328 | assert_eq!(table.get(offsets["_alpha"]).unwrap(), "_alpha"); |
| 329 | assert_eq!(table.get(offsets["_beta"]).unwrap(), "_beta"); |
| 330 | } |
| 331 | |
| 332 | #[test] |
| 333 | fn builder_returns_offsets_in_input_order_without_cloning_keys() { |
| 334 | let names = ["_helper", "_afs_helper", "_helper", ""]; |
| 335 | let (bytes, offsets) = StringTableBuilder::build_with_name_offsets(names); |
| 336 | let table = StringTable::from_bytes(bytes); |
| 337 | |
| 338 | assert_eq!(table.get(offsets[0]).unwrap(), "_helper"); |
| 339 | assert_eq!(table.get(offsets[1]).unwrap(), "_afs_helper"); |
| 340 | assert_eq!(table.get(offsets[2]).unwrap(), "_helper"); |
| 341 | assert_eq!(table.get(offsets[3]).unwrap(), ""); |
| 342 | assert_eq!(offsets[0], offsets[2]); |
| 343 | assert_eq!(offsets[0], offsets[1] + 4); |
| 344 | assert_eq!(table.as_bytes().len() % 8, 0); |
| 345 | } |
| 346 | } |
| 347 |