| 1 | //! Input-file aggregate: one `ObjectFile` per parsed `.o` on disk. |
| 2 | //! |
| 3 | //! Sprint 2 ties `MachHeader64`, the load-command list, `InputSection`s, |
| 4 | //! `InputSymbol`s, the `StringTable`, and the decoded `DysymtabCmd` together. |
| 5 | //! Sprint 4 will wrap this in an `InputFile` enum alongside `ArchiveFile`; |
| 6 | //! for now `ObjectFile` stands alone. |
| 7 | |
| 8 | use std::path::PathBuf; |
| 9 | |
| 10 | use crate::macho::constants::LC_DATA_IN_CODE; |
| 11 | use crate::macho::reader::{ |
| 12 | parse_commands, parse_header, DysymtabCmd, LinkEditDataCmd, LoadCommand, MachHeader64, |
| 13 | ReadError, SymtabCmd, HEADER_SIZE, |
| 14 | }; |
| 15 | use crate::section::InputSection; |
| 16 | use crate::string_table::StringTable; |
| 17 | use crate::symbol::{parse_nlist_table, InputSymbol, SymKind}; |
| 18 | |
| 19 | /// Whole parsed `.o` on disk. Section bodies and relocation bytes are owned |
| 20 | /// copies so the buffer this was parsed from can drop. |
| 21 | #[derive(Debug, Clone)] |
| 22 | pub struct ObjectFile { |
| 23 | pub path: PathBuf, |
| 24 | pub header: MachHeader64, |
| 25 | pub commands: Vec<LoadCommand>, |
| 26 | pub sections: Vec<InputSection>, |
| 27 | pub symbols: Vec<InputSymbol>, |
| 28 | pub strings: StringTable, |
| 29 | pub symtab: Option<SymtabCmd>, |
| 30 | pub dysymtab: Option<DysymtabCmd>, |
| 31 | pub data_in_code: Vec<DataInCodeEntry>, |
| 32 | } |
| 33 | |
/// A decoded `data_in_code_entry` record carried over from an input object.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct DataInCodeEntry {
    /// File offset from the input Mach-O header.
    pub offset: u32,
    /// Length field of the record (wire bytes 4..6).
    pub length: u16,
    /// Kind tag of the record (wire bytes 6..8).
    pub kind: u16,
}

impl DataInCodeEntry {
    /// Size in bytes of one record on the wire.
    const SIZE: usize = 8;

    /// Decode consecutive little-endian records from `payload`.
    ///
    /// Only whole `SIZE`-byte records are decoded; trailing bytes that do not
    /// fill a record are ignored, so callers that care must check the payload
    /// length themselves.
    fn parse_payload(payload: &[u8]) -> Vec<Self> {
        let mut entries = Vec::with_capacity(payload.len() / Self::SIZE);
        for raw in payload.chunks_exact(Self::SIZE) {
            entries.push(Self {
                offset: u32::from_le_bytes([raw[0], raw[1], raw[2], raw[3]]),
                length: u16::from_le_bytes([raw[4], raw[5]]),
                kind: u16::from_le_bytes([raw[6], raw[7]]),
            });
        }
        entries
    }
}
| 57 | |
| 58 | impl ObjectFile { |
| 59 | pub fn parse(path: impl Into<PathBuf>, file_bytes: &[u8]) -> Result<Self, ReadError> { |
| 60 | let path = path.into(); |
| 61 | let header = parse_header(file_bytes)?; |
| 62 | let commands = parse_commands(&header, file_bytes)?; |
| 63 | |
| 64 | // Collect sections from every LC_SEGMENT_64 (MH_OBJECT usually has |
| 65 | // exactly one segment, but the layout is not required to). |
| 66 | let mut sections = Vec::new(); |
| 67 | for cmd in &commands { |
| 68 | if let LoadCommand::Segment64(seg) = cmd { |
| 69 | for hdr in &seg.sections { |
| 70 | sections.push(InputSection::from_header(hdr, file_bytes)?); |
| 71 | } |
| 72 | } |
| 73 | } |
| 74 | |
| 75 | // Lift the symbol table + string table if present. A bare segment-only |
| 76 | // .o without LC_SYMTAB is technically legal and stays symbol-empty. |
| 77 | let symtab = commands.iter().find_map(|c| match c { |
| 78 | LoadCommand::Symtab(s) => Some(*s), |
| 79 | _ => None, |
| 80 | }); |
| 81 | let dysymtab = commands.iter().find_map(|c| match c { |
| 82 | LoadCommand::Dysymtab(d) => Some(*d), |
| 83 | _ => None, |
| 84 | }); |
| 85 | |
| 86 | let (symbols, strings) = match symtab { |
| 87 | Some(s) => ( |
| 88 | parse_nlist_table(file_bytes, s.symoff, s.nsyms)?, |
| 89 | StringTable::from_file(file_bytes, s.stroff, s.strsize)?, |
| 90 | ), |
| 91 | None => (Vec::new(), StringTable::from_bytes(Vec::new())), |
| 92 | }; |
| 93 | let data_in_code = parse_data_in_code(&commands, file_bytes)?; |
| 94 | |
| 95 | Ok(ObjectFile { |
| 96 | path, |
| 97 | header, |
| 98 | commands, |
| 99 | sections, |
| 100 | symbols, |
| 101 | strings, |
| 102 | symtab, |
| 103 | dysymtab, |
| 104 | data_in_code, |
| 105 | }) |
| 106 | } |
| 107 | |
| 108 | /// Resolve the name of a symbol via this object's string table. |
| 109 | pub fn symbol_name(&self, sym: &InputSymbol) -> Result<&str, ReadError> { |
| 110 | self.strings.get(sym.strx()) |
| 111 | } |
| 112 | |
| 113 | /// For `N_INDR` aliases, resolve the aliased name via this object's |
| 114 | /// string table. Returns `None` when the symbol is not an indirect entry. |
| 115 | pub fn indirect_target_name(&self, sym: &InputSymbol) -> Option<Result<&str, ReadError>> { |
| 116 | if sym.kind() == SymKind::Indirect { |
| 117 | Some(self.strings.get(sym.value() as u32)) |
| 118 | } else { |
| 119 | None |
| 120 | } |
| 121 | } |
| 122 | |
| 123 | /// Iterate over sections in 1-based-nlist order — i.e., the order |
| 124 | /// `nlist.n_sect` refers to. afs-ld preserves the parsed order, which |
| 125 | /// matches the order they appear in the segments. |
| 126 | pub fn section_for_symbol(&self, sym: &InputSymbol) -> Option<&InputSection> { |
| 127 | if sym.sect_idx() == 0 { |
| 128 | return None; |
| 129 | } |
| 130 | self.sections |
| 131 | .get((sym.sect_idx() as usize).saturating_sub(1)) |
| 132 | } |
| 133 | } |
| 134 | |
| 135 | fn parse_data_in_code( |
| 136 | commands: &[LoadCommand], |
| 137 | file_bytes: &[u8], |
| 138 | ) -> Result<Vec<DataInCodeEntry>, ReadError> { |
| 139 | let mut out = Vec::new(); |
| 140 | for command in commands { |
| 141 | let LoadCommand::Raw { cmd, cmdsize, data } = command else { |
| 142 | continue; |
| 143 | }; |
| 144 | if *cmd != LC_DATA_IN_CODE { |
| 145 | continue; |
| 146 | } |
| 147 | let linkedit = LinkEditDataCmd::parse(*cmd, *cmdsize, data)?; |
| 148 | if !(linkedit.datasize as usize).is_multiple_of(DataInCodeEntry::SIZE) { |
| 149 | return Err(ReadError::BadCmdsize { |
| 150 | cmd: *cmd, |
| 151 | cmdsize: linkedit.datasize, |
| 152 | at_offset: 0, |
| 153 | reason: "LC_DATA_IN_CODE payload size is not a multiple of 8", |
| 154 | }); |
| 155 | } |
| 156 | let start = linkedit.dataoff as usize; |
| 157 | let end = start |
| 158 | .checked_add(linkedit.datasize as usize) |
| 159 | .ok_or(ReadError::Truncated { |
| 160 | need: usize::MAX, |
| 161 | have: file_bytes.len(), |
| 162 | context: "LC_DATA_IN_CODE payload (offset + size overflows)", |
| 163 | })?; |
| 164 | if end > file_bytes.len() { |
| 165 | return Err(ReadError::Truncated { |
| 166 | need: end, |
| 167 | have: file_bytes.len(), |
| 168 | context: "LC_DATA_IN_CODE payload", |
| 169 | }); |
| 170 | } |
| 171 | out.extend(DataInCodeEntry::parse_payload(&file_bytes[start..end])); |
| 172 | } |
| 173 | Ok(out) |
| 174 | } |
| 175 | |
| 176 | /// Total `sizeofcmds` region, exposed for callers doing byte-level round-trip |
| 177 | /// checks against the original file image. |
| 178 | pub fn header_and_cmds_end(header: &MachHeader64) -> usize { |
| 179 | HEADER_SIZE + header.sizeofcmds as usize |
| 180 | } |
| 181 | |
#[cfg(test)]
mod tests {
    use super::*;
    use crate::macho::constants::*;
    use crate::macho::reader::{
        write_commands, write_header, LinkEditDataCmd, LoadCommand, Section64Header, Segment64,
    };
    use crate::symbol::{RawNlist, NLIST_SIZE};

    /// Fake instruction bytes used as the body of `__TEXT,__text`.
    const TEXT_BYTES: [u8; 8] = [0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF, 0x11, 0x22];

    /// String table shared by the fixtures: the empty name, then `_main` at
    /// index 1.
    const STRTAB: &[u8] = b"\0_main\0";

    /// Copy `s` into a zero-padded 16-byte name field, cutting it at 16 bytes.
    fn name16(s: &str) -> [u8; 16] {
        let mut out = [0u8; 16];
        let bytes = s.as_bytes();
        let n = bytes.len().min(16);
        out[..n].copy_from_slice(&bytes[..n]);
        out
    }

    /// The single unnamed segment of the fixture object, holding one
    /// 8-byte `__TEXT,__text` section whose content sits at `section_offset`.
    fn text_segment(section_offset: u32) -> Segment64 {
        let text_sect = Section64Header {
            sectname: name16("__text"),
            segname: name16("__TEXT"),
            addr: 0,
            size: 8,
            offset: section_offset,
            align: 2,
            reloff: 0,
            nreloc: 0,
            flags: S_ATTR_PURE_INSTRUCTIONS | S_ATTR_SOME_INSTRUCTIONS,
            reserved1: 0,
            reserved2: 0,
            reserved3: 0,
        };
        Segment64 {
            segname: name16(""),
            vmaddr: 0,
            vmsize: 8,
            fileoff: section_offset as u64,
            filesize: 8,
            maxprot: 7,
            initprot: 7,
            flags: 0,
            sections: vec![text_sect],
        }
    }

    /// Build a tiny in-memory MH_OBJECT image with one `__TEXT,__text`
    /// section holding 8 bytes, one external symbol `_main`, and the minimum
    /// symtab. When `dic_blob` is given, an `LC_DATA_IN_CODE` command is
    /// emitted between the segment and symtab commands and the blob is
    /// placed right after the section content.
    ///
    /// Layout: header → load commands → section content → [data-in-code
    /// blob] → nlist → strtab.
    fn build_image(dic_blob: Option<&[u8]>) -> Vec<u8> {
        let nsyms = 1u32;
        let sym = RawNlist {
            strx: 1, // "_main"
            n_type: N_SECT | N_EXT,
            n_sect: 1,
            n_desc: 0,
            n_value: 0,
        };

        // The fixtures rely on the segment's wire size not depending on the
        // offsets stored in it, so a placeholder offset is enough for sizing.
        let seg_size = text_segment(0).wire_size() as usize;
        let dic_cmd_size = if dic_blob.is_some() {
            LinkEditDataCmd::WIRE_SIZE as usize
        } else {
            0
        };
        let symtab_size = SymtabCmd::WIRE_SIZE as usize;
        let sizeofcmds = (seg_size + dic_cmd_size + symtab_size) as u32;

        let section_offset = (HEADER_SIZE + sizeofcmds as usize) as u32;
        let data_in_code_off = section_offset + TEXT_BYTES.len() as u32;
        let dic_len = dic_blob.map_or(0, |blob| blob.len()) as u32;
        let symoff = data_in_code_off + dic_len;
        let stroff = symoff + NLIST_SIZE as u32 * nsyms;

        let mut cmds = vec![LoadCommand::Segment64(text_segment(section_offset))];
        if let Some(blob) = dic_blob {
            cmds.push(LoadCommand::Raw {
                cmd: LC_DATA_IN_CODE,
                cmdsize: LinkEditDataCmd::WIRE_SIZE,
                data: [
                    data_in_code_off.to_le_bytes().as_slice(),
                    (blob.len() as u32).to_le_bytes().as_slice(),
                ]
                .concat(),
            });
        }
        cmds.push(LoadCommand::Symtab(SymtabCmd {
            symoff,
            nsyms,
            stroff,
            strsize: STRTAB.len() as u32,
        }));

        let header = MachHeader64 {
            magic: MH_MAGIC_64,
            cputype: CPU_TYPE_ARM64,
            cpusubtype: 0,
            filetype: MH_OBJECT,
            ncmds: if dic_blob.is_some() { 3 } else { 2 },
            sizeofcmds,
            flags: MH_SUBSECTIONS_VIA_SYMBOLS,
            reserved: 0,
        };

        let mut image = Vec::new();
        write_header(&header, &mut image);
        write_commands(&cmds, &mut image);
        // Section content: 8 bytes (fake instructions).
        image.extend_from_slice(&TEXT_BYTES);
        if let Some(blob) = dic_blob {
            image.extend_from_slice(blob);
        }
        // Nlist.
        sym.write(&mut image);
        // String table.
        image.extend_from_slice(STRTAB);
        image
    }

    /// Fixture without any data-in-code command.
    fn synth_image() -> Vec<u8> {
        build_image(None)
    }

    /// Fixture carrying one data-in-code entry: offset 0, length 4,
    /// kind `DICE_KIND_DATA`.
    fn synth_image_with_data_in_code() -> Vec<u8> {
        let dic_blob = [
            0u32.to_le_bytes().as_slice(),
            4u16.to_le_bytes().as_slice(),
            DICE_KIND_DATA.to_le_bytes().as_slice(),
        ]
        .concat();
        build_image(Some(dic_blob.as_slice()))
    }

    #[test]
    fn parse_synth_object_end_to_end() {
        let image = synth_image();
        let obj = ObjectFile::parse("/tmp/synth.o", &image).unwrap();
        assert_eq!(obj.sections.len(), 1);
        let sec = &obj.sections[0];
        assert_eq!(sec.segname, "__TEXT");
        assert_eq!(sec.sectname, "__text");
        assert_eq!(sec.data.len(), 8);
        assert_eq!(sec.data, TEXT_BYTES.to_vec());

        assert_eq!(obj.symbols.len(), 1);
        let sym = &obj.symbols[0];
        assert_eq!(obj.symbol_name(sym).unwrap(), "_main");
        assert!(sym.is_ext());
        let sect = obj.section_for_symbol(sym).expect("n_sect=1 resolves");
        assert_eq!(sect.sectname, "__text");
        // A section-defined symbol is not an N_INDR alias.
        assert!(obj.indirect_target_name(sym).is_none());
        // No LC_DATA_IN_CODE command means no entries.
        assert!(obj.data_in_code.is_empty());
    }

    #[test]
    fn parse_preserves_data_in_code_entries() {
        let image = synth_image_with_data_in_code();
        let obj = ObjectFile::parse("/tmp/synth-dic.o", &image).unwrap();
        assert_eq!(
            obj.data_in_code,
            vec![DataInCodeEntry {
                offset: 0,
                length: 4,
                kind: DICE_KIND_DATA,
            }]
        );
    }

    #[test]
    fn parse_rejects_misaligned_data_in_code_payload() {
        // Seven bytes cannot hold a whole 8-byte entry.
        let image = build_image(Some(&[0u8; 7][..]));
        let result = ObjectFile::parse("/tmp/synth-bad-dic.o", &image);
        assert!(matches!(
            result,
            Err(ReadError::BadCmdsize {
                cmd: LC_DATA_IN_CODE,
                ..
            })
        ));
    }

    #[test]
    fn indirect_target_name_resolves() {
        // Build a minimal strtab with "\0_alias\0_target\0" and a RawNlist
        // whose n_value points at "_target".
        let strtab = StringTable::from_bytes(b"\0_alias\0_target\0".to_vec());
        let obj = ObjectFile {
            path: PathBuf::from("/tmp/t"),
            header: MachHeader64 {
                magic: MH_MAGIC_64,
                cputype: CPU_TYPE_ARM64,
                cpusubtype: 0,
                filetype: MH_OBJECT,
                ncmds: 0,
                sizeofcmds: 0,
                flags: 0,
                reserved: 0,
            },
            commands: Vec::new(),
            sections: Vec::new(),
            symbols: Vec::new(),
            strings: strtab,
            symtab: None,
            dysymtab: None,
            data_in_code: Vec::new(),
        };
        let alias = InputSymbol::from_raw(RawNlist {
            strx: 1,
            n_type: N_INDR | N_EXT,
            n_sect: 0,
            n_desc: 0,
            n_value: 8, // strx of "_target"
        });
        let resolved = obj.indirect_target_name(&alias).unwrap().unwrap();
        assert_eq!(resolved, "_target");
        // n_sect == 0 never maps to a section.
        assert!(obj.section_for_symbol(&alias).is_none());
    }
}
| 464 |