| 1 | //! Atomization model. |
| 2 | //! |
| 3 | //! An **atom** is the linker's fundamental unit of output layout, |
| 4 | //! dead-stripping, and ICF. Each input section is split into one or more |
| 5 | //! atoms; output sections are concatenations of atoms. Every |
| 6 | //! `Symbol::Defined` owns exactly one atom (except `.alt_entry` chain |
| 7 | //! symbols which fold into a predecessor's atom). |
| 8 | //! |
| 9 | //! afs-as always sets `MH_SUBSECTIONS_VIA_SYMBOLS`, so in practice text and |
| 10 | //! data sections split at symbol boundaries; literal sections |
| 11 | //! (`__cstring`, `__literal*`) split at content boundaries; zerofill and |
| 12 | //! TLS sections split per-symbol. The full ruleset lives in |
| 13 | //! [`atomize_input_section`]. |
| 14 | //! |
| 15 | //! Later passes reference atoms via `AtomId` (Sprint 7's opaque handle). |
| 16 | //! This module hands out ids via `AtomTable::push`; `AtomId(0)` is a |
| 17 | //! pre-existing sentinel meaning "no atom bound yet" (used by |
| 18 | //! `Symbol::Defined { atom }` before atomization back-patches it). |
| 19 | |
| 20 | use std::collections::HashMap; |
| 21 | |
| 22 | use crate::input::ObjectFile; |
| 23 | use crate::macho::constants::MH_SUBSECTIONS_VIA_SYMBOLS; |
| 24 | use crate::reloc::{parse_raw_relocs, parse_relocs, Referent}; |
| 25 | use crate::resolve::{AtomId, InputId, SymbolId, SymbolTable}; |
| 26 | use crate::section::{InputSection, SectionKind}; |
| 27 | use crate::symbol::{InputSymbol, SymKind}; |
| 28 | |
/// Which conceptual output section family this atom belongs to. Sprint 10
/// turns these into real `__TEXT,__text` / `__DATA,__data` etc. placements.
///
/// Values are derived from a parsed `SectionKind` by
/// `AtomSection::from_section_kind`. The per-variant notes below name the
/// `SectionKind` values that map to each family.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum AtomSection {
    /// From `SectionKind::Text`.
    Text,
    /// From `SectionKind::Data`.
    Data,
    /// From `SectionKind::ConstData`.
    ConstData,
    /// From `SectionKind::CStringLiterals`; reported by `is_literal`.
    CStringLiterals,
    /// From `SectionKind::Literal4`; reported by `is_literal`.
    Literal4,
    /// From `SectionKind::Literal8`; reported by `is_literal`.
    Literal8,
    /// From `SectionKind::Literal16`; reported by `is_literal`.
    Literal16,
    /// From `SectionKind::ZeroFill` and `SectionKind::GbZeroFill`;
    /// reported by `is_zerofill`.
    ZeroFill,
    /// From `SectionKind::ThreadLocalRegular`.
    ThreadLocalData,
    /// From `SectionKind::ThreadLocalZeroFill`; reported by `is_zerofill`.
    ThreadLocalBss,
    /// From `SectionKind::ThreadLocalVariables` and
    /// `SectionKind::ThreadLocalVariablePointers`.
    ThreadLocalVariables,
    /// From `SectionKind::ThreadLocalInitPointers`.
    ThreadLocalInitPointers,
    /// From `SectionKind::Coalesced`.
    Coalesced,
    /// From `SectionKind::CompactUnwind`.
    CompactUnwind,
    /// From `SectionKind::EhFrame`.
    EhFrame,
    /// From `SectionKind::SymbolStubs`.
    SymbolStubs,
    /// From `SectionKind::NonLazySymbolPointers`.
    NonLazySymbolPointers,
    /// From `SectionKind::LazySymbolPointers`.
    LazySymbolPointers,
    /// Section kind we don't have specialized layout for yet. Layout still
    /// works (output section keyed by segname/sectname) but downstream
    /// passes treat it opaquely. Produced for `SectionKind::Regular` and
    /// `SectionKind::Unknown(_)`.
    Other,
}
| 56 | |
| 57 | impl AtomSection { |
| 58 | pub fn from_section_kind(kind: SectionKind) -> Self { |
| 59 | match kind { |
| 60 | SectionKind::Text => AtomSection::Text, |
| 61 | SectionKind::Data => AtomSection::Data, |
| 62 | SectionKind::ConstData => AtomSection::ConstData, |
| 63 | SectionKind::CStringLiterals => AtomSection::CStringLiterals, |
| 64 | SectionKind::Literal4 => AtomSection::Literal4, |
| 65 | SectionKind::Literal8 => AtomSection::Literal8, |
| 66 | SectionKind::Literal16 => AtomSection::Literal16, |
| 67 | SectionKind::ZeroFill | SectionKind::GbZeroFill => AtomSection::ZeroFill, |
| 68 | SectionKind::ThreadLocalRegular => AtomSection::ThreadLocalData, |
| 69 | SectionKind::ThreadLocalZeroFill => AtomSection::ThreadLocalBss, |
| 70 | SectionKind::ThreadLocalVariables => AtomSection::ThreadLocalVariables, |
| 71 | SectionKind::ThreadLocalVariablePointers => AtomSection::ThreadLocalVariables, |
| 72 | SectionKind::ThreadLocalInitPointers => AtomSection::ThreadLocalInitPointers, |
| 73 | SectionKind::Coalesced => AtomSection::Coalesced, |
| 74 | SectionKind::CompactUnwind => AtomSection::CompactUnwind, |
| 75 | SectionKind::EhFrame => AtomSection::EhFrame, |
| 76 | SectionKind::SymbolStubs => AtomSection::SymbolStubs, |
| 77 | SectionKind::NonLazySymbolPointers => AtomSection::NonLazySymbolPointers, |
| 78 | SectionKind::LazySymbolPointers => AtomSection::LazySymbolPointers, |
| 79 | SectionKind::Regular | SectionKind::Unknown(_) => AtomSection::Other, |
| 80 | } |
| 81 | } |
| 82 | |
| 83 | pub fn is_zerofill(self) -> bool { |
| 84 | matches!(self, AtomSection::ZeroFill | AtomSection::ThreadLocalBss) |
| 85 | } |
| 86 | |
| 87 | pub fn is_literal(self) -> bool { |
| 88 | matches!( |
| 89 | self, |
| 90 | AtomSection::CStringLiterals |
| 91 | | AtomSection::Literal4 |
| 92 | | AtomSection::Literal8 |
| 93 | | AtomSection::Literal16 |
| 94 | ) |
| 95 | } |
| 96 | } |
| 97 | |
/// Small bitmask of boolean atom attributes. The set of bits is kept
/// deliberately narrow: each one has a clear linker-visible meaning.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct AtomFlags {
    bits: u32,
}

impl AtomFlags {
    /// The empty flag set (same value as `AtomFlags::default()`).
    pub const NONE: AtomFlags = AtomFlags { bits: 0 };
    pub const NO_DEAD_STRIP: u32 = 1 << 0;
    pub const WEAK_DEF: u32 = 1 << 1;
    pub const THREAD_LOCAL: u32 = 1 << 2;
    pub const LITERAL: u32 = 1 << 3;
    pub const PURE_INSTRUCTIONS: u32 = 1 << 4;
    /// Set during reloc scan (Sprint 24's ICF gate).
    pub const ADDRESS_TAKEN: u32 = 1 << 5;

    /// Whether any bit of `bit` is present. When `bit` is a multi-bit mask
    /// this is an "any of" test, not "all of".
    pub fn has(self, bit: u32) -> bool {
        (self.bits & bit) != 0
    }

    /// Return a copy of `self` with every bit of `bit` switched on.
    pub fn with(self, bit: u32) -> Self {
        AtomFlags {
            bits: self.bits | bit,
        }
    }

    /// Switch on every bit of `bit` in place.
    pub fn set(&mut self, bit: u32) {
        *self = self.with(bit);
    }

    /// The raw mask.
    pub fn bits(self) -> u32 {
        self.bits
    }
}
| 131 | |
/// A symbol that resolves to a point inside another atom via `.alt_entry`.
/// Used for the `_start` / `_main` pattern where a secondary entry point
/// aliases into the middle of a function.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AltEntry {
    /// The aliased symbol. During atomization this holds a placeholder
    /// built from the symbol's index in the object's symbol list;
    /// `backpatch_symbol_atoms` replaces it with the resolver-side handle
    /// for symbols it manages to match.
    pub symbol: SymbolId,
    /// Byte offset into the containing atom where this alt entry points.
    pub offset_within_atom: u32,
}
| 141 | |
/// One atom. Dead-stripping, ICF, and layout all work in terms of atoms.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Atom {
    /// Handle of this atom. `AtomTable::push` overwrites whatever value the
    /// atom was constructed with (the builders in this module pass
    /// `AtomId(0)`).
    pub id: AtomId,
    /// The input this atom was carved from.
    pub origin: InputId,
    /// 1-based section index within `origin`'s Mach-O section list.
    pub input_section: u8,
    /// Section family, derived from the input section's kind.
    pub section: AtomSection,
    /// Offset within the input section where this atom's content starts.
    pub input_offset: u32,
    /// Byte size. For zerofill atoms, this is virtual; `data` is empty.
    pub size: u32,
    /// log2 of required alignment. Inherited from the containing section.
    pub align_pow2: u8,
    /// Primary defining symbol, if any. Locals that split a section at
    /// `MH_SUBSECTIONS_VIA_SYMBOLS` boundaries but have no matching
    /// `Symbol::Defined` (rare; happens for unnamed atoms inside literal
    /// sections) leave this `None`.
    pub owner: Option<SymbolId>,
    /// `.alt_entry` chain — symbols aliased into this atom.
    pub alt_entries: Vec<AltEntry>,
    /// File-backed content, empty for zerofill.
    pub data: Vec<u8>,
    /// Boolean attributes; see the `AtomFlags` bit constants.
    pub flags: AtomFlags,
    /// For compact-unwind and eh_frame atoms: the function atom whose
    /// lifetime this metadata atom shares. Sprint 23 (dead-strip) uses
    /// this to keep unwind metadata live iff the function is live.
    pub parent_of: Option<AtomId>,
}
| 171 | |
| 172 | /// Registry of all atoms in the link. `push` hands out stable `AtomId`s; |
| 173 | /// `get` / `get_mut` index into the table. |
| 174 | #[derive(Debug, Default)] |
| 175 | pub struct AtomTable { |
| 176 | atoms: Vec<Atom>, |
| 177 | } |
| 178 | |
| 179 | impl AtomTable { |
| 180 | pub fn new() -> Self { |
| 181 | Self::default() |
| 182 | } |
| 183 | |
| 184 | /// Assign an id to `atom` (overwriting any prior `id` field) and |
| 185 | /// store it. Returns the new handle. |
| 186 | pub fn push(&mut self, mut atom: Atom) -> AtomId { |
| 187 | // Skip id 0 — `AtomId(0)` is the pre-atomization placeholder for |
| 188 | // `Symbol::Defined { atom }` slots seeded before atomization runs. |
| 189 | let id = AtomId((self.atoms.len() as u32) + 1); |
| 190 | atom.id = id; |
| 191 | self.atoms.push(atom); |
| 192 | id |
| 193 | } |
| 194 | |
| 195 | pub fn get(&self, id: AtomId) -> &Atom { |
| 196 | &self.atoms[(id.0 - 1) as usize] |
| 197 | } |
| 198 | |
| 199 | pub fn get_mut(&mut self, id: AtomId) -> &mut Atom { |
| 200 | &mut self.atoms[(id.0 - 1) as usize] |
| 201 | } |
| 202 | |
| 203 | pub fn len(&self) -> usize { |
| 204 | self.atoms.len() |
| 205 | } |
| 206 | |
| 207 | pub fn is_empty(&self) -> bool { |
| 208 | self.atoms.is_empty() |
| 209 | } |
| 210 | |
| 211 | pub fn iter(&self) -> impl Iterator<Item = (AtomId, &Atom)> { |
| 212 | self.atoms |
| 213 | .iter() |
| 214 | .enumerate() |
| 215 | .map(|(i, a)| (AtomId((i + 1) as u32), a)) |
| 216 | } |
| 217 | |
| 218 | /// Group atoms by `(origin, input_section)`, preserving insertion |
| 219 | /// order within each group. Sprint 10's layout pass walks this |
| 220 | /// grouping to preserve input ordering within output sections. |
| 221 | pub fn by_input_section(&self) -> HashMap<(InputId, u8), Vec<AtomId>> { |
| 222 | let mut out: HashMap<(InputId, u8), Vec<AtomId>> = HashMap::new(); |
| 223 | for (id, atom) in self.iter() { |
| 224 | out.entry((atom.origin, atom.input_section)) |
| 225 | .or_default() |
| 226 | .push(id); |
| 227 | } |
| 228 | out |
| 229 | } |
| 230 | } |
| 231 | |
| 232 | // --------------------------------------------------------------------------- |
| 233 | // Atomization pass. |
| 234 | // --------------------------------------------------------------------------- |
| 235 | |
/// Per-object atomization output. Back-patching `Symbol::Defined.atom`
/// walks `owner_by_sym`; Sprint 23's dead-strip reads `alt_entries_by_sym`
/// when computing the live graph.
#[derive(Debug, Default)]
pub struct ObjectAtomization {
    /// Handle of every atom created for this object, in creation order.
    pub atoms: Vec<AtomId>,
    /// `(symbol_index_in_object → atom that owns it)`. Populated for every
    /// external/private-extern SECT symbol that started a new atom.
    pub owner_by_sym: Vec<(usize, AtomId)>,
    /// `(symbol_index_in_object → (containing_atom, offset_within_atom))`.
    /// Populated for `.alt_entry` symbols that folded into an existing atom.
    /// When the object lacks `MH_SUBSECTIONS_VIA_SYMBOLS`, every candidate
    /// symbol of a section is also recorded here, against that section's
    /// single atom and at its in-section offset.
    pub alt_entries_by_sym: Vec<(usize, AtomId, u32)>,
}
| 249 | |
| 250 | /// Atomize every section in `obj`, pushing into `table`. The caller |
| 251 | /// typically walks every input in sequence and merges results. |
| 252 | pub fn atomize_object( |
| 253 | input_id: InputId, |
| 254 | obj: &ObjectFile, |
| 255 | table: &mut AtomTable, |
| 256 | ) -> ObjectAtomization { |
| 257 | let subsections_via_symbols = obj.header.flags & MH_SUBSECTIONS_VIA_SYMBOLS != 0; |
| 258 | let mut out = ObjectAtomization::default(); |
| 259 | |
| 260 | for (sect_idx_zero, sect) in obj.sections.iter().enumerate() { |
| 261 | let sect_idx_one = (sect_idx_zero + 1) as u8; |
| 262 | // Gather symbols targeting this section and translate their |
| 263 | // `n_value` (absolute address in the object's layout) into |
| 264 | // in-section offsets by subtracting the section's `addr`. |
| 265 | // |
| 266 | // Only external / private-extern / alt-entry symbols count as |
| 267 | // subsection boundaries. Locals like `ltmp0` often sit at the |
| 268 | // same offset as an adjacent external (they're compiler-generated |
| 269 | // anchors for PC-relative addressing); splitting at them would |
| 270 | // produce zero-size atoms. This matches ld64's pragmatic reading |
| 271 | // of MH_SUBSECTIONS_VIA_SYMBOLS. |
| 272 | let mut syms: Vec<(usize, &InputSymbol, u32)> = obj |
| 273 | .symbols |
| 274 | .iter() |
| 275 | .enumerate() |
| 276 | .filter(|(_, s)| { |
| 277 | s.stab_kind().is_none() |
| 278 | && s.kind() == SymKind::Sect |
| 279 | && s.sect_idx() == sect_idx_one |
| 280 | && (s.is_ext() || s.is_private_ext() || s.alt_entry()) |
| 281 | }) |
| 282 | .map(|(i, s)| { |
| 283 | let offset = s.value().saturating_sub(sect.addr) as u32; |
| 284 | (i, s, offset) |
| 285 | }) |
| 286 | .collect(); |
| 287 | syms.sort_by_key(|(_, _, off)| *off); |
| 288 | |
| 289 | atomize_regular_section( |
| 290 | input_id, |
| 291 | sect_idx_one, |
| 292 | sect, |
| 293 | &syms, |
| 294 | subsections_via_symbols, |
| 295 | table, |
| 296 | &mut out, |
| 297 | ); |
| 298 | } |
| 299 | |
| 300 | // Post-pass: wire `parent_of` for every `__compact_unwind` atom to the |
| 301 | // function atom that its `function_start` reloc references. |
| 302 | link_unwind_parents(input_id, obj, table, &out); |
| 303 | |
| 304 | out |
| 305 | } |
| 306 | |
| 307 | /// Walk `__compact_unwind` atoms; for each, find its `function_start` |
| 308 | /// reloc (at record offset 0), resolve the referent to a function atom |
| 309 | /// within this same input, and set `parent_of`. External-symbol relocs |
| 310 | /// (e.g. `__compact_unwind` referencing a function in another object) |
| 311 | /// are left with `parent_of = None` and wired by Sprint 17's unwind |
| 312 | /// synthesis pass, which has the full atom table. |
| 313 | fn link_unwind_parents( |
| 314 | input_id: InputId, |
| 315 | obj: &ObjectFile, |
| 316 | table: &mut AtomTable, |
| 317 | out: &ObjectAtomization, |
| 318 | ) { |
| 319 | let Some((cu_idx_zero, cu_sect)) = obj |
| 320 | .sections |
| 321 | .iter() |
| 322 | .enumerate() |
| 323 | .find(|(_, s)| s.kind == SectionKind::CompactUnwind) |
| 324 | else { |
| 325 | return; |
| 326 | }; |
| 327 | let cu_idx_one = (cu_idx_zero + 1) as u8; |
| 328 | |
| 329 | let raws = match parse_raw_relocs(&cu_sect.raw_relocs, 0, cu_sect.nreloc) { |
| 330 | Ok(r) => r, |
| 331 | Err(_) => return, |
| 332 | }; |
| 333 | let fused = match parse_relocs(&raws) { |
| 334 | Ok(f) => f, |
| 335 | Err(_) => return, |
| 336 | }; |
| 337 | |
| 338 | // Index atoms produced by this object for (section, offset) lookup. |
| 339 | let mut atom_index: HashMap<(u8, u32), AtomId> = HashMap::new(); |
| 340 | for id in &out.atoms { |
| 341 | let a = table.get(*id); |
| 342 | atom_index.insert((a.input_section, a.input_offset), *id); |
| 343 | } |
| 344 | |
| 345 | // For each compact_unwind atom, find its first reloc. |
| 346 | for id in &out.atoms { |
| 347 | let atom = table.get(*id); |
| 348 | if atom.input_section != cu_idx_one { |
| 349 | continue; |
| 350 | } |
| 351 | let record_start = atom.input_offset; |
| 352 | let Some(r) = fused.iter().find(|r| r.offset == record_start) else { |
| 353 | continue; |
| 354 | }; |
| 355 | let parent = match r.referent { |
| 356 | Referent::Section(sect_idx) => { |
| 357 | // The 8-byte `function_start` field holds the target's |
| 358 | // in-section offset. For ARM64_RELOC_UNSIGNED, that byte |
| 359 | // window carries the addend directly. |
| 360 | if atom.data.len() >= 8 { |
| 361 | let mut buf = [0u8; 8]; |
| 362 | buf.copy_from_slice(&atom.data[0..8]); |
| 363 | let target_offset = u64::from_le_bytes(buf) as u32; |
| 364 | atom_index.get(&(sect_idx, target_offset)).copied() |
| 365 | } else { |
| 366 | None |
| 367 | } |
| 368 | } |
| 369 | Referent::Symbol(_) => None, |
| 370 | }; |
| 371 | if let Some(parent_id) = parent { |
| 372 | table.get_mut(*id).parent_of = Some(parent_id); |
| 373 | } |
| 374 | } |
| 375 | let _ = input_id; // reserved for cross-object lookup in Sprint 17 |
| 376 | } |
| 377 | |
| 378 | /// Replace every `Symbol::Defined { atom: AtomId(0), ... }` seeded before |
| 379 | /// atomization with the real atom handle and atom-relative offset. |
| 380 | /// Silently skips symbols that have no matching entry (e.g. those that |
| 381 | /// were replaced by a strong definition elsewhere before atomization ran). |
| 382 | pub fn backpatch_symbol_atoms( |
| 383 | atomization: &ObjectAtomization, |
| 384 | input_id: InputId, |
| 385 | obj: &ObjectFile, |
| 386 | sym_table: &mut SymbolTable, |
| 387 | atom_table: &mut AtomTable, |
| 388 | ) { |
| 389 | use crate::resolve::Symbol; |
| 390 | |
| 391 | for (sym_idx, atom_id) in &atomization.owner_by_sym { |
| 392 | let input_sym = &obj.symbols[*sym_idx]; |
| 393 | let Ok(name_str) = obj.symbol_name(input_sym) else { |
| 394 | continue; |
| 395 | }; |
| 396 | let istr = sym_table.intern(name_str); |
| 397 | let Some(sid) = sym_table.lookup(istr) else { |
| 398 | continue; |
| 399 | }; |
| 400 | // Primary owner symbols sit at atom boundary → atom-relative 0. |
| 401 | if let Symbol::Defined { origin, .. } = sym_table.get(sid) { |
| 402 | if *origin == input_id { |
| 403 | sym_table.bind_atom(sid, *atom_id, 0); |
| 404 | atom_table.get_mut(*atom_id).owner = Some(sid); |
| 405 | } |
| 406 | } |
| 407 | } |
| 408 | |
| 409 | for (sym_idx, atom_id, local_off) in &atomization.alt_entries_by_sym { |
| 410 | let input_sym = &obj.symbols[*sym_idx]; |
| 411 | let Ok(name_str) = obj.symbol_name(input_sym) else { |
| 412 | continue; |
| 413 | }; |
| 414 | let istr = sym_table.intern(name_str); |
| 415 | let Some(sid) = sym_table.lookup(istr) else { |
| 416 | continue; |
| 417 | }; |
| 418 | if let Symbol::Defined { origin, .. } = sym_table.get(sid) { |
| 419 | if *origin == input_id { |
| 420 | sym_table.bind_atom(sid, *atom_id, *local_off as u64); |
| 421 | // Update the atom's alt_entries with the resolver-side |
| 422 | // SymbolId (we stored the InputSymbol index during |
| 423 | // atomization; now we know the real handle). |
| 424 | let atom = atom_table.get_mut(*atom_id); |
| 425 | for alt in &mut atom.alt_entries { |
| 426 | if alt.symbol == SymbolId(*sym_idx as u32) |
| 427 | && alt.offset_within_atom == *local_off |
| 428 | { |
| 429 | alt.symbol = sid; |
| 430 | } |
| 431 | } |
| 432 | } |
| 433 | } |
| 434 | } |
| 435 | } |
| 436 | |
| 437 | /// Split one section into atoms according to the `MH_SUBSECTIONS_VIA_SYMBOLS` |
| 438 | /// invariant plus `.alt_entry` folding. Literal and unwind specialization |
| 439 | /// lands in follow-up commits; this function's fallback is "one atom per |
| 440 | /// section" for sections the subsections flag doesn't split. |
| 441 | #[allow(clippy::too_many_arguments)] |
| 442 | fn atomize_regular_section( |
| 443 | input_id: InputId, |
| 444 | section_idx: u8, |
| 445 | sect: &InputSection, |
| 446 | syms: &[(usize, &InputSymbol, u32)], |
| 447 | subsections_via_symbols: bool, |
| 448 | table: &mut AtomTable, |
| 449 | out: &mut ObjectAtomization, |
| 450 | ) { |
| 451 | let kind = sect.kind; |
| 452 | let atom_section = AtomSection::from_section_kind(kind); |
| 453 | |
| 454 | // Without the subsections flag, every section becomes one atom — the |
| 455 | // linker-side equivalent of Apple-style monolithic sections. |
| 456 | if !subsections_via_symbols { |
| 457 | let atom = build_section_atom(input_id, section_idx, sect, atom_section); |
| 458 | let id = table.push(atom); |
| 459 | out.atoms.push(id); |
| 460 | for (sym_idx, _sym, off) in syms { |
| 461 | out.alt_entries_by_sym.push((*sym_idx, id, *off)); |
| 462 | } |
| 463 | return; |
| 464 | } |
| 465 | |
| 466 | // Zerofill: splitting happens per symbol (each tentative common-style |
| 467 | // slot gets its own atom). If no symbols defined, emit a single atom. |
| 468 | if atom_section.is_zerofill() { |
| 469 | atomize_zerofill(input_id, section_idx, sect, syms, atom_section, table, out); |
| 470 | return; |
| 471 | } |
| 472 | |
| 473 | // Literal sections split on content boundaries (null for `__cstring`, |
| 474 | // fixed-size chunks for `__literal4/8/16`) independent of symbol |
| 475 | // labels. Sprint 24's ICF uses the per-atom content for dedup. |
| 476 | if atom_section.is_literal() { |
| 477 | atomize_literal_section(input_id, section_idx, sect, syms, atom_section, table, out); |
| 478 | return; |
| 479 | } |
| 480 | |
| 481 | // `__compact_unwind` is a fixed-layout array of 32-byte records; each |
| 482 | // record becomes its own atom with `parent_of` wired to the function |
| 483 | // atom it describes (linked post-hoc in `link_unwind_parents`). |
| 484 | if atom_section == AtomSection::CompactUnwind { |
| 485 | atomize_compact_unwind(input_id, section_idx, sect, syms, atom_section, table, out); |
| 486 | return; |
| 487 | } |
| 488 | |
| 489 | // With subsections_via_symbols and at least one split point, walk the |
| 490 | // sorted symbols and emit one atom per non-alt_entry boundary. |
| 491 | if syms.is_empty() { |
| 492 | let atom = build_section_atom(input_id, section_idx, sect, atom_section); |
| 493 | let id = table.push(atom); |
| 494 | out.atoms.push(id); |
| 495 | return; |
| 496 | } |
| 497 | |
| 498 | // If there's content before the first symbol, carve a head atom |
| 499 | // (unowned). afs-as emits a leading symbol in practice so this is |
| 500 | // typically zero bytes, but the fallback keeps the byte-flow intact. |
| 501 | let first_offset = syms[0].2; |
| 502 | if first_offset > 0 { |
| 503 | let head = build_slice_atom( |
| 504 | input_id, |
| 505 | section_idx, |
| 506 | sect, |
| 507 | atom_section, |
| 508 | 0, |
| 509 | first_offset, |
| 510 | None, |
| 511 | &[], |
| 512 | ); |
| 513 | let head_id = table.push(head); |
| 514 | out.atoms.push(head_id); |
| 515 | } |
| 516 | |
| 517 | // Walk symbol boundaries. |
| 518 | let section_size = sect.size as u32; |
| 519 | let mut i = 0; |
| 520 | while i < syms.len() { |
| 521 | let (primary_idx, primary, atom_offset) = syms[i]; |
| 522 | let next_real_boundary = find_next_non_alt_entry(syms, i + 1) |
| 523 | .map(|j| syms[j].2) |
| 524 | .unwrap_or(section_size); |
| 525 | let size = next_real_boundary.saturating_sub(atom_offset); |
| 526 | |
| 527 | // Collect alt_entries that fall into [atom_offset, atom_offset+size). |
| 528 | let mut alts: Vec<AltEntry> = Vec::new(); |
| 529 | let mut alt_folded: Vec<(usize, u32)> = Vec::new(); |
| 530 | for (alt_idx, alt_sym, alt_off) in syms.iter().skip(i + 1) { |
| 531 | if *alt_off >= atom_offset + size { |
| 532 | break; |
| 533 | } |
| 534 | if !alt_sym.alt_entry() { |
| 535 | break; |
| 536 | } |
| 537 | let local = *alt_off - atom_offset; |
| 538 | alts.push(AltEntry { |
| 539 | symbol: SymbolId(*alt_idx as u32), |
| 540 | offset_within_atom: local, |
| 541 | }); |
| 542 | alt_folded.push((*alt_idx, local)); |
| 543 | } |
| 544 | |
| 545 | let atom = build_slice_atom( |
| 546 | input_id, |
| 547 | section_idx, |
| 548 | sect, |
| 549 | atom_section, |
| 550 | atom_offset, |
| 551 | size, |
| 552 | Some(primary), |
| 553 | &alts, |
| 554 | ); |
| 555 | let id = table.push(atom); |
| 556 | out.atoms.push(id); |
| 557 | out.owner_by_sym.push((primary_idx, id)); |
| 558 | for (alt_idx, local_off) in alt_folded { |
| 559 | out.alt_entries_by_sym.push((alt_idx, id, local_off)); |
| 560 | } |
| 561 | |
| 562 | // Advance past the primary and its folded alt_entries. |
| 563 | i = find_next_non_alt_entry(syms, i + 1).unwrap_or(syms.len()); |
| 564 | } |
| 565 | } |
| 566 | |
| 567 | /// Split a literal section into atoms. `__cstring` splits at null-byte |
| 568 | /// terminators (variable-length); `__literal4/8/16` split at fixed-width |
| 569 | /// boundaries. Owner symbols attach at exact offsets where a symbol |
| 570 | /// points. |
| 571 | fn atomize_literal_section( |
| 572 | input_id: InputId, |
| 573 | section_idx: u8, |
| 574 | sect: &InputSection, |
| 575 | syms: &[(usize, &InputSymbol, u32)], |
| 576 | atom_section: AtomSection, |
| 577 | table: &mut AtomTable, |
| 578 | out: &mut ObjectAtomization, |
| 579 | ) { |
| 580 | match atom_section { |
| 581 | AtomSection::CStringLiterals => { |
| 582 | atomize_cstring(input_id, section_idx, sect, syms, atom_section, table, out) |
| 583 | } |
| 584 | AtomSection::Literal4 => atomize_fixed_literal( |
| 585 | input_id, |
| 586 | section_idx, |
| 587 | sect, |
| 588 | syms, |
| 589 | 4, |
| 590 | atom_section, |
| 591 | table, |
| 592 | out, |
| 593 | ), |
| 594 | AtomSection::Literal8 => atomize_fixed_literal( |
| 595 | input_id, |
| 596 | section_idx, |
| 597 | sect, |
| 598 | syms, |
| 599 | 8, |
| 600 | atom_section, |
| 601 | table, |
| 602 | out, |
| 603 | ), |
| 604 | AtomSection::Literal16 => atomize_fixed_literal( |
| 605 | input_id, |
| 606 | section_idx, |
| 607 | sect, |
| 608 | syms, |
| 609 | 16, |
| 610 | atom_section, |
| 611 | table, |
| 612 | out, |
| 613 | ), |
| 614 | _ => unreachable!("atomize_literal_section called with non-literal kind"), |
| 615 | } |
| 616 | } |
| 617 | |
| 618 | fn atomize_cstring( |
| 619 | input_id: InputId, |
| 620 | section_idx: u8, |
| 621 | sect: &InputSection, |
| 622 | syms: &[(usize, &InputSymbol, u32)], |
| 623 | atom_section: AtomSection, |
| 624 | table: &mut AtomTable, |
| 625 | out: &mut ObjectAtomization, |
| 626 | ) { |
| 627 | let mut offset = 0usize; |
| 628 | while offset < sect.data.len() { |
| 629 | let relative_nul = sect.data[offset..] |
| 630 | .iter() |
| 631 | .position(|&b| b == 0) |
| 632 | .unwrap_or(sect.data.len() - offset); |
| 633 | let end = offset + relative_nul + 1; |
| 634 | let end = end.min(sect.data.len()); |
| 635 | let data = sect.data[offset..end].to_vec(); |
| 636 | let size = (end - offset) as u32; |
| 637 | |
| 638 | let owner_entry = syms.iter().find(|(_, _, off)| *off as usize == offset); |
| 639 | let owner_idx = owner_entry.map(|(i, _, _)| *i); |
| 640 | |
| 641 | let mut flags = AtomFlags::default().with(AtomFlags::LITERAL); |
| 642 | if let Some((_, sym, _)) = owner_entry { |
| 643 | flags.set(symbol_flags(sym).bits()); |
| 644 | } |
| 645 | |
| 646 | let atom = Atom { |
| 647 | id: AtomId(0), |
| 648 | origin: input_id, |
| 649 | input_section: section_idx, |
| 650 | section: atom_section, |
| 651 | input_offset: offset as u32, |
| 652 | size, |
| 653 | align_pow2: sect.align_pow2 as u8, |
| 654 | owner: None, |
| 655 | alt_entries: Vec::new(), |
| 656 | data, |
| 657 | flags, |
| 658 | parent_of: None, |
| 659 | }; |
| 660 | let id = table.push(atom); |
| 661 | out.atoms.push(id); |
| 662 | if let Some(idx) = owner_idx { |
| 663 | out.owner_by_sym.push((idx, id)); |
| 664 | } |
| 665 | offset = end; |
| 666 | } |
| 667 | } |
| 668 | |
| 669 | #[allow(clippy::too_many_arguments)] |
| 670 | fn atomize_fixed_literal( |
| 671 | input_id: InputId, |
| 672 | section_idx: u8, |
| 673 | sect: &InputSection, |
| 674 | syms: &[(usize, &InputSymbol, u32)], |
| 675 | chunk_size: usize, |
| 676 | atom_section: AtomSection, |
| 677 | table: &mut AtomTable, |
| 678 | out: &mut ObjectAtomization, |
| 679 | ) { |
| 680 | let section_size = sect.size as usize; |
| 681 | let mut offset = 0usize; |
| 682 | while offset < section_size { |
| 683 | let end = (offset + chunk_size).min(section_size); |
| 684 | let data_end = end.min(sect.data.len()); |
| 685 | let data = if offset < data_end { |
| 686 | sect.data[offset..data_end].to_vec() |
| 687 | } else { |
| 688 | Vec::new() |
| 689 | }; |
| 690 | let size = (end - offset) as u32; |
| 691 | |
| 692 | let owner_entry = syms.iter().find(|(_, _, off)| *off as usize == offset); |
| 693 | let owner_idx = owner_entry.map(|(i, _, _)| *i); |
| 694 | |
| 695 | let mut flags = AtomFlags::default().with(AtomFlags::LITERAL); |
| 696 | if let Some((_, sym, _)) = owner_entry { |
| 697 | flags.set(symbol_flags(sym).bits()); |
| 698 | } |
| 699 | |
| 700 | let atom = Atom { |
| 701 | id: AtomId(0), |
| 702 | origin: input_id, |
| 703 | input_section: section_idx, |
| 704 | section: atom_section, |
| 705 | input_offset: offset as u32, |
| 706 | size, |
| 707 | align_pow2: sect.align_pow2 as u8, |
| 708 | owner: None, |
| 709 | alt_entries: Vec::new(), |
| 710 | data, |
| 711 | flags, |
| 712 | parent_of: None, |
| 713 | }; |
| 714 | let id = table.push(atom); |
| 715 | out.atoms.push(id); |
| 716 | if let Some(idx) = owner_idx { |
| 717 | out.owner_by_sym.push((idx, id)); |
| 718 | } |
| 719 | offset = end; |
| 720 | } |
| 721 | } |
| 722 | |
| 723 | /// Split `__compact_unwind` into 32-byte atoms (one per record). |
| 724 | /// `parent_of` is filled in post-hoc by `link_unwind_parents` once all |
| 725 | /// sections of this object have been atomized. |
| 726 | fn atomize_compact_unwind( |
| 727 | input_id: InputId, |
| 728 | section_idx: u8, |
| 729 | sect: &InputSection, |
| 730 | syms: &[(usize, &InputSymbol, u32)], |
| 731 | atom_section: AtomSection, |
| 732 | table: &mut AtomTable, |
| 733 | out: &mut ObjectAtomization, |
| 734 | ) { |
| 735 | const RECORD: usize = 32; |
| 736 | let section_size = sect.size as usize; |
| 737 | let mut offset = 0usize; |
| 738 | while offset < section_size { |
| 739 | let end = (offset + RECORD).min(section_size); |
| 740 | let data = sect.data[offset..end.min(sect.data.len())].to_vec(); |
| 741 | let size = (end - offset) as u32; |
| 742 | |
| 743 | let owner_idx = syms |
| 744 | .iter() |
| 745 | .find(|(_, _, off)| *off as usize == offset) |
| 746 | .map(|(i, _, _)| *i); |
| 747 | |
| 748 | let atom = Atom { |
| 749 | id: AtomId(0), |
| 750 | origin: input_id, |
| 751 | input_section: section_idx, |
| 752 | section: atom_section, |
| 753 | input_offset: offset as u32, |
| 754 | size, |
| 755 | align_pow2: sect.align_pow2 as u8, |
| 756 | owner: None, |
| 757 | alt_entries: Vec::new(), |
| 758 | data, |
| 759 | flags: AtomFlags::default(), |
| 760 | parent_of: None, // filled by link_unwind_parents |
| 761 | }; |
| 762 | let id = table.push(atom); |
| 763 | out.atoms.push(id); |
| 764 | if let Some(idx) = owner_idx { |
| 765 | out.owner_by_sym.push((idx, id)); |
| 766 | } |
| 767 | offset = end; |
| 768 | } |
| 769 | } |
| 770 | |
| 771 | fn atomize_zerofill( |
| 772 | input_id: InputId, |
| 773 | section_idx: u8, |
| 774 | sect: &InputSection, |
| 775 | syms: &[(usize, &InputSymbol, u32)], |
| 776 | atom_section: AtomSection, |
| 777 | table: &mut AtomTable, |
| 778 | out: &mut ObjectAtomization, |
| 779 | ) { |
| 780 | if syms.is_empty() { |
| 781 | let atom = build_section_atom(input_id, section_idx, sect, atom_section); |
| 782 | let id = table.push(atom); |
| 783 | out.atoms.push(id); |
| 784 | return; |
| 785 | } |
| 786 | let section_size = sect.size as u32; |
| 787 | for (i, (sym_idx, sym, start)) in syms.iter().enumerate() { |
| 788 | let start = *start; |
| 789 | let end = syms |
| 790 | .get(i + 1) |
| 791 | .map(|(_, _, off)| *off) |
| 792 | .unwrap_or(section_size); |
| 793 | let size = end.saturating_sub(start); |
| 794 | let atom = Atom { |
| 795 | id: AtomId(0), |
| 796 | origin: input_id, |
| 797 | input_section: section_idx, |
| 798 | section: atom_section, |
| 799 | input_offset: start, |
| 800 | size, |
| 801 | align_pow2: sect.align_pow2 as u8, |
| 802 | owner: Some(SymbolId(*sym_idx as u32)), |
| 803 | alt_entries: Vec::new(), |
| 804 | data: Vec::new(), // zerofill |
| 805 | flags: symbol_flags(sym), |
| 806 | parent_of: None, |
| 807 | }; |
| 808 | let id = table.push(atom); |
| 809 | out.atoms.push(id); |
| 810 | out.owner_by_sym.push((*sym_idx, id)); |
| 811 | } |
| 812 | } |
| 813 | |
| 814 | fn build_section_atom( |
| 815 | input_id: InputId, |
| 816 | section_idx: u8, |
| 817 | sect: &InputSection, |
| 818 | atom_section: AtomSection, |
| 819 | ) -> Atom { |
| 820 | let data = if atom_section.is_zerofill() { |
| 821 | Vec::new() |
| 822 | } else { |
| 823 | sect.data.clone() |
| 824 | }; |
| 825 | let mut flags = AtomFlags::default(); |
| 826 | if sect.kind == SectionKind::Text { |
| 827 | flags.set(AtomFlags::PURE_INSTRUCTIONS); |
| 828 | } |
| 829 | Atom { |
| 830 | id: AtomId(0), |
| 831 | origin: input_id, |
| 832 | input_section: section_idx, |
| 833 | section: atom_section, |
| 834 | input_offset: 0, |
| 835 | size: sect.size as u32, |
| 836 | align_pow2: sect.align_pow2 as u8, |
| 837 | owner: None, |
| 838 | alt_entries: Vec::new(), |
| 839 | data, |
| 840 | flags, |
| 841 | parent_of: None, |
| 842 | } |
| 843 | } |
| 844 | |
| 845 | #[allow(clippy::too_many_arguments)] |
| 846 | fn build_slice_atom( |
| 847 | input_id: InputId, |
| 848 | section_idx: u8, |
| 849 | sect: &InputSection, |
| 850 | atom_section: AtomSection, |
| 851 | offset: u32, |
| 852 | size: u32, |
| 853 | owner: Option<&InputSymbol>, |
| 854 | alt_entries: &[AltEntry], |
| 855 | ) -> Atom { |
| 856 | let data = if atom_section.is_zerofill() { |
| 857 | Vec::new() |
| 858 | } else { |
| 859 | let start = offset as usize; |
| 860 | let end = (offset + size) as usize; |
| 861 | sect.data[start..end.min(sect.data.len())].to_vec() |
| 862 | }; |
| 863 | let mut flags = AtomFlags::default(); |
| 864 | if sect.kind == SectionKind::Text { |
| 865 | flags.set(AtomFlags::PURE_INSTRUCTIONS); |
| 866 | } |
| 867 | if let Some(sym) = owner { |
| 868 | flags.set(symbol_flags(sym).bits()); |
| 869 | } |
| 870 | Atom { |
| 871 | id: AtomId(0), |
| 872 | origin: input_id, |
| 873 | input_section: section_idx, |
| 874 | section: atom_section, |
| 875 | input_offset: offset, |
| 876 | size, |
| 877 | align_pow2: sect.align_pow2 as u8, |
| 878 | // owner is wired at back-patch time via `backpatch_symbol_atoms`; |
| 879 | // atomization doesn't know the resolver-side SymbolId yet. |
| 880 | owner: None, |
| 881 | alt_entries: alt_entries.to_vec(), |
| 882 | data, |
| 883 | flags, |
| 884 | parent_of: None, |
| 885 | } |
| 886 | } |
| 887 | |
| 888 | fn symbol_flags(sym: &InputSymbol) -> AtomFlags { |
| 889 | let mut f = AtomFlags::default(); |
| 890 | if sym.no_dead_strip() { |
| 891 | f.set(AtomFlags::NO_DEAD_STRIP); |
| 892 | } |
| 893 | if sym.weak_def() { |
| 894 | f.set(AtomFlags::WEAK_DEF); |
| 895 | } |
| 896 | f |
| 897 | } |
| 898 | |
| 899 | /// Find the next non-alt_entry symbol starting from index `i`. Returns the |
| 900 | /// index (into `syms`), or `None` if every remaining symbol is an alt |
| 901 | /// entry. |
| 902 | fn find_next_non_alt_entry(syms: &[(usize, &InputSymbol, u32)], from: usize) -> Option<usize> { |
| 903 | syms.iter() |
| 904 | .enumerate() |
| 905 | .skip(from) |
| 906 | .find(|(_, (_, s, _))| !s.alt_entry()) |
| 907 | .map(|(i, _)| i) |
| 908 | } |
| 909 | |
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a minimal text atom with a zero-filled payload of `size` bytes
    /// and the `PURE_INSTRUCTIONS` flag set.
    fn make_text_atom(origin: InputId, sect: u8, off: u32, size: u32) -> Atom {
        let flags = AtomFlags::default().with(AtomFlags::PURE_INSTRUCTIONS);
        let data = vec![0u8; size as usize];
        Atom {
            // Placeholder only; `push` overwrites it with the real id.
            id: AtomId(0),
            origin,
            input_section: sect,
            section: AtomSection::Text,
            input_offset: off,
            size,
            align_pow2: 2,
            owner: None,
            alt_entries: Vec::new(),
            data,
            flags,
            parent_of: None,
        }
    }

    #[test]
    fn push_assigns_stable_one_based_ids_and_roundtrips_via_get() {
        let mut table = AtomTable::new();
        let first = table.push(make_text_atom(InputId(0), 1, 0, 16));
        let second = table.push(make_text_atom(InputId(0), 1, 16, 8));

        // Ids are handed out sequentially, starting at 1.
        assert_eq!(first.0, 1);
        assert_eq!(second.0, 2);
        assert_eq!(table.len(), 2);

        // Each id resolves back to the atom it was issued for.
        for (id, expected_offset) in [(first, 0), (second, 16)] {
            assert_eq!(table.get(id).input_offset, expected_offset);
        }
    }

    #[test]
    fn id_zero_is_reserved_as_placeholder() {
        // `Symbol::Defined { atom: AtomId(0) }` is the pre-atomization
        // sentinel, so the very first real atom must already get id 1.
        let mut table = AtomTable::new();
        let first = table.push(make_text_atom(InputId(0), 1, 0, 1));
        assert_ne!(first, AtomId(0));
        assert_eq!(first, AtomId(1));
    }

    #[test]
    fn atom_section_from_section_kind_covers_all_variants() {
        // (input section kind, expected atom section family)
        let cases = [
            (SectionKind::Text, AtomSection::Text),
            (SectionKind::CStringLiterals, AtomSection::CStringLiterals),
            (SectionKind::CompactUnwind, AtomSection::CompactUnwind),
            (SectionKind::ZeroFill, AtomSection::ZeroFill),
        ];
        for (kind, expected) in cases {
            assert_eq!(AtomSection::from_section_kind(kind), expected);
        }

        // Classification predicates on the mapped families.
        let zerofill = AtomSection::from_section_kind(SectionKind::ZeroFill);
        let cstrings = AtomSection::from_section_kind(SectionKind::CStringLiterals);
        let text = AtomSection::from_section_kind(SectionKind::Text);
        assert!(zerofill.is_zerofill());
        assert!(cstrings.is_literal());
        assert!(!text.is_literal());
    }

    #[test]
    fn atom_flags_bitwise() {
        let flags = AtomFlags::default()
            .with(AtomFlags::NO_DEAD_STRIP)
            .with(AtomFlags::WEAK_DEF);

        // Both bits that were set are reported...
        for bit in [AtomFlags::NO_DEAD_STRIP, AtomFlags::WEAK_DEF] {
            assert!(flags.has(bit));
        }
        // ...and a bit that was never set is not.
        assert!(!flags.has(AtomFlags::THREAD_LOCAL));
    }

    #[test]
    fn by_input_section_groups_by_origin_and_section_index() {
        let mut table = AtomTable::new();
        let a = table.push(make_text_atom(InputId(0), 1, 0, 4));
        let b = table.push(make_text_atom(InputId(0), 1, 4, 4));
        let c = table.push(make_text_atom(InputId(1), 1, 0, 4));

        let grouped = table.by_input_section();

        // Same section index, different inputs: the atoms land in separate
        // groups, each in push order.
        let first_input_key = (InputId(0), 1);
        assert_eq!(grouped.get(&first_input_key).unwrap(), &vec![a, b]);
        let second_input_key = (InputId(1), 1);
        assert_eq!(grouped.get(&second_input_key).unwrap(), &vec![c]);
    }
}
| 997 |