MachOObject.h 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373
  1. //===- MachOObject.h - Mach-O object file model -----------------*- C++ -*-===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. #ifndef LLVM_LIB_OBJCOPY_MACHO_MACHOOBJECT_H
  9. #define LLVM_LIB_OBJCOPY_MACHO_MACHOOBJECT_H
  #include "llvm/ADT/StringRef.h"
  #include "llvm/BinaryFormat/MachO.h"
  #include "llvm/MC/StringTableBuilder.h"
  #include "llvm/ObjectYAML/DWARFYAML.h"
  #include "llvm/Support/StringSaver.h"
  #include "llvm/Support/YAMLTraits.h"
  #include <cassert>
  #include <cstdint>
  #include <memory>
  #include <optional>
  #include <string>
  #include <vector>
  19. namespace llvm {
  20. namespace objcopy {
  21. namespace macho {
  /// In-memory copy of the Mach-O header fields.
  ///
  /// Every field is zero-initialized so that a default-constructed header never
  /// holds indeterminate values, e.g. if Magic is inspected before the header
  /// has been populated. The struct remains an aggregate, so brace
  /// initialization with explicit values keeps working.
  struct MachHeader {
    uint32_t Magic = 0;
    uint32_t CPUType = 0;
    uint32_t CPUSubType = 0;
    uint32_t FileType = 0;
    uint32_t NCmds = 0;
    uint32_t SizeOfCmds = 0;
    uint32_t Flags = 0;
    uint32_t Reserved = 0;
  };
  32. struct RelocationInfo;
  33. struct Section {
  34. uint32_t Index;
  35. std::string Segname;
  36. std::string Sectname;
  37. // CanonicalName is a string formatted as “<Segname>,<Sectname>".
  38. std::string CanonicalName;
  39. uint64_t Addr = 0;
  40. uint64_t Size = 0;
  41. // Offset in the input file.
  42. std::optional<uint32_t> OriginalOffset;
  43. uint32_t Offset = 0;
  44. uint32_t Align = 0;
  45. uint32_t RelOff = 0;
  46. uint32_t NReloc = 0;
  47. uint32_t Flags = 0;
  48. uint32_t Reserved1 = 0;
  49. uint32_t Reserved2 = 0;
  50. uint32_t Reserved3 = 0;
  51. StringRef Content;
  52. std::vector<RelocationInfo> Relocations;
  53. Section(StringRef SegName, StringRef SectName);
  54. Section(StringRef SegName, StringRef SectName, StringRef Content);
  55. MachO::SectionType getType() const {
  56. return static_cast<MachO::SectionType>(Flags & MachO::SECTION_TYPE);
  57. }
  58. bool isVirtualSection() const {
  59. return (getType() == MachO::S_ZEROFILL ||
  60. getType() == MachO::S_GB_ZEROFILL ||
  61. getType() == MachO::S_THREAD_LOCAL_ZEROFILL);
  62. }
  63. bool hasValidOffset() const {
  64. return !(isVirtualSection() || (OriginalOffset && *OriginalOffset == 0));
  65. }
  66. };
  67. struct LoadCommand {
  68. // The type MachO::macho_load_command is defined in llvm/BinaryFormat/MachO.h
  69. // and it is a union of all the structs corresponding to various load
  70. // commands.
  71. MachO::macho_load_command MachOLoadCommand;
  72. // The raw content of the payload of the load command (located right after the
  73. // corresponding struct). In some cases it is either empty or can be
  74. // copied-over without digging into its structure.
  75. std::vector<uint8_t> Payload;
  76. // Some load commands can contain (inside the payload) an array of sections,
  77. // though the contents of the sections are stored separately. The struct
  78. // Section describes only sections' metadata and where to find the
  79. // corresponding content inside the binary.
  80. std::vector<std::unique_ptr<Section>> Sections;
  81. // Returns the segment name if the load command is a segment command.
  82. std::optional<StringRef> getSegmentName() const;
  83. // Returns the segment vm address if the load command is a segment command.
  84. std::optional<uint64_t> getSegmentVMAddr() const;
  85. };
  86. // A symbol information. Fields which starts with "n_" are same as them in the
  87. // nlist.
  88. struct SymbolEntry {
  89. std::string Name;
  90. bool Referenced = false;
  91. uint32_t Index;
  92. uint8_t n_type;
  93. uint8_t n_sect;
  94. uint16_t n_desc;
  95. uint64_t n_value;
  96. bool isExternalSymbol() const { return n_type & MachO::N_EXT; }
  97. bool isLocalSymbol() const { return !isExternalSymbol(); }
  98. bool isUndefinedSymbol() const {
  99. return (n_type & MachO::N_TYPE) == MachO::N_UNDF;
  100. }
  101. bool isSwiftSymbol() const {
  102. return StringRef(Name).startswith("_$s") ||
  103. StringRef(Name).startswith("_$S");
  104. }
  105. std::optional<uint32_t> section() const {
  106. return n_sect == MachO::NO_SECT ? std::nullopt
  107. : std::optional<uint32_t>(n_sect);
  108. }
  109. };
  110. /// The location of the symbol table inside the binary is described by LC_SYMTAB
  111. /// load command.
  112. struct SymbolTable {
  113. std::vector<std::unique_ptr<SymbolEntry>> Symbols;
  114. using iterator = pointee_iterator<
  115. std::vector<std::unique_ptr<SymbolEntry>>::const_iterator>;
  116. iterator begin() const { return iterator(Symbols.begin()); }
  117. iterator end() const { return iterator(Symbols.end()); }
  118. const SymbolEntry *getSymbolByIndex(uint32_t Index) const;
  119. SymbolEntry *getSymbolByIndex(uint32_t Index);
  120. void removeSymbols(
  121. function_ref<bool(const std::unique_ptr<SymbolEntry> &)> ToRemove);
  122. };
  123. struct IndirectSymbolEntry {
  124. // The original value in an indirect symbol table. Higher bits encode extra
  125. // information (INDIRECT_SYMBOL_LOCAL and INDIRECT_SYMBOL_ABS).
  126. uint32_t OriginalIndex;
  127. /// The Symbol referenced by this entry. It's std::nullopt if the index is
  128. /// INDIRECT_SYMBOL_LOCAL or INDIRECT_SYMBOL_ABS.
  129. std::optional<SymbolEntry *> Symbol;
  130. IndirectSymbolEntry(uint32_t OriginalIndex,
  131. std::optional<SymbolEntry *> Symbol)
  132. : OriginalIndex(OriginalIndex), Symbol(Symbol) {}
  133. };
  134. struct IndirectSymbolTable {
  135. std::vector<IndirectSymbolEntry> Symbols;
  136. };
  /// The string table. Where it lives inside the binary is described by the
  /// LC_SYMTAB load command.
  struct StringTable {
    // The strings held by the table, one element per string.
    std::vector<std::string> Strings;
  };
  142. struct RelocationInfo {
  143. // The referenced symbol entry. Set if !Scattered && Extern.
  144. std::optional<const SymbolEntry *> Symbol;
  145. // The referenced section. Set if !Scattered && !Extern.
  146. std::optional<const Section *> Sec;
  147. // True if Info is a scattered_relocation_info.
  148. bool Scattered;
  149. // True if the type is an ADDEND. r_symbolnum holds the addend instead of a
  150. // symbol index.
  151. bool IsAddend;
  152. // True if the r_symbolnum points to a section number (i.e. r_extern=0).
  153. bool Extern;
  154. MachO::any_relocation_info Info;
  155. unsigned getPlainRelocationSymbolNum(bool IsLittleEndian) {
  156. if (IsLittleEndian)
  157. return Info.r_word1 & 0xffffff;
  158. return Info.r_word1 >> 8;
  159. }
  160. void setPlainRelocationSymbolNum(unsigned SymbolNum, bool IsLittleEndian) {
  161. assert(SymbolNum < (1 << 24) && "SymbolNum out of range");
  162. if (IsLittleEndian)
  163. Info.r_word1 = (Info.r_word1 & ~0x00ffffff) | SymbolNum;
  164. else
  165. Info.r_word1 = (Info.r_word1 & ~0xffffff00) | (SymbolNum << 8);
  166. }
  167. };
  168. /// The location of the rebase info inside the binary is described by
  169. /// LC_DYLD_INFO load command. Dyld rebases an image whenever dyld loads it at
  170. /// an address different from its preferred address. The rebase information is
  171. /// a stream of byte sized opcodes whose symbolic names start with
  172. /// REBASE_OPCODE_. Conceptually the rebase information is a table of tuples:
  173. /// <seg-index, seg-offset, type>
  174. /// The opcodes are a compressed way to encode the table by only
  175. /// encoding when a column changes. In addition simple patterns
  176. /// like "every n'th offset for m times" can be encoded in a few
  177. /// bytes.
  178. struct RebaseInfo {
  179. // At the moment we do not parse this info (and it is simply copied over),
  180. // but the proper support will be added later.
  181. ArrayRef<uint8_t> Opcodes;
  182. };
  183. /// The location of the bind info inside the binary is described by
  184. /// LC_DYLD_INFO load command. Dyld binds an image during the loading process,
  185. /// if the image requires any pointers to be initialized to symbols in other
  186. /// images. The bind information is a stream of byte sized opcodes whose
  187. /// symbolic names start with BIND_OPCODE_. Conceptually the bind information is
  188. /// a table of tuples: <seg-index, seg-offset, type, symbol-library-ordinal,
  189. /// symbol-name, addend> The opcodes are a compressed way to encode the table by
  190. /// only encoding when a column changes. In addition simple patterns like for
  191. /// runs of pointers initialized to the same value can be encoded in a few
  192. /// bytes.
  193. struct BindInfo {
  194. // At the moment we do not parse this info (and it is simply copied over),
  195. // but the proper support will be added later.
  196. ArrayRef<uint8_t> Opcodes;
  197. };
  198. /// The location of the weak bind info inside the binary is described by
  199. /// LC_DYLD_INFO load command. Some C++ programs require dyld to unique symbols
  200. /// so that all images in the process use the same copy of some code/data. This
  201. /// step is done after binding. The content of the weak_bind info is an opcode
  202. /// stream like the bind_info. But it is sorted alphabetically by symbol name.
  203. /// This enable dyld to walk all images with weak binding information in order
  204. /// and look for collisions. If there are no collisions, dyld does no updating.
  205. /// That means that some fixups are also encoded in the bind_info. For
  206. /// instance, all calls to "operator new" are first bound to libstdc++.dylib
  207. /// using the information in bind_info. Then if some image overrides operator
  208. /// new that is detected when the weak_bind information is processed and the
  209. /// call to operator new is then rebound.
  210. struct WeakBindInfo {
  211. // At the moment we do not parse this info (and it is simply copied over),
  212. // but the proper support will be added later.
  213. ArrayRef<uint8_t> Opcodes;
  214. };
  215. /// The location of the lazy bind info inside the binary is described by
  216. /// LC_DYLD_INFO load command. Some uses of external symbols do not need to be
  217. /// bound immediately. Instead they can be lazily bound on first use. The
  218. /// lazy_bind contains a stream of BIND opcodes to bind all lazy symbols. Normal
  219. /// use is that dyld ignores the lazy_bind section when loading an image.
  220. /// Instead the static linker arranged for the lazy pointer to initially point
  221. /// to a helper function which pushes the offset into the lazy_bind area for the
  222. /// symbol needing to be bound, then jumps to dyld which simply adds the offset
  223. /// to lazy_bind_off to get the information on what to bind.
  224. struct LazyBindInfo {
  225. ArrayRef<uint8_t> Opcodes;
  226. };
  227. /// The location of the export info inside the binary is described by
  228. /// LC_DYLD_INFO load command. The symbols exported by a dylib are encoded in a
  229. /// trie. This is a compact representation that factors out common prefixes. It
  230. /// also reduces LINKEDIT pages in RAM because it encodes all information (name,
  231. /// address, flags) in one small, contiguous range. The export area is a stream
  232. /// of nodes. The first node sequentially is the start node for the trie. Nodes
  233. /// for a symbol start with a uleb128 that is the length of the exported symbol
  234. /// information for the string so far. If there is no exported symbol, the node
  235. /// starts with a zero byte. If there is exported info, it follows the length.
  236. /// First is a uleb128 containing flags. Normally, it is followed by
  237. /// a uleb128 encoded offset which is location of the content named
  238. /// by the symbol from the mach_header for the image. If the flags
  239. /// is EXPORT_SYMBOL_FLAGS_REEXPORT, then following the flags is
  240. /// a uleb128 encoded library ordinal, then a zero terminated
  241. /// UTF8 string. If the string is zero length, then the symbol
  242. /// is re-export from the specified dylib with the same name.
  243. /// If the flags is EXPORT_SYMBOL_FLAGS_STUB_AND_RESOLVER, then following
  244. /// the flags is two uleb128s: the stub offset and the resolver offset.
  245. /// The stub is used by non-lazy pointers. The resolver is used
  246. /// by lazy pointers and must be called to get the actual address to use.
  247. /// After the optional exported symbol information is a byte of
  248. /// how many edges (0-255) that this node has leaving it,
  249. /// followed by each edge.
  250. /// Each edge is a zero terminated UTF8 of the addition chars
  251. /// in the symbol, followed by a uleb128 offset for the node that
  252. /// edge points to.
  253. struct ExportInfo {
  254. ArrayRef<uint8_t> Trie;
  255. };
  256. struct LinkData {
  257. ArrayRef<uint8_t> Data;
  258. };
  259. struct Object {
  260. MachHeader Header;
  261. std::vector<LoadCommand> LoadCommands;
  262. SymbolTable SymTable;
  263. StringTable StrTable;
  264. RebaseInfo Rebases;
  265. BindInfo Binds;
  266. WeakBindInfo WeakBinds;
  267. LazyBindInfo LazyBinds;
  268. ExportInfo Exports;
  269. IndirectSymbolTable IndirectSymTable;
  270. LinkData DataInCode;
  271. LinkData LinkerOptimizationHint;
  272. LinkData FunctionStarts;
  273. LinkData ExportsTrie;
  274. LinkData ChainedFixups;
  275. LinkData DylibCodeSignDRs;
  276. std::optional<uint32_t> SwiftVersion;
  277. /// The index of LC_CODE_SIGNATURE load command if present.
  278. std::optional<size_t> CodeSignatureCommandIndex;
  279. /// The index of LC_DYLIB_CODE_SIGN_DRS load command if present.
  280. std::optional<size_t> DylibCodeSignDRsIndex;
  281. /// The index of LC_SYMTAB load command if present.
  282. std::optional<size_t> SymTabCommandIndex;
  283. /// The index of LC_DYLD_INFO or LC_DYLD_INFO_ONLY load command if present.
  284. std::optional<size_t> DyLdInfoCommandIndex;
  285. /// The index LC_DYSYMTAB load command if present.
  286. std::optional<size_t> DySymTabCommandIndex;
  287. /// The index LC_DATA_IN_CODE load command if present.
  288. std::optional<size_t> DataInCodeCommandIndex;
  289. /// The index of LC_LINKER_OPTIMIZATIN_HINT load command if present.
  290. std::optional<size_t> LinkerOptimizationHintCommandIndex;
  291. /// The index LC_FUNCTION_STARTS load command if present.
  292. std::optional<size_t> FunctionStartsCommandIndex;
  293. /// The index LC_DYLD_CHAINED_FIXUPS load command if present.
  294. std::optional<size_t> ChainedFixupsCommandIndex;
  295. /// The index LC_DYLD_EXPORTS_TRIE load command if present.
  296. std::optional<size_t> ExportsTrieCommandIndex;
  297. /// The index of the LC_SEGMENT or LC_SEGMENT_64 load command
  298. /// corresponding to the __TEXT segment.
  299. std::optional<size_t> TextSegmentCommandIndex;
  300. BumpPtrAllocator Alloc;
  301. StringSaver NewSectionsContents;
  302. Object() : NewSectionsContents(Alloc) {}
  303. Error
  304. removeSections(function_ref<bool(const std::unique_ptr<Section> &)> ToRemove);
  305. Error removeLoadCommands(function_ref<bool(const LoadCommand &)> ToRemove);
  306. void updateLoadCommandIndexes();
  307. /// Creates a new segment load command in the object and returns a reference
  308. /// to the newly created load command. The caller should verify that SegName
  309. /// is not too long (SegName.size() should be less than or equal to 16).
  310. LoadCommand &addSegment(StringRef SegName, uint64_t SegVMSize);
  311. bool is64Bit() const {
  312. return Header.Magic == MachO::MH_MAGIC_64 ||
  313. Header.Magic == MachO::MH_CIGAM_64;
  314. }
  315. uint64_t nextAvailableSegmentAddress() const;
  316. };
  317. } // end namespace macho
  318. } // end namespace objcopy
  319. } // end namespace llvm
  320. #endif // LLVM_LIB_OBJCOPY_MACHO_MACHOOBJECT_H