Object.h 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374
  1. //===- Object.h - Mach-O object file model ----------------------*- C++ -*-===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. #ifndef LLVM_OBJCOPY_MACHO_OBJECT_H
  9. #define LLVM_OBJCOPY_MACHO_OBJECT_H
  10. #include "llvm/ADT/Optional.h"
  11. #include "llvm/ADT/StringRef.h"
  12. #include "llvm/BinaryFormat/MachO.h"
  13. #include "llvm/MC/StringTableBuilder.h"
  14. #include "llvm/ObjectYAML/DWARFYAML.h"
  15. #include "llvm/Support/StringSaver.h"
  16. #include "llvm/Support/YAMLTraits.h"
  17. #include <cstdint>
  18. #include <string>
  19. #include <vector>
  20. namespace llvm {
  21. namespace objcopy {
  22. namespace macho {
  23. struct MachHeader {
  24. uint32_t Magic;
  25. uint32_t CPUType;
  26. uint32_t CPUSubType;
  27. uint32_t FileType;
  28. uint32_t NCmds;
  29. uint32_t SizeOfCmds;
  30. uint32_t Flags;
  31. uint32_t Reserved = 0;
  32. };
  33. struct RelocationInfo;
  34. struct Section {
  35. uint32_t Index;
  36. std::string Segname;
  37. std::string Sectname;
  38. // CanonicalName is a string formatted as “<Segname>,<Sectname>".
  39. std::string CanonicalName;
  40. uint64_t Addr = 0;
  41. uint64_t Size = 0;
  42. // Offset in the input file.
  43. Optional<uint32_t> OriginalOffset;
  44. uint32_t Offset = 0;
  45. uint32_t Align = 0;
  46. uint32_t RelOff = 0;
  47. uint32_t NReloc = 0;
  48. uint32_t Flags = 0;
  49. uint32_t Reserved1 = 0;
  50. uint32_t Reserved2 = 0;
  51. uint32_t Reserved3 = 0;
  52. StringRef Content;
  53. std::vector<RelocationInfo> Relocations;
  54. Section(StringRef SegName, StringRef SectName)
  55. : Segname(std::string(SegName)), Sectname(std::string(SectName)),
  56. CanonicalName((Twine(SegName) + Twine(',') + SectName).str()) {}
  57. Section(StringRef SegName, StringRef SectName, StringRef Content)
  58. : Segname(std::string(SegName)), Sectname(std::string(SectName)),
  59. CanonicalName((Twine(SegName) + Twine(',') + SectName).str()),
  60. Content(Content) {}
  61. MachO::SectionType getType() const {
  62. return static_cast<MachO::SectionType>(Flags & MachO::SECTION_TYPE);
  63. }
  64. bool isVirtualSection() const {
  65. return (getType() == MachO::S_ZEROFILL ||
  66. getType() == MachO::S_GB_ZEROFILL ||
  67. getType() == MachO::S_THREAD_LOCAL_ZEROFILL);
  68. }
  69. bool hasValidOffset() const {
  70. return !(isVirtualSection() || (OriginalOffset && *OriginalOffset == 0));
  71. }
  72. };
  73. struct LoadCommand {
  74. // The type MachO::macho_load_command is defined in llvm/BinaryFormat/MachO.h
  75. // and it is a union of all the structs corresponding to various load
  76. // commands.
  77. MachO::macho_load_command MachOLoadCommand;
  78. // The raw content of the payload of the load command (located right after the
  79. // corresponding struct). In some cases it is either empty or can be
  80. // copied-over without digging into its structure.
  81. std::vector<uint8_t> Payload;
  82. // Some load commands can contain (inside the payload) an array of sections,
  83. // though the contents of the sections are stored separately. The struct
  84. // Section describes only sections' metadata and where to find the
  85. // corresponding content inside the binary.
  86. std::vector<std::unique_ptr<Section>> Sections;
  87. // Returns the segment name if the load command is a segment command.
  88. Optional<StringRef> getSegmentName() const;
  89. // Returns the segment vm address if the load command is a segment command.
  90. Optional<uint64_t> getSegmentVMAddr() const;
  91. };
  92. // A symbol information. Fields which starts with "n_" are same as them in the
  93. // nlist.
  94. struct SymbolEntry {
  95. std::string Name;
  96. bool Referenced = false;
  97. uint32_t Index;
  98. uint8_t n_type;
  99. uint8_t n_sect;
  100. uint16_t n_desc;
  101. uint64_t n_value;
  102. bool isExternalSymbol() const { return n_type & MachO::N_EXT; }
  103. bool isLocalSymbol() const { return !isExternalSymbol(); }
  104. bool isUndefinedSymbol() const {
  105. return (n_type & MachO::N_TYPE) == MachO::N_UNDF;
  106. }
  107. bool isSwiftSymbol() const {
  108. return StringRef(Name).startswith("_$s") ||
  109. StringRef(Name).startswith("_$S");
  110. }
  111. Optional<uint32_t> section() const {
  112. return n_sect == MachO::NO_SECT ? None : Optional<uint32_t>(n_sect);
  113. }
  114. };
  115. /// The location of the symbol table inside the binary is described by LC_SYMTAB
  116. /// load command.
  117. struct SymbolTable {
  118. std::vector<std::unique_ptr<SymbolEntry>> Symbols;
  119. using iterator = pointee_iterator<
  120. std::vector<std::unique_ptr<SymbolEntry>>::const_iterator>;
  121. iterator begin() const { return iterator(Symbols.begin()); }
  122. iterator end() const { return iterator(Symbols.end()); }
  123. const SymbolEntry *getSymbolByIndex(uint32_t Index) const;
  124. SymbolEntry *getSymbolByIndex(uint32_t Index);
  125. void removeSymbols(
  126. function_ref<bool(const std::unique_ptr<SymbolEntry> &)> ToRemove);
  127. };
  128. struct IndirectSymbolEntry {
  129. // The original value in an indirect symbol table. Higher bits encode extra
  130. // information (INDIRECT_SYMBOL_LOCAL and INDIRECT_SYMBOL_ABS).
  131. uint32_t OriginalIndex;
  132. /// The Symbol referenced by this entry. It's None if the index is
  133. /// INDIRECT_SYMBOL_LOCAL or INDIRECT_SYMBOL_ABS.
  134. Optional<SymbolEntry *> Symbol;
  135. IndirectSymbolEntry(uint32_t OriginalIndex, Optional<SymbolEntry *> Symbol)
  136. : OriginalIndex(OriginalIndex), Symbol(Symbol) {}
  137. };
  138. struct IndirectSymbolTable {
  139. std::vector<IndirectSymbolEntry> Symbols;
  140. };
  141. /// The location of the string table inside the binary is described by LC_SYMTAB
  142. /// load command.
  143. struct StringTable {
  144. std::vector<std::string> Strings;
  145. };
  146. struct RelocationInfo {
  147. // The referenced symbol entry. Set if !Scattered && Extern.
  148. Optional<const SymbolEntry *> Symbol;
  149. // The referenced section. Set if !Scattered && !Extern.
  150. Optional<const Section *> Sec;
  151. // True if Info is a scattered_relocation_info.
  152. bool Scattered;
  153. // True if the type is an ADDEND. r_symbolnum holds the addend instead of a
  154. // symbol index.
  155. bool IsAddend;
  156. // True if the r_symbolnum points to a section number (i.e. r_extern=0).
  157. bool Extern;
  158. MachO::any_relocation_info Info;
  159. unsigned getPlainRelocationSymbolNum(bool IsLittleEndian) {
  160. if (IsLittleEndian)
  161. return Info.r_word1 & 0xffffff;
  162. return Info.r_word1 >> 8;
  163. }
  164. void setPlainRelocationSymbolNum(unsigned SymbolNum, bool IsLittleEndian) {
  165. assert(SymbolNum < (1 << 24) && "SymbolNum out of range");
  166. if (IsLittleEndian)
  167. Info.r_word1 = (Info.r_word1 & ~0x00ffffff) | SymbolNum;
  168. else
  169. Info.r_word1 = (Info.r_word1 & ~0xffffff00) | (SymbolNum << 8);
  170. }
  171. };
  172. /// The location of the rebase info inside the binary is described by
  173. /// LC_DYLD_INFO load command. Dyld rebases an image whenever dyld loads it at
  174. /// an address different from its preferred address. The rebase information is
  175. /// a stream of byte sized opcodes whose symbolic names start with
  176. /// REBASE_OPCODE_. Conceptually the rebase information is a table of tuples:
  177. /// <seg-index, seg-offset, type>
  178. /// The opcodes are a compressed way to encode the table by only
  179. /// encoding when a column changes. In addition simple patterns
  180. /// like "every n'th offset for m times" can be encoded in a few
  181. /// bytes.
  182. struct RebaseInfo {
  183. // At the moment we do not parse this info (and it is simply copied over),
  184. // but the proper support will be added later.
  185. ArrayRef<uint8_t> Opcodes;
  186. };
  187. /// The location of the bind info inside the binary is described by
  188. /// LC_DYLD_INFO load command. Dyld binds an image during the loading process,
  189. /// if the image requires any pointers to be initialized to symbols in other
  190. /// images. The bind information is a stream of byte sized opcodes whose
  191. /// symbolic names start with BIND_OPCODE_. Conceptually the bind information is
  192. /// a table of tuples: <seg-index, seg-offset, type, symbol-library-ordinal,
  193. /// symbol-name, addend> The opcodes are a compressed way to encode the table by
  194. /// only encoding when a column changes. In addition simple patterns like for
  195. /// runs of pointers initialized to the same value can be encoded in a few
  196. /// bytes.
  197. struct BindInfo {
  198. // At the moment we do not parse this info (and it is simply copied over),
  199. // but the proper support will be added later.
  200. ArrayRef<uint8_t> Opcodes;
  201. };
  202. /// The location of the weak bind info inside the binary is described by
  203. /// LC_DYLD_INFO load command. Some C++ programs require dyld to unique symbols
  204. /// so that all images in the process use the same copy of some code/data. This
  205. /// step is done after binding. The content of the weak_bind info is an opcode
  206. /// stream like the bind_info. But it is sorted alphabetically by symbol name.
  207. /// This enable dyld to walk all images with weak binding information in order
  208. /// and look for collisions. If there are no collisions, dyld does no updating.
  209. /// That means that some fixups are also encoded in the bind_info. For
  210. /// instance, all calls to "operator new" are first bound to libstdc++.dylib
  211. /// using the information in bind_info. Then if some image overrides operator
  212. /// new that is detected when the weak_bind information is processed and the
  213. /// call to operator new is then rebound.
  214. struct WeakBindInfo {
  215. // At the moment we do not parse this info (and it is simply copied over),
  216. // but the proper support will be added later.
  217. ArrayRef<uint8_t> Opcodes;
  218. };
  219. /// The location of the lazy bind info inside the binary is described by
  220. /// LC_DYLD_INFO load command. Some uses of external symbols do not need to be
  221. /// bound immediately. Instead they can be lazily bound on first use. The
  222. /// lazy_bind contains a stream of BIND opcodes to bind all lazy symbols. Normal
  223. /// use is that dyld ignores the lazy_bind section when loading an image.
  224. /// Instead the static linker arranged for the lazy pointer to initially point
  225. /// to a helper function which pushes the offset into the lazy_bind area for the
  226. /// symbol needing to be bound, then jumps to dyld which simply adds the offset
  227. /// to lazy_bind_off to get the information on what to bind.
  228. struct LazyBindInfo {
  229. ArrayRef<uint8_t> Opcodes;
  230. };
  231. /// The location of the export info inside the binary is described by
  232. /// LC_DYLD_INFO load command. The symbols exported by a dylib are encoded in a
  233. /// trie. This is a compact representation that factors out common prefixes. It
  234. /// also reduces LINKEDIT pages in RAM because it encodes all information (name,
  235. /// address, flags) in one small, contiguous range. The export area is a stream
  236. /// of nodes. The first node sequentially is the start node for the trie. Nodes
  237. /// for a symbol start with a uleb128 that is the length of the exported symbol
  238. /// information for the string so far. If there is no exported symbol, the node
  239. /// starts with a zero byte. If there is exported info, it follows the length.
  240. /// First is a uleb128 containing flags. Normally, it is followed by
  241. /// a uleb128 encoded offset which is location of the content named
  242. /// by the symbol from the mach_header for the image. If the flags
  243. /// is EXPORT_SYMBOL_FLAGS_REEXPORT, then following the flags is
  244. /// a uleb128 encoded library ordinal, then a zero terminated
  245. /// UTF8 string. If the string is zero length, then the symbol
  246. /// is re-export from the specified dylib with the same name.
  247. /// If the flags is EXPORT_SYMBOL_FLAGS_STUB_AND_RESOLVER, then following
  248. /// the flags is two uleb128s: the stub offset and the resolver offset.
  249. /// The stub is used by non-lazy pointers. The resolver is used
  250. /// by lazy pointers and must be called to get the actual address to use.
  251. /// After the optional exported symbol information is a byte of
  252. /// how many edges (0-255) that this node has leaving it,
  253. /// followed by each edge.
  254. /// Each edge is a zero terminated UTF8 of the addition chars
  255. /// in the symbol, followed by a uleb128 offset for the node that
  256. /// edge points to.
  257. struct ExportInfo {
  258. ArrayRef<uint8_t> Trie;
  259. };
  260. struct LinkData {
  261. ArrayRef<uint8_t> Data;
  262. };
  263. struct Object {
  264. MachHeader Header;
  265. std::vector<LoadCommand> LoadCommands;
  266. SymbolTable SymTable;
  267. StringTable StrTable;
  268. RebaseInfo Rebases;
  269. BindInfo Binds;
  270. WeakBindInfo WeakBinds;
  271. LazyBindInfo LazyBinds;
  272. ExportInfo Exports;
  273. IndirectSymbolTable IndirectSymTable;
  274. LinkData DataInCode;
  275. LinkData LinkerOptimizationHint;
  276. LinkData FunctionStarts;
  277. LinkData ExportsTrie;
  278. LinkData ChainedFixups;
  279. Optional<uint32_t> SwiftVersion;
  280. /// The index of LC_CODE_SIGNATURE load command if present.
  281. Optional<size_t> CodeSignatureCommandIndex;
  282. /// The index of LC_SYMTAB load command if present.
  283. Optional<size_t> SymTabCommandIndex;
  284. /// The index of LC_DYLD_INFO or LC_DYLD_INFO_ONLY load command if present.
  285. Optional<size_t> DyLdInfoCommandIndex;
  286. /// The index LC_DYSYMTAB load command if present.
  287. Optional<size_t> DySymTabCommandIndex;
  288. /// The index LC_DATA_IN_CODE load command if present.
  289. Optional<size_t> DataInCodeCommandIndex;
  290. /// The index of LC_LINKER_OPTIMIZATIN_HINT load command if present.
  291. Optional<size_t> LinkerOptimizationHintCommandIndex;
  292. /// The index LC_FUNCTION_STARTS load command if present.
  293. Optional<size_t> FunctionStartsCommandIndex;
  294. /// The index LC_DYLD_CHAINED_FIXUPS load command if present.
  295. Optional<size_t> ChainedFixupsCommandIndex;
  296. /// The index LC_DYLD_EXPORTS_TRIE load command if present.
  297. Optional<size_t> ExportsTrieCommandIndex;
  298. /// The index of the LC_SEGMENT or LC_SEGMENT_64 load command
  299. /// corresponding to the __TEXT segment.
  300. Optional<size_t> TextSegmentCommandIndex;
  301. BumpPtrAllocator Alloc;
  302. StringSaver NewSectionsContents;
  303. Object() : NewSectionsContents(Alloc) {}
  304. Error
  305. removeSections(function_ref<bool(const std::unique_ptr<Section> &)> ToRemove);
  306. Error removeLoadCommands(function_ref<bool(const LoadCommand &)> ToRemove);
  307. void updateLoadCommandIndexes();
  308. /// Creates a new segment load command in the object and returns a reference
  309. /// to the newly created load command. The caller should verify that SegName
  310. /// is not too long (SegName.size() should be less than or equal to 16).
  311. LoadCommand &addSegment(StringRef SegName, uint64_t SegVMSize);
  312. bool is64Bit() const {
  313. return Header.Magic == MachO::MH_MAGIC_64 ||
  314. Header.Magic == MachO::MH_CIGAM_64;
  315. }
  316. uint64_t nextAvailableSegmentAddress() const;
  317. };
  318. } // end namespace macho
  319. } // end namespace objcopy
  320. } // end namespace llvm
  321. #endif // LLVM_OBJCOPY_MACHO_OBJECT_H