Object.h 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369
  1. //===- Object.h - Mach-O object file model ----------------------*- C++ -*-===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. #ifndef LLVM_OBJCOPY_MACHO_OBJECT_H
  9. #define LLVM_OBJCOPY_MACHO_OBJECT_H
  10. #include "llvm/ADT/Optional.h"
  11. #include "llvm/ADT/StringRef.h"
  12. #include "llvm/BinaryFormat/MachO.h"
  13. #include "llvm/MC/StringTableBuilder.h"
  14. #include "llvm/ObjectYAML/DWARFYAML.h"
  15. #include "llvm/Support/StringSaver.h"
  16. #include "llvm/Support/YAMLTraits.h"
  17. #include <cstdint>
  18. #include <string>
  19. #include <vector>
  20. namespace llvm {
  21. namespace objcopy {
  22. namespace macho {
  23. struct MachHeader {
  24. uint32_t Magic;
  25. uint32_t CPUType;
  26. uint32_t CPUSubType;
  27. uint32_t FileType;
  28. uint32_t NCmds;
  29. uint32_t SizeOfCmds;
  30. uint32_t Flags;
  31. uint32_t Reserved = 0;
  32. };
  33. struct RelocationInfo;
  34. struct Section {
  35. uint32_t Index;
  36. std::string Segname;
  37. std::string Sectname;
  38. // CanonicalName is a string formatted as “<Segname>,<Sectname>".
  39. std::string CanonicalName;
  40. uint64_t Addr = 0;
  41. uint64_t Size = 0;
  42. // Offset in the input file.
  43. Optional<uint32_t> OriginalOffset;
  44. uint32_t Offset = 0;
  45. uint32_t Align = 0;
  46. uint32_t RelOff = 0;
  47. uint32_t NReloc = 0;
  48. uint32_t Flags = 0;
  49. uint32_t Reserved1 = 0;
  50. uint32_t Reserved2 = 0;
  51. uint32_t Reserved3 = 0;
  52. StringRef Content;
  53. std::vector<RelocationInfo> Relocations;
  54. Section(StringRef SegName, StringRef SectName);
  55. Section(StringRef SegName, StringRef SectName, StringRef Content);
  56. MachO::SectionType getType() const {
  57. return static_cast<MachO::SectionType>(Flags & MachO::SECTION_TYPE);
  58. }
  59. bool isVirtualSection() const {
  60. return (getType() == MachO::S_ZEROFILL ||
  61. getType() == MachO::S_GB_ZEROFILL ||
  62. getType() == MachO::S_THREAD_LOCAL_ZEROFILL);
  63. }
  64. bool hasValidOffset() const {
  65. return !(isVirtualSection() || (OriginalOffset && *OriginalOffset == 0));
  66. }
  67. };
  68. struct LoadCommand {
  69. // The type MachO::macho_load_command is defined in llvm/BinaryFormat/MachO.h
  70. // and it is a union of all the structs corresponding to various load
  71. // commands.
  72. MachO::macho_load_command MachOLoadCommand;
  73. // The raw content of the payload of the load command (located right after the
  74. // corresponding struct). In some cases it is either empty or can be
  75. // copied-over without digging into its structure.
  76. std::vector<uint8_t> Payload;
  77. // Some load commands can contain (inside the payload) an array of sections,
  78. // though the contents of the sections are stored separately. The struct
  79. // Section describes only sections' metadata and where to find the
  80. // corresponding content inside the binary.
  81. std::vector<std::unique_ptr<Section>> Sections;
  82. // Returns the segment name if the load command is a segment command.
  83. Optional<StringRef> getSegmentName() const;
  84. // Returns the segment vm address if the load command is a segment command.
  85. Optional<uint64_t> getSegmentVMAddr() const;
  86. };
  87. // A symbol information. Fields which starts with "n_" are same as them in the
  88. // nlist.
  89. struct SymbolEntry {
  90. std::string Name;
  91. bool Referenced = false;
  92. uint32_t Index;
  93. uint8_t n_type;
  94. uint8_t n_sect;
  95. uint16_t n_desc;
  96. uint64_t n_value;
  97. bool isExternalSymbol() const { return n_type & MachO::N_EXT; }
  98. bool isLocalSymbol() const { return !isExternalSymbol(); }
  99. bool isUndefinedSymbol() const {
  100. return (n_type & MachO::N_TYPE) == MachO::N_UNDF;
  101. }
  102. bool isSwiftSymbol() const {
  103. return StringRef(Name).startswith("_$s") ||
  104. StringRef(Name).startswith("_$S");
  105. }
  106. Optional<uint32_t> section() const {
  107. return n_sect == MachO::NO_SECT ? None : Optional<uint32_t>(n_sect);
  108. }
  109. };
  110. /// The location of the symbol table inside the binary is described by LC_SYMTAB
  111. /// load command.
  112. struct SymbolTable {
  113. std::vector<std::unique_ptr<SymbolEntry>> Symbols;
  114. using iterator = pointee_iterator<
  115. std::vector<std::unique_ptr<SymbolEntry>>::const_iterator>;
  116. iterator begin() const { return iterator(Symbols.begin()); }
  117. iterator end() const { return iterator(Symbols.end()); }
  118. const SymbolEntry *getSymbolByIndex(uint32_t Index) const;
  119. SymbolEntry *getSymbolByIndex(uint32_t Index);
  120. void removeSymbols(
  121. function_ref<bool(const std::unique_ptr<SymbolEntry> &)> ToRemove);
  122. };
  123. struct IndirectSymbolEntry {
  124. // The original value in an indirect symbol table. Higher bits encode extra
  125. // information (INDIRECT_SYMBOL_LOCAL and INDIRECT_SYMBOL_ABS).
  126. uint32_t OriginalIndex;
  127. /// The Symbol referenced by this entry. It's None if the index is
  128. /// INDIRECT_SYMBOL_LOCAL or INDIRECT_SYMBOL_ABS.
  129. Optional<SymbolEntry *> Symbol;
  130. IndirectSymbolEntry(uint32_t OriginalIndex, Optional<SymbolEntry *> Symbol)
  131. : OriginalIndex(OriginalIndex), Symbol(Symbol) {}
  132. };
  133. struct IndirectSymbolTable {
  134. std::vector<IndirectSymbolEntry> Symbols;
  135. };
  136. /// The location of the string table inside the binary is described by LC_SYMTAB
  137. /// load command.
  138. struct StringTable {
  139. std::vector<std::string> Strings;
  140. };
  141. struct RelocationInfo {
  142. // The referenced symbol entry. Set if !Scattered && Extern.
  143. Optional<const SymbolEntry *> Symbol;
  144. // The referenced section. Set if !Scattered && !Extern.
  145. Optional<const Section *> Sec;
  146. // True if Info is a scattered_relocation_info.
  147. bool Scattered;
  148. // True if the type is an ADDEND. r_symbolnum holds the addend instead of a
  149. // symbol index.
  150. bool IsAddend;
  151. // True if the r_symbolnum points to a section number (i.e. r_extern=0).
  152. bool Extern;
  153. MachO::any_relocation_info Info;
  154. unsigned getPlainRelocationSymbolNum(bool IsLittleEndian) {
  155. if (IsLittleEndian)
  156. return Info.r_word1 & 0xffffff;
  157. return Info.r_word1 >> 8;
  158. }
  159. void setPlainRelocationSymbolNum(unsigned SymbolNum, bool IsLittleEndian) {
  160. assert(SymbolNum < (1 << 24) && "SymbolNum out of range");
  161. if (IsLittleEndian)
  162. Info.r_word1 = (Info.r_word1 & ~0x00ffffff) | SymbolNum;
  163. else
  164. Info.r_word1 = (Info.r_word1 & ~0xffffff00) | (SymbolNum << 8);
  165. }
  166. };
  167. /// The location of the rebase info inside the binary is described by
  168. /// LC_DYLD_INFO load command. Dyld rebases an image whenever dyld loads it at
  169. /// an address different from its preferred address. The rebase information is
  170. /// a stream of byte sized opcodes whose symbolic names start with
  171. /// REBASE_OPCODE_. Conceptually the rebase information is a table of tuples:
  172. /// <seg-index, seg-offset, type>
  173. /// The opcodes are a compressed way to encode the table by only
  174. /// encoding when a column changes. In addition simple patterns
  175. /// like "every n'th offset for m times" can be encoded in a few
  176. /// bytes.
  177. struct RebaseInfo {
  178. // At the moment we do not parse this info (and it is simply copied over),
  179. // but the proper support will be added later.
  180. ArrayRef<uint8_t> Opcodes;
  181. };
  182. /// The location of the bind info inside the binary is described by
  183. /// LC_DYLD_INFO load command. Dyld binds an image during the loading process,
  184. /// if the image requires any pointers to be initialized to symbols in other
  185. /// images. The bind information is a stream of byte sized opcodes whose
  186. /// symbolic names start with BIND_OPCODE_. Conceptually the bind information is
  187. /// a table of tuples: <seg-index, seg-offset, type, symbol-library-ordinal,
  188. /// symbol-name, addend> The opcodes are a compressed way to encode the table by
  189. /// only encoding when a column changes. In addition simple patterns like for
  190. /// runs of pointers initialized to the same value can be encoded in a few
  191. /// bytes.
  192. struct BindInfo {
  193. // At the moment we do not parse this info (and it is simply copied over),
  194. // but the proper support will be added later.
  195. ArrayRef<uint8_t> Opcodes;
  196. };
  197. /// The location of the weak bind info inside the binary is described by
  198. /// LC_DYLD_INFO load command. Some C++ programs require dyld to unique symbols
  199. /// so that all images in the process use the same copy of some code/data. This
  200. /// step is done after binding. The content of the weak_bind info is an opcode
  201. /// stream like the bind_info. But it is sorted alphabetically by symbol name.
  202. /// This enable dyld to walk all images with weak binding information in order
  203. /// and look for collisions. If there are no collisions, dyld does no updating.
  204. /// That means that some fixups are also encoded in the bind_info. For
  205. /// instance, all calls to "operator new" are first bound to libstdc++.dylib
  206. /// using the information in bind_info. Then if some image overrides operator
  207. /// new that is detected when the weak_bind information is processed and the
  208. /// call to operator new is then rebound.
  209. struct WeakBindInfo {
  210. // At the moment we do not parse this info (and it is simply copied over),
  211. // but the proper support will be added later.
  212. ArrayRef<uint8_t> Opcodes;
  213. };
  214. /// The location of the lazy bind info inside the binary is described by
  215. /// LC_DYLD_INFO load command. Some uses of external symbols do not need to be
  216. /// bound immediately. Instead they can be lazily bound on first use. The
  217. /// lazy_bind contains a stream of BIND opcodes to bind all lazy symbols. Normal
  218. /// use is that dyld ignores the lazy_bind section when loading an image.
  219. /// Instead the static linker arranged for the lazy pointer to initially point
  220. /// to a helper function which pushes the offset into the lazy_bind area for the
  221. /// symbol needing to be bound, then jumps to dyld which simply adds the offset
  222. /// to lazy_bind_off to get the information on what to bind.
  223. struct LazyBindInfo {
  224. ArrayRef<uint8_t> Opcodes;
  225. };
  226. /// The location of the export info inside the binary is described by
  227. /// LC_DYLD_INFO load command. The symbols exported by a dylib are encoded in a
  228. /// trie. This is a compact representation that factors out common prefixes. It
  229. /// also reduces LINKEDIT pages in RAM because it encodes all information (name,
  230. /// address, flags) in one small, contiguous range. The export area is a stream
  231. /// of nodes. The first node sequentially is the start node for the trie. Nodes
  232. /// for a symbol start with a uleb128 that is the length of the exported symbol
  233. /// information for the string so far. If there is no exported symbol, the node
  234. /// starts with a zero byte. If there is exported info, it follows the length.
  235. /// First is a uleb128 containing flags. Normally, it is followed by
  236. /// a uleb128 encoded offset which is location of the content named
  237. /// by the symbol from the mach_header for the image. If the flags
  238. /// is EXPORT_SYMBOL_FLAGS_REEXPORT, then following the flags is
  239. /// a uleb128 encoded library ordinal, then a zero terminated
  240. /// UTF8 string. If the string is zero length, then the symbol
  241. /// is re-export from the specified dylib with the same name.
  242. /// If the flags is EXPORT_SYMBOL_FLAGS_STUB_AND_RESOLVER, then following
  243. /// the flags is two uleb128s: the stub offset and the resolver offset.
  244. /// The stub is used by non-lazy pointers. The resolver is used
  245. /// by lazy pointers and must be called to get the actual address to use.
  246. /// After the optional exported symbol information is a byte of
  247. /// how many edges (0-255) that this node has leaving it,
  248. /// followed by each edge.
  249. /// Each edge is a zero terminated UTF8 of the addition chars
  250. /// in the symbol, followed by a uleb128 offset for the node that
  251. /// edge points to.
  252. struct ExportInfo {
  253. ArrayRef<uint8_t> Trie;
  254. };
  255. struct LinkData {
  256. ArrayRef<uint8_t> Data;
  257. };
  258. struct Object {
  259. MachHeader Header;
  260. std::vector<LoadCommand> LoadCommands;
  261. SymbolTable SymTable;
  262. StringTable StrTable;
  263. RebaseInfo Rebases;
  264. BindInfo Binds;
  265. WeakBindInfo WeakBinds;
  266. LazyBindInfo LazyBinds;
  267. ExportInfo Exports;
  268. IndirectSymbolTable IndirectSymTable;
  269. LinkData DataInCode;
  270. LinkData LinkerOptimizationHint;
  271. LinkData FunctionStarts;
  272. LinkData ExportsTrie;
  273. LinkData ChainedFixups;
  274. Optional<uint32_t> SwiftVersion;
  275. /// The index of LC_CODE_SIGNATURE load command if present.
  276. Optional<size_t> CodeSignatureCommandIndex;
  277. /// The index of LC_SYMTAB load command if present.
  278. Optional<size_t> SymTabCommandIndex;
  279. /// The index of LC_DYLD_INFO or LC_DYLD_INFO_ONLY load command if present.
  280. Optional<size_t> DyLdInfoCommandIndex;
  281. /// The index LC_DYSYMTAB load command if present.
  282. Optional<size_t> DySymTabCommandIndex;
  283. /// The index LC_DATA_IN_CODE load command if present.
  284. Optional<size_t> DataInCodeCommandIndex;
  285. /// The index of LC_LINKER_OPTIMIZATIN_HINT load command if present.
  286. Optional<size_t> LinkerOptimizationHintCommandIndex;
  287. /// The index LC_FUNCTION_STARTS load command if present.
  288. Optional<size_t> FunctionStartsCommandIndex;
  289. /// The index LC_DYLD_CHAINED_FIXUPS load command if present.
  290. Optional<size_t> ChainedFixupsCommandIndex;
  291. /// The index LC_DYLD_EXPORTS_TRIE load command if present.
  292. Optional<size_t> ExportsTrieCommandIndex;
  293. /// The index of the LC_SEGMENT or LC_SEGMENT_64 load command
  294. /// corresponding to the __TEXT segment.
  295. Optional<size_t> TextSegmentCommandIndex;
  296. BumpPtrAllocator Alloc;
  297. StringSaver NewSectionsContents;
  298. Object() : NewSectionsContents(Alloc) {}
  299. Error
  300. removeSections(function_ref<bool(const std::unique_ptr<Section> &)> ToRemove);
  301. Error removeLoadCommands(function_ref<bool(const LoadCommand &)> ToRemove);
  302. void updateLoadCommandIndexes();
  303. /// Creates a new segment load command in the object and returns a reference
  304. /// to the newly created load command. The caller should verify that SegName
  305. /// is not too long (SegName.size() should be less than or equal to 16).
  306. LoadCommand &addSegment(StringRef SegName, uint64_t SegVMSize);
  307. bool is64Bit() const {
  308. return Header.Magic == MachO::MH_MAGIC_64 ||
  309. Header.Magic == MachO::MH_CIGAM_64;
  310. }
  311. uint64_t nextAvailableSegmentAddress() const;
  312. };
  313. } // end namespace macho
  314. } // end namespace objcopy
  315. } // end namespace llvm
  316. #endif // LLVM_OBJCOPY_MACHO_OBJECT_H