Object.h 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355
  1. //===- Object.h - Mach-O object file model ----------------------*- C++ -*-===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. #ifndef LLVM_OBJCOPY_MACHO_OBJECT_H
  9. #define LLVM_OBJCOPY_MACHO_OBJECT_H
  10. #include "llvm/ADT/Optional.h"
  11. #include "llvm/ADT/StringRef.h"
  12. #include "llvm/BinaryFormat/MachO.h"
  13. #include "llvm/MC/StringTableBuilder.h"
  14. #include "llvm/ObjectYAML/DWARFYAML.h"
  15. #include "llvm/Support/StringSaver.h"
  16. #include "llvm/Support/YAMLTraits.h"
  17. #include <cstdint>
  18. #include <string>
  19. #include <vector>
  20. namespace llvm {
  21. namespace objcopy {
  22. namespace macho {
  /// In-memory copy of the fields of a Mach-O file header.
  ///
  /// Every field carries a default member initializer so that a
  /// default-constructed header (for instance the `Header` member of a
  /// default-constructed Object) never holds indeterminate values. The struct
  /// remains an aggregate, so brace initialization keeps working.
  struct MachHeader {
    uint32_t Magic = 0;
    uint32_t CPUType = 0;
    uint32_t CPUSubType = 0;
    uint32_t FileType = 0;
    uint32_t NCmds = 0;
    uint32_t SizeOfCmds = 0;
    uint32_t Flags = 0;
    uint32_t Reserved = 0;
  };
  33. struct RelocationInfo;
  34. struct Section {
  35. uint32_t Index;
  36. std::string Segname;
  37. std::string Sectname;
  38. // CanonicalName is a string formatted as “<Segname>,<Sectname>".
  39. std::string CanonicalName;
  40. uint64_t Addr = 0;
  41. uint64_t Size = 0;
  42. // Offset in the input file.
  43. Optional<uint32_t> OriginalOffset;
  44. uint32_t Offset = 0;
  45. uint32_t Align = 0;
  46. uint32_t RelOff = 0;
  47. uint32_t NReloc = 0;
  48. uint32_t Flags = 0;
  49. uint32_t Reserved1 = 0;
  50. uint32_t Reserved2 = 0;
  51. uint32_t Reserved3 = 0;
  52. StringRef Content;
  53. std::vector<RelocationInfo> Relocations;
  54. Section(StringRef SegName, StringRef SectName);
  55. Section(StringRef SegName, StringRef SectName, StringRef Content);
  56. MachO::SectionType getType() const {
  57. return static_cast<MachO::SectionType>(Flags & MachO::SECTION_TYPE);
  58. }
  59. bool isVirtualSection() const {
  60. return (getType() == MachO::S_ZEROFILL ||
  61. getType() == MachO::S_GB_ZEROFILL ||
  62. getType() == MachO::S_THREAD_LOCAL_ZEROFILL);
  63. }
  64. bool hasValidOffset() const {
  65. return !(isVirtualSection() || (OriginalOffset && *OriginalOffset == 0));
  66. }
  67. };
  68. struct LoadCommand {
  69. // The type MachO::macho_load_command is defined in llvm/BinaryFormat/MachO.h
  70. // and it is a union of all the structs corresponding to various load
  71. // commands.
  72. MachO::macho_load_command MachOLoadCommand;
  73. // The raw content of the payload of the load command (located right after the
  74. // corresponding struct). In some cases it is either empty or can be
  75. // copied-over without digging into its structure.
  76. std::vector<uint8_t> Payload;
  77. // Some load commands can contain (inside the payload) an array of sections,
  78. // though the contents of the sections are stored separately. The struct
  79. // Section describes only sections' metadata and where to find the
  80. // corresponding content inside the binary.
  81. std::vector<std::unique_ptr<Section>> Sections;
  82. // Returns the segment name if the load command is a segment command.
  83. Optional<StringRef> getSegmentName() const;
  84. // Returns the segment vm address if the load command is a segment command.
  85. Optional<uint64_t> getSegmentVMAddr() const;
  86. };
  87. // A symbol information. Fields which starts with "n_" are same as them in the
  88. // nlist.
  89. struct SymbolEntry {
  90. std::string Name;
  91. bool Referenced = false;
  92. uint32_t Index;
  93. uint8_t n_type;
  94. uint8_t n_sect;
  95. uint16_t n_desc;
  96. uint64_t n_value;
  97. bool isExternalSymbol() const { return n_type & MachO::N_EXT; }
  98. bool isLocalSymbol() const { return !isExternalSymbol(); }
  99. bool isUndefinedSymbol() const {
  100. return (n_type & MachO::N_TYPE) == MachO::N_UNDF;
  101. }
  102. bool isSwiftSymbol() const {
  103. return StringRef(Name).startswith("_$s") ||
  104. StringRef(Name).startswith("_$S");
  105. }
  106. Optional<uint32_t> section() const {
  107. return n_sect == MachO::NO_SECT ? None : Optional<uint32_t>(n_sect);
  108. }
  109. };
  110. /// The location of the symbol table inside the binary is described by LC_SYMTAB
  111. /// load command.
  112. struct SymbolTable {
  113. std::vector<std::unique_ptr<SymbolEntry>> Symbols;
  114. using iterator = pointee_iterator<
  115. std::vector<std::unique_ptr<SymbolEntry>>::const_iterator>;
  116. iterator begin() const { return iterator(Symbols.begin()); }
  117. iterator end() const { return iterator(Symbols.end()); }
  118. const SymbolEntry *getSymbolByIndex(uint32_t Index) const;
  119. SymbolEntry *getSymbolByIndex(uint32_t Index);
  120. void removeSymbols(
  121. function_ref<bool(const std::unique_ptr<SymbolEntry> &)> ToRemove);
  122. };
  123. struct IndirectSymbolEntry {
  124. // The original value in an indirect symbol table. Higher bits encode extra
  125. // information (INDIRECT_SYMBOL_LOCAL and INDIRECT_SYMBOL_ABS).
  126. uint32_t OriginalIndex;
  127. /// The Symbol referenced by this entry. It's None if the index is
  128. /// INDIRECT_SYMBOL_LOCAL or INDIRECT_SYMBOL_ABS.
  129. Optional<SymbolEntry *> Symbol;
  130. IndirectSymbolEntry(uint32_t OriginalIndex, Optional<SymbolEntry *> Symbol)
  131. : OriginalIndex(OriginalIndex), Symbol(Symbol) {}
  132. };
  133. struct IndirectSymbolTable {
  134. std::vector<IndirectSymbolEntry> Symbols;
  135. };
  /// The string table of the object, held as a list of owned strings. The
  /// location of the string table inside the binary is described by the
  /// LC_SYMTAB load command.
  struct StringTable {
    std::vector<std::string> Strings;
  };
  141. struct RelocationInfo {
  142. // The referenced symbol entry. Set if !Scattered && Extern.
  143. Optional<const SymbolEntry *> Symbol;
  144. // The referenced section. Set if !Scattered && !Extern.
  145. Optional<const Section *> Sec;
  146. // True if Info is a scattered_relocation_info.
  147. bool Scattered;
  148. // True if the r_symbolnum points to a section number (i.e. r_extern=0).
  149. bool Extern;
  150. MachO::any_relocation_info Info;
  151. unsigned getPlainRelocationSymbolNum(bool IsLittleEndian) {
  152. if (IsLittleEndian)
  153. return Info.r_word1 & 0xffffff;
  154. return Info.r_word1 >> 8;
  155. }
  156. void setPlainRelocationSymbolNum(unsigned SymbolNum, bool IsLittleEndian) {
  157. assert(SymbolNum < (1 << 24) && "SymbolNum out of range");
  158. if (IsLittleEndian)
  159. Info.r_word1 = (Info.r_word1 & ~0x00ffffff) | SymbolNum;
  160. else
  161. Info.r_word1 = (Info.r_word1 & ~0xffffff00) | (SymbolNum << 8);
  162. }
  163. };
  164. /// The location of the rebase info inside the binary is described by
  165. /// LC_DYLD_INFO load command. Dyld rebases an image whenever dyld loads it at
  166. /// an address different from its preferred address. The rebase information is
  167. /// a stream of byte sized opcodes whose symbolic names start with
  168. /// REBASE_OPCODE_. Conceptually the rebase information is a table of tuples:
  169. /// <seg-index, seg-offset, type>
  170. /// The opcodes are a compressed way to encode the table by only
  171. /// encoding when a column changes. In addition simple patterns
  172. /// like "every n'th offset for m times" can be encoded in a few
  173. /// bytes.
  174. struct RebaseInfo {
  175. // At the moment we do not parse this info (and it is simply copied over),
  176. // but the proper support will be added later.
  177. ArrayRef<uint8_t> Opcodes;
  178. };
  179. /// The location of the bind info inside the binary is described by
  180. /// LC_DYLD_INFO load command. Dyld binds an image during the loading process,
  181. /// if the image requires any pointers to be initialized to symbols in other
  182. /// images. The bind information is a stream of byte sized opcodes whose
  183. /// symbolic names start with BIND_OPCODE_. Conceptually the bind information is
  184. /// a table of tuples: <seg-index, seg-offset, type, symbol-library-ordinal,
  185. /// symbol-name, addend> The opcodes are a compressed way to encode the table by
  186. /// only encoding when a column changes. In addition simple patterns like for
  187. /// runs of pointers initialized to the same value can be encoded in a few
  188. /// bytes.
  189. struct BindInfo {
  190. // At the moment we do not parse this info (and it is simply copied over),
  191. // but the proper support will be added later.
  192. ArrayRef<uint8_t> Opcodes;
  193. };
  194. /// The location of the weak bind info inside the binary is described by
  195. /// LC_DYLD_INFO load command. Some C++ programs require dyld to unique symbols
  196. /// so that all images in the process use the same copy of some code/data. This
  197. /// step is done after binding. The content of the weak_bind info is an opcode
  198. /// stream like the bind_info. But it is sorted alphabetically by symbol name.
  199. /// This enable dyld to walk all images with weak binding information in order
  200. /// and look for collisions. If there are no collisions, dyld does no updating.
  201. /// That means that some fixups are also encoded in the bind_info. For
  202. /// instance, all calls to "operator new" are first bound to libstdc++.dylib
  203. /// using the information in bind_info. Then if some image overrides operator
  204. /// new that is detected when the weak_bind information is processed and the
  205. /// call to operator new is then rebound.
  206. struct WeakBindInfo {
  207. // At the moment we do not parse this info (and it is simply copied over),
  208. // but the proper support will be added later.
  209. ArrayRef<uint8_t> Opcodes;
  210. };
  211. /// The location of the lazy bind info inside the binary is described by
  212. /// LC_DYLD_INFO load command. Some uses of external symbols do not need to be
  213. /// bound immediately. Instead they can be lazily bound on first use. The
  214. /// lazy_bind contains a stream of BIND opcodes to bind all lazy symbols. Normal
  215. /// use is that dyld ignores the lazy_bind section when loading an image.
  216. /// Instead the static linker arranged for the lazy pointer to initially point
  217. /// to a helper function which pushes the offset into the lazy_bind area for the
  218. /// symbol needing to be bound, then jumps to dyld which simply adds the offset
  219. /// to lazy_bind_off to get the information on what to bind.
  220. struct LazyBindInfo {
  221. ArrayRef<uint8_t> Opcodes;
  222. };
  223. /// The location of the export info inside the binary is described by
  224. /// LC_DYLD_INFO load command. The symbols exported by a dylib are encoded in a
  225. /// trie. This is a compact representation that factors out common prefixes. It
  226. /// also reduces LINKEDIT pages in RAM because it encodes all information (name,
  227. /// address, flags) in one small, contiguous range. The export area is a stream
  228. /// of nodes. The first node sequentially is the start node for the trie. Nodes
  229. /// for a symbol start with a uleb128 that is the length of the exported symbol
  230. /// information for the string so far. If there is no exported symbol, the node
  231. /// starts with a zero byte. If there is exported info, it follows the length.
  232. /// First is a uleb128 containing flags. Normally, it is followed by
  233. /// a uleb128 encoded offset which is location of the content named
  234. /// by the symbol from the mach_header for the image. If the flags
  235. /// is EXPORT_SYMBOL_FLAGS_REEXPORT, then following the flags is
  236. /// a uleb128 encoded library ordinal, then a zero terminated
  237. /// UTF8 string. If the string is zero length, then the symbol
  238. /// is re-export from the specified dylib with the same name.
  239. /// If the flags is EXPORT_SYMBOL_FLAGS_STUB_AND_RESOLVER, then following
  240. /// the flags is two uleb128s: the stub offset and the resolver offset.
  241. /// The stub is used by non-lazy pointers. The resolver is used
  242. /// by lazy pointers and must be called to get the actual address to use.
  243. /// After the optional exported symbol information is a byte of
  244. /// how many edges (0-255) that this node has leaving it,
  245. /// followed by each edge.
  246. /// Each edge is a zero terminated UTF8 of the addition chars
  247. /// in the symbol, followed by a uleb128 offset for the node that
  248. /// edge points to.
  249. struct ExportInfo {
  250. ArrayRef<uint8_t> Trie;
  251. };
  252. struct LinkData {
  253. ArrayRef<uint8_t> Data;
  254. };
  255. struct Object {
  256. MachHeader Header;
  257. std::vector<LoadCommand> LoadCommands;
  258. SymbolTable SymTable;
  259. StringTable StrTable;
  260. RebaseInfo Rebases;
  261. BindInfo Binds;
  262. WeakBindInfo WeakBinds;
  263. LazyBindInfo LazyBinds;
  264. ExportInfo Exports;
  265. IndirectSymbolTable IndirectSymTable;
  266. LinkData DataInCode;
  267. LinkData FunctionStarts;
  268. LinkData CodeSignature;
  269. Optional<uint32_t> SwiftVersion;
  270. /// The index of LC_CODE_SIGNATURE load command if present.
  271. Optional<size_t> CodeSignatureCommandIndex;
  272. /// The index of LC_SYMTAB load command if present.
  273. Optional<size_t> SymTabCommandIndex;
  274. /// The index of LC_DYLD_INFO or LC_DYLD_INFO_ONLY load command if present.
  275. Optional<size_t> DyLdInfoCommandIndex;
  276. /// The index LC_DYSYMTAB load comamnd if present.
  277. Optional<size_t> DySymTabCommandIndex;
  278. /// The index LC_DATA_IN_CODE load comamnd if present.
  279. Optional<size_t> DataInCodeCommandIndex;
  280. /// The index LC_FUNCTION_STARTS load comamnd if present.
  281. Optional<size_t> FunctionStartsCommandIndex;
  282. BumpPtrAllocator Alloc;
  283. StringSaver NewSectionsContents;
  284. Object() : NewSectionsContents(Alloc) {}
  285. Error
  286. removeSections(function_ref<bool(const std::unique_ptr<Section> &)> ToRemove);
  287. Error removeLoadCommands(function_ref<bool(const LoadCommand &)> ToRemove);
  288. void updateLoadCommandIndexes();
  289. /// Creates a new segment load command in the object and returns a reference
  290. /// to the newly created load command. The caller should verify that SegName
  291. /// is not too long (SegName.size() should be less than or equal to 16).
  292. LoadCommand &addSegment(StringRef SegName, uint64_t SegVMSize);
  293. bool is64Bit() const {
  294. return Header.Magic == MachO::MH_MAGIC_64 ||
  295. Header.Magic == MachO::MH_CIGAM_64;
  296. }
  297. uint64_t nextAvailableSegmentAddress() const;
  298. };
  299. } // end namespace macho
  300. } // end namespace objcopy
  301. } // end namespace llvm
  302. #endif // LLVM_OBJCOPY_MACHO_OBJECT_H