GSIStreamBuilder.cpp 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495
  1. //===- DbiStreamBuilder.cpp - PDB Dbi Stream Creation -----------*- C++ -*-===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // The data structures defined in this file are based on the reference
  10. // implementation which is available at
  11. // https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/dbi/gsi.cpp
  12. //
  13. //===----------------------------------------------------------------------===//
  14. #include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h"
  15. #include "llvm/DebugInfo/CodeView/RecordName.h"
  16. #include "llvm/DebugInfo/CodeView/RecordSerialization.h"
  17. #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
  18. #include "llvm/DebugInfo/CodeView/SymbolSerializer.h"
  19. #include "llvm/DebugInfo/MSF/MSFBuilder.h"
  20. #include "llvm/DebugInfo/MSF/MSFCommon.h"
  21. #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
  22. #include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
  23. #include "llvm/DebugInfo/PDB/Native/Hash.h"
  24. #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
  25. #include "llvm/Support/BinaryItemStream.h"
  26. #include "llvm/Support/BinaryStreamWriter.h"
  27. #include "llvm/Support/Parallel.h"
  28. #include "llvm/Support/xxhash.h"
  29. #include <algorithm>
  30. #include <vector>
  31. using namespace llvm;
  32. using namespace llvm::msf;
  33. using namespace llvm::pdb;
  34. using namespace llvm::codeview;
  35. // Helper class for building the public and global PDB hash table buckets.
  36. struct llvm::pdb::GSIHashStreamBuilder {
  37. // Sum of the size of all public or global records.
  38. uint32_t RecordByteSize = 0;
  39. std::vector<PSHashRecord> HashRecords;
  40. // The hash bitmap has `ceil((IPHR_HASH + 1) / 32)` words in it. The
  41. // reference implementation builds a hash table with IPHR_HASH buckets in it.
  42. // The last bucket is used to link together free hash table cells in a linked
  43. // list, but it is always empty in the compressed, on-disk format. However,
  44. // the bitmap must have a bit for it.
  45. std::array<support::ulittle32_t, (IPHR_HASH + 32) / 32> HashBitmap;
  46. std::vector<support::ulittle32_t> HashBuckets;
  47. uint32_t calculateSerializedLength() const;
  48. Error commit(BinaryStreamWriter &Writer);
  49. void finalizePublicBuckets();
  50. void finalizeGlobalBuckets(uint32_t RecordZeroOffset);
  51. // Assign public and global symbol records into hash table buckets.
  52. // Modifies the list of records to store the bucket index, but does not
  53. // change the order.
  54. void finalizeBuckets(uint32_t RecordZeroOffset,
  55. MutableArrayRef<BulkPublic> Globals);
  56. };
  57. // DenseMapInfo implementation for deduplicating symbol records.
  58. struct llvm::pdb::SymbolDenseMapInfo {
  59. static inline CVSymbol getEmptyKey() {
  60. static CVSymbol Empty;
  61. return Empty;
  62. }
  63. static inline CVSymbol getTombstoneKey() {
  64. static CVSymbol Tombstone(
  65. DenseMapInfo<ArrayRef<uint8_t>>::getTombstoneKey());
  66. return Tombstone;
  67. }
  68. static unsigned getHashValue(const CVSymbol &Val) {
  69. return xxHash64(Val.RecordData);
  70. }
  71. static bool isEqual(const CVSymbol &LHS, const CVSymbol &RHS) {
  72. return LHS.RecordData == RHS.RecordData;
  73. }
  74. };
  75. namespace {
  76. LLVM_PACKED_START
  77. struct PublicSym32Layout {
  78. RecordPrefix Prefix;
  79. PublicSym32Header Pub;
  80. // char Name[];
  81. };
  82. LLVM_PACKED_END
  83. } // namespace
  84. // Calculate how much memory this public needs when serialized.
  85. static uint32_t sizeOfPublic(const BulkPublic &Pub) {
  86. uint32_t NameLen = Pub.NameLen;
  87. NameLen = std::min(NameLen,
  88. uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1));
  89. return alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4);
  90. }
  91. static CVSymbol serializePublic(uint8_t *Mem, const BulkPublic &Pub) {
  92. // Assume the caller has allocated sizeOfPublic bytes.
  93. uint32_t NameLen = std::min(
  94. Pub.NameLen, uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1));
  95. size_t Size = alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4);
  96. assert(Size == sizeOfPublic(Pub));
  97. auto *FixedMem = reinterpret_cast<PublicSym32Layout *>(Mem);
  98. FixedMem->Prefix.RecordKind = static_cast<uint16_t>(codeview::S_PUB32);
  99. FixedMem->Prefix.RecordLen = static_cast<uint16_t>(Size - 2);
  100. FixedMem->Pub.Flags = Pub.Flags;
  101. FixedMem->Pub.Offset = Pub.Offset;
  102. FixedMem->Pub.Segment = Pub.Segment;
  103. char *NameMem = reinterpret_cast<char *>(FixedMem + 1);
  104. memcpy(NameMem, Pub.Name, NameLen);
  105. // Zero the null terminator and remaining bytes.
  106. memset(&NameMem[NameLen], 0, Size - sizeof(PublicSym32Layout) - NameLen);
  107. return CVSymbol(ArrayRef(reinterpret_cast<uint8_t *>(Mem), Size));
  108. }
  109. uint32_t GSIHashStreamBuilder::calculateSerializedLength() const {
  110. uint32_t Size = sizeof(GSIHashHeader);
  111. Size += HashRecords.size() * sizeof(PSHashRecord);
  112. Size += HashBitmap.size() * sizeof(uint32_t);
  113. Size += HashBuckets.size() * sizeof(uint32_t);
  114. return Size;
  115. }
  116. Error GSIHashStreamBuilder::commit(BinaryStreamWriter &Writer) {
  117. GSIHashHeader Header;
  118. Header.VerSignature = GSIHashHeader::HdrSignature;
  119. Header.VerHdr = GSIHashHeader::HdrVersion;
  120. Header.HrSize = HashRecords.size() * sizeof(PSHashRecord);
  121. Header.NumBuckets = HashBitmap.size() * 4 + HashBuckets.size() * 4;
  122. if (auto EC = Writer.writeObject(Header))
  123. return EC;
  124. if (auto EC = Writer.writeArray(ArrayRef(HashRecords)))
  125. return EC;
  126. if (auto EC = Writer.writeArray(ArrayRef(HashBitmap)))
  127. return EC;
  128. if (auto EC = Writer.writeArray(ArrayRef(HashBuckets)))
  129. return EC;
  130. return Error::success();
  131. }
  132. static bool isAsciiString(StringRef S) {
  133. return llvm::all_of(S, [](char C) { return unsigned(C) < 0x80; });
  134. }
  135. // See `caseInsensitiveComparePchPchCchCch` in gsi.cpp
  136. static int gsiRecordCmp(StringRef S1, StringRef S2) {
  137. size_t LS = S1.size();
  138. size_t RS = S2.size();
  139. // Shorter strings always compare less than longer strings.
  140. if (LS != RS)
  141. return (LS > RS) - (LS < RS);
  142. // If either string contains non ascii characters, memcmp them.
  143. if (LLVM_UNLIKELY(!isAsciiString(S1) || !isAsciiString(S2)))
  144. return memcmp(S1.data(), S2.data(), LS);
  145. // Both strings are ascii, perform a case-insensitive comparison.
  146. return S1.compare_insensitive(S2.data());
  147. }
  148. void GSIStreamBuilder::finalizePublicBuckets() {
  149. PSH->finalizeBuckets(0, Publics);
  150. }
  151. void GSIStreamBuilder::finalizeGlobalBuckets(uint32_t RecordZeroOffset) {
  152. // Build up a list of globals to be bucketed. Use the BulkPublic data
  153. // structure for this purpose, even though these are global records, not
  154. // public records. Most of the same fields are required:
  155. // - Name
  156. // - NameLen
  157. // - SymOffset
  158. // - BucketIdx
  159. // The dead fields are Offset, Segment, and Flags.
  160. std::vector<BulkPublic> Records;
  161. Records.resize(Globals.size());
  162. uint32_t SymOffset = RecordZeroOffset;
  163. for (size_t I = 0, E = Globals.size(); I < E; ++I) {
  164. StringRef Name = getSymbolName(Globals[I]);
  165. Records[I].Name = Name.data();
  166. Records[I].NameLen = Name.size();
  167. Records[I].SymOffset = SymOffset;
  168. SymOffset += Globals[I].length();
  169. }
  170. GSH->finalizeBuckets(RecordZeroOffset, Records);
  171. }
  172. void GSIHashStreamBuilder::finalizeBuckets(
  173. uint32_t RecordZeroOffset, MutableArrayRef<BulkPublic> Records) {
  174. // Hash every name in parallel.
  175. parallelFor(0, Records.size(), [&](size_t I) {
  176. Records[I].setBucketIdx(hashStringV1(Records[I].Name) % IPHR_HASH);
  177. });
  178. // Count up the size of each bucket. Then, use an exclusive prefix sum to
  179. // calculate the bucket start offsets. This is C++17 std::exclusive_scan, but
  180. // we can't use it yet.
  181. uint32_t BucketStarts[IPHR_HASH] = {0};
  182. for (const BulkPublic &P : Records)
  183. ++BucketStarts[P.BucketIdx];
  184. uint32_t Sum = 0;
  185. for (uint32_t &B : BucketStarts) {
  186. uint32_t Size = B;
  187. B = Sum;
  188. Sum += Size;
  189. }
  190. // Place globals into the hash table in bucket order. When placing a global,
  191. // update the bucket start. Every hash table slot should be filled. Always use
  192. // a refcount of one for now.
  193. HashRecords.resize(Records.size());
  194. uint32_t BucketCursors[IPHR_HASH];
  195. memcpy(BucketCursors, BucketStarts, sizeof(BucketCursors));
  196. for (int I = 0, E = Records.size(); I < E; ++I) {
  197. uint32_t HashIdx = BucketCursors[Records[I].BucketIdx]++;
  198. HashRecords[HashIdx].Off = I;
  199. HashRecords[HashIdx].CRef = 1;
  200. }
  201. // Within the buckets, sort each bucket by memcmp of the symbol's name. It's
  202. // important that we use the same sorting algorithm as is used by the
  203. // reference implementation to ensure that the search for a record within a
  204. // bucket can properly early-out when it detects the record won't be found.
  205. // The algorithm used here corresponds to the function
  206. // caseInsensitiveComparePchPchCchCch in the reference implementation.
  207. parallelFor(0, IPHR_HASH, [&](size_t I) {
  208. auto B = HashRecords.begin() + BucketStarts[I];
  209. auto E = HashRecords.begin() + BucketCursors[I];
  210. if (B == E)
  211. return;
  212. auto BucketCmp = [Records](const PSHashRecord &LHash,
  213. const PSHashRecord &RHash) {
  214. const BulkPublic &L = Records[uint32_t(LHash.Off)];
  215. const BulkPublic &R = Records[uint32_t(RHash.Off)];
  216. assert(L.BucketIdx == R.BucketIdx);
  217. int Cmp = gsiRecordCmp(L.getName(), R.getName());
  218. if (Cmp != 0)
  219. return Cmp < 0;
  220. // This comparison is necessary to make the sorting stable in the presence
  221. // of two static globals with the same name. The easiest way to observe
  222. // this is with S_LDATA32 records.
  223. return L.SymOffset < R.SymOffset;
  224. };
  225. llvm::sort(B, E, BucketCmp);
  226. // After we are done sorting, replace the global indices with the stream
  227. // offsets of each global. Add one when writing symbol offsets to disk.
  228. // See GSI1::fixSymRecs.
  229. for (PSHashRecord &HRec : make_range(B, E))
  230. HRec.Off = Records[uint32_t(HRec.Off)].SymOffset + 1;
  231. });
  232. // For each non-empty bucket, push the bucket start offset into HashBuckets
  233. // and set a bit in the hash bitmap.
  234. for (uint32_t I = 0; I < HashBitmap.size(); ++I) {
  235. uint32_t Word = 0;
  236. for (uint32_t J = 0; J < 32; ++J) {
  237. // Skip empty buckets.
  238. uint32_t BucketIdx = I * 32 + J;
  239. if (BucketIdx >= IPHR_HASH ||
  240. BucketStarts[BucketIdx] == BucketCursors[BucketIdx])
  241. continue;
  242. Word |= (1U << J);
  243. // Calculate what the offset of the first hash record in the chain would
  244. // be if it were inflated to contain 32-bit pointers. On a 32-bit system,
  245. // each record would be 12 bytes. See HROffsetCalc in gsi.h.
  246. const int SizeOfHROffsetCalc = 12;
  247. ulittle32_t ChainStartOff =
  248. ulittle32_t(BucketStarts[BucketIdx] * SizeOfHROffsetCalc);
  249. HashBuckets.push_back(ChainStartOff);
  250. }
  251. HashBitmap[I] = Word;
  252. }
  253. }
  254. GSIStreamBuilder::GSIStreamBuilder(msf::MSFBuilder &Msf)
  255. : Msf(Msf), PSH(std::make_unique<GSIHashStreamBuilder>()),
  256. GSH(std::make_unique<GSIHashStreamBuilder>()) {}
  257. GSIStreamBuilder::~GSIStreamBuilder() = default;
  258. uint32_t GSIStreamBuilder::calculatePublicsHashStreamSize() const {
  259. uint32_t Size = 0;
  260. Size += sizeof(PublicsStreamHeader);
  261. Size += PSH->calculateSerializedLength();
  262. Size += Publics.size() * sizeof(uint32_t); // AddrMap
  263. // FIXME: Add thunk map and section offsets for incremental linking.
  264. return Size;
  265. }
  266. uint32_t GSIStreamBuilder::calculateGlobalsHashStreamSize() const {
  267. return GSH->calculateSerializedLength();
  268. }
  269. Error GSIStreamBuilder::finalizeMsfLayout() {
  270. // First we write public symbol records, then we write global symbol records.
  271. finalizePublicBuckets();
  272. finalizeGlobalBuckets(PSH->RecordByteSize);
  273. Expected<uint32_t> Idx = Msf.addStream(calculateGlobalsHashStreamSize());
  274. if (!Idx)
  275. return Idx.takeError();
  276. GlobalsStreamIndex = *Idx;
  277. Idx = Msf.addStream(calculatePublicsHashStreamSize());
  278. if (!Idx)
  279. return Idx.takeError();
  280. PublicsStreamIndex = *Idx;
  281. uint32_t RecordBytes = PSH->RecordByteSize + GSH->RecordByteSize;
  282. Idx = Msf.addStream(RecordBytes);
  283. if (!Idx)
  284. return Idx.takeError();
  285. RecordStreamIndex = *Idx;
  286. return Error::success();
  287. }
  288. void GSIStreamBuilder::addPublicSymbols(std::vector<BulkPublic> &&PublicsIn) {
  289. assert(Publics.empty() && PSH->RecordByteSize == 0 &&
  290. "publics can only be added once");
  291. Publics = std::move(PublicsIn);
  292. // Sort the symbols by name. PDBs contain lots of symbols, so use parallelism.
  293. parallelSort(Publics, [](const BulkPublic &L, const BulkPublic &R) {
  294. return L.getName() < R.getName();
  295. });
  296. // Assign offsets and calculate the length of the public symbol records.
  297. uint32_t SymOffset = 0;
  298. for (BulkPublic &Pub : Publics) {
  299. Pub.SymOffset = SymOffset;
  300. SymOffset += sizeOfPublic(Pub);
  301. }
  302. // Remember the length of the public stream records.
  303. PSH->RecordByteSize = SymOffset;
  304. }
  305. void GSIStreamBuilder::addGlobalSymbol(const ProcRefSym &Sym) {
  306. serializeAndAddGlobal(Sym);
  307. }
  308. void GSIStreamBuilder::addGlobalSymbol(const DataSym &Sym) {
  309. serializeAndAddGlobal(Sym);
  310. }
  311. void GSIStreamBuilder::addGlobalSymbol(const ConstantSym &Sym) {
  312. serializeAndAddGlobal(Sym);
  313. }
  314. template <typename T>
  315. void GSIStreamBuilder::serializeAndAddGlobal(const T &Symbol) {
  316. T Copy(Symbol);
  317. addGlobalSymbol(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(),
  318. CodeViewContainer::Pdb));
  319. }
  320. void GSIStreamBuilder::addGlobalSymbol(const codeview::CVSymbol &Symbol) {
  321. // Ignore duplicate typedefs and constants.
  322. if (Symbol.kind() == S_UDT || Symbol.kind() == S_CONSTANT) {
  323. auto Iter = GlobalsSeen.insert(Symbol);
  324. if (!Iter.second)
  325. return;
  326. }
  327. GSH->RecordByteSize += Symbol.length();
  328. Globals.push_back(Symbol);
  329. }
  330. // Serialize each public and write it.
  331. static Error writePublics(BinaryStreamWriter &Writer,
  332. ArrayRef<BulkPublic> Publics) {
  333. std::vector<uint8_t> Storage;
  334. for (const BulkPublic &Pub : Publics) {
  335. Storage.resize(sizeOfPublic(Pub));
  336. serializePublic(Storage.data(), Pub);
  337. if (Error E = Writer.writeBytes(Storage))
  338. return E;
  339. }
  340. return Error::success();
  341. }
  342. static Error writeRecords(BinaryStreamWriter &Writer,
  343. ArrayRef<CVSymbol> Records) {
  344. BinaryItemStream<CVSymbol> ItemStream(support::endianness::little);
  345. ItemStream.setItems(Records);
  346. BinaryStreamRef RecordsRef(ItemStream);
  347. return Writer.writeStreamRef(RecordsRef);
  348. }
  349. Error GSIStreamBuilder::commitSymbolRecordStream(
  350. WritableBinaryStreamRef Stream) {
  351. BinaryStreamWriter Writer(Stream);
  352. // Write public symbol records first, followed by global symbol records. This
  353. // must match the order that we assume in finalizeMsfLayout when computing
  354. // PSHZero and GSHZero.
  355. if (auto EC = writePublics(Writer, Publics))
  356. return EC;
  357. if (auto EC = writeRecords(Writer, Globals))
  358. return EC;
  359. return Error::success();
  360. }
  361. static std::vector<support::ulittle32_t>
  362. computeAddrMap(ArrayRef<BulkPublic> Publics) {
  363. // Build a parallel vector of indices into the Publics vector, and sort it by
  364. // address.
  365. std::vector<ulittle32_t> PubAddrMap;
  366. PubAddrMap.reserve(Publics.size());
  367. for (int I = 0, E = Publics.size(); I < E; ++I)
  368. PubAddrMap.push_back(ulittle32_t(I));
  369. auto AddrCmp = [Publics](const ulittle32_t &LIdx, const ulittle32_t &RIdx) {
  370. const BulkPublic &L = Publics[LIdx];
  371. const BulkPublic &R = Publics[RIdx];
  372. if (L.Segment != R.Segment)
  373. return L.Segment < R.Segment;
  374. if (L.Offset != R.Offset)
  375. return L.Offset < R.Offset;
  376. // parallelSort is unstable, so we have to do name comparison to ensure
  377. // that two names for the same location come out in a deterministic order.
  378. return L.getName() < R.getName();
  379. };
  380. parallelSort(PubAddrMap, AddrCmp);
  381. // Rewrite the public symbol indices into symbol offsets.
  382. for (ulittle32_t &Entry : PubAddrMap)
  383. Entry = Publics[Entry].SymOffset;
  384. return PubAddrMap;
  385. }
  386. Error GSIStreamBuilder::commitPublicsHashStream(
  387. WritableBinaryStreamRef Stream) {
  388. BinaryStreamWriter Writer(Stream);
  389. PublicsStreamHeader Header;
  390. // FIXME: Fill these in. They are for incremental linking.
  391. Header.SymHash = PSH->calculateSerializedLength();
  392. Header.AddrMap = Publics.size() * 4;
  393. Header.NumThunks = 0;
  394. Header.SizeOfThunk = 0;
  395. Header.ISectThunkTable = 0;
  396. memset(Header.Padding, 0, sizeof(Header.Padding));
  397. Header.OffThunkTable = 0;
  398. Header.NumSections = 0;
  399. if (auto EC = Writer.writeObject(Header))
  400. return EC;
  401. if (auto EC = PSH->commit(Writer))
  402. return EC;
  403. std::vector<support::ulittle32_t> PubAddrMap = computeAddrMap(Publics);
  404. assert(PubAddrMap.size() == Publics.size());
  405. if (auto EC = Writer.writeArray(ArrayRef(PubAddrMap)))
  406. return EC;
  407. return Error::success();
  408. }
  409. Error GSIStreamBuilder::commitGlobalsHashStream(
  410. WritableBinaryStreamRef Stream) {
  411. BinaryStreamWriter Writer(Stream);
  412. return GSH->commit(Writer);
  413. }
  414. Error GSIStreamBuilder::commit(const msf::MSFLayout &Layout,
  415. WritableBinaryStreamRef Buffer) {
  416. auto GS = WritableMappedBlockStream::createIndexedStream(
  417. Layout, Buffer, getGlobalsStreamIndex(), Msf.getAllocator());
  418. auto PS = WritableMappedBlockStream::createIndexedStream(
  419. Layout, Buffer, getPublicsStreamIndex(), Msf.getAllocator());
  420. auto PRS = WritableMappedBlockStream::createIndexedStream(
  421. Layout, Buffer, getRecordStreamIndex(), Msf.getAllocator());
  422. if (auto EC = commitSymbolRecordStream(*PRS))
  423. return EC;
  424. if (auto EC = commitGlobalsHashStream(*GS))
  425. return EC;
  426. if (auto EC = commitPublicsHashStream(*PS))
  427. return EC;
  428. return Error::success();
  429. }