  1. //===-- SchedClassResolution.cpp --------------------------------*- C++ -*-===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. #include "SchedClassResolution.h"
  9. #include "BenchmarkResult.h"
  10. #include "llvm/ADT/STLExtras.h"
  11. #include "llvm/MC/MCAsmInfo.h"
  12. #include "llvm/MCA/Support.h"
  13. #include "llvm/Support/FormatVariadic.h"
  14. #include <limits>
  15. #include <unordered_set>
  16. #include <vector>
  17. namespace llvm {
  18. namespace exegesis {
  19. // Return the non-redundant list of WriteProcRes used by the given sched class.
  20. // The scheduling model for LLVM is such that each instruction has a certain
  21. // number of uops which consume resources which are described by WriteProcRes
  22. // entries. Each entry describe how many cycles are spent on a specific ProcRes
  23. // kind.
  24. // For example, an instruction might have 3 uOps, one dispatching on P0
  25. // (ProcResIdx=1) and two on P06 (ProcResIdx = 7).
  26. // Note that LLVM additionally denormalizes resource consumption to include
  27. // usage of super resources by subresources. So in practice if there exists a
  28. // P016 (ProcResIdx=10), then the cycles consumed by P0 are also consumed by
  29. // P06 (ProcResIdx = 7) and P016 (ProcResIdx = 10), and the resources consumed
  30. // by P06 are also consumed by P016. In the figure below, parenthesized cycles
  31. // denote implied usage of superresources by subresources:
  32. // P0 P06 P016
  33. // uOp1 1 (1) (1)
  34. // uOp2 1 (1)
  35. // uOp3 1 (1)
  36. // =============================
  37. // 1 3 3
  38. // Eventually we end up with three entries for the WriteProcRes of the
  39. // instruction:
  40. // {ProcResIdx=1, Cycles=1} // P0
  41. // {ProcResIdx=7, Cycles=3} // P06
  42. // {ProcResIdx=10, Cycles=3} // P016
  43. //
  44. // Note that in this case, P016 does not contribute any cycles, so it would
  45. // be removed by this function.
  46. // FIXME: Merge this with the equivalent in llvm-mca.
// Expands the WriteProcRes entries of `SCDesc` into a non-redundant list:
// entries are processed in sub-resource-before-super-resource order, and a
// ProcResGroup entry is kept only for the cycles not already accounted for by
// its subunits (see the worked example above).
static SmallVector<MCWriteProcResEntry, 8>
getNonRedundantWriteProcRes(const MCSchedClassDesc &SCDesc,
                            const MCSubtargetInfo &STI) {
  SmallVector<MCWriteProcResEntry, 8> Result;
  const auto &SM = STI.getSchedModel();
  const unsigned NumProcRes = SM.getNumProcResourceKinds();

  // Collect resource masks (from llvm-mca); their popcount is used below to
  // order resources by how many units they appear to span.
  SmallVector<uint64_t> ProcResourceMasks(NumProcRes);
  mca::computeProcResourceMasks(SM, ProcResourceMasks);

  // Sort entries by smaller resources for (basic) topological ordering, so
  // that by the time a group is processed, the usage of its subunits has
  // already been accumulated.
  using ResourceMaskAndEntry = std::pair<uint64_t, const MCWriteProcResEntry *>;
  SmallVector<ResourceMaskAndEntry, 8> ResourceMaskAndEntries;
  for (const auto *WPR = STI.getWriteProcResBegin(&SCDesc),
                  *const WPREnd = STI.getWriteProcResEnd(&SCDesc);
       WPR != WPREnd; ++WPR) {
    uint64_t Mask = ProcResourceMasks[WPR->ProcResourceIdx];
    ResourceMaskAndEntries.push_back({Mask, WPR});
  }
  sort(ResourceMaskAndEntries,
       [](const ResourceMaskAndEntry &A, const ResourceMaskAndEntry &B) {
         unsigned popcntA = llvm::popcount(A.first);
         unsigned popcntB = llvm::popcount(B.first);
         if (popcntA < popcntB)
           return true;
         if (popcntA > popcntB)
           return false;
         // Tie-break on the mask value itself for a deterministic order.
         return A.first < B.first;
       });

  // ProcResUnitUsage[I] accumulates the cycles already attributed to unit I,
  // either directly (ProcResUnit entries) or spread from an enclosing group.
  SmallVector<float, 32> ProcResUnitUsage(NumProcRes);
  for (const ResourceMaskAndEntry &Entry : ResourceMaskAndEntries) {
    const MCWriteProcResEntry *WPR = Entry.second;
    const MCProcResourceDesc *const ProcResDesc =
        SM.getProcResource(WPR->ProcResourceIdx);
    if (ProcResDesc->SubUnitsIdxBegin == nullptr) {
      // This is a ProcResUnit.
      Result.push_back({WPR->ProcResourceIdx, WPR->Cycles});
      ProcResUnitUsage[WPR->ProcResourceIdx] += WPR->Cycles;
    } else {
      // This is a ProcResGroup. First see if it contributes any cycles or if
      // it has cycles just from subunits.
      float RemainingCycles = WPR->Cycles;
      for (const auto *SubResIdx = ProcResDesc->SubUnitsIdxBegin;
           SubResIdx != ProcResDesc->SubUnitsIdxBegin + ProcResDesc->NumUnits;
           ++SubResIdx) {
        RemainingCycles -= ProcResUnitUsage[*SubResIdx];
      }
      // Small epsilon (rather than 0) absorbs float rounding introduced by
      // the fractional spreading below.
      if (RemainingCycles < 0.01f) {
        // The ProcResGroup contributes no cycles of its own.
        continue;
      }
      // The ProcResGroup contributes `RemainingCycles` cycles of its own.
      Result.push_back({WPR->ProcResourceIdx,
                        static_cast<uint16_t>(std::round(RemainingCycles))});
      // Spread the remaining cycles over all subunits.
      for (const auto *SubResIdx = ProcResDesc->SubUnitsIdxBegin;
           SubResIdx != ProcResDesc->SubUnitsIdxBegin + ProcResDesc->NumUnits;
           ++SubResIdx) {
        ProcResUnitUsage[*SubResIdx] += RemainingCycles / ProcResDesc->NumUnits;
      }
    }
  }
  return Result;
}
  110. // Distributes a pressure budget as evenly as possible on the provided subunits
  111. // given the already existing port pressure distribution.
  112. //
  113. // The algorithm is as follows: while there is remaining pressure to
  114. // distribute, find the subunits with minimal pressure, and distribute
  115. // remaining pressure equally up to the pressure of the unit with
  116. // second-to-minimal pressure.
  117. // For example, let's assume we want to distribute 2*P1256
  118. // (Subunits = [P1,P2,P5,P6]), and the starting DensePressure is:
  119. // DensePressure = P0 P1 P2 P3 P4 P5 P6 P7
  120. // 0.1 0.3 0.2 0.0 0.0 0.5 0.5 0.5
  121. // RemainingPressure = 2.0
  122. // We sort the subunits by pressure:
  123. // Subunits = [(P2,p=0.2), (P1,p=0.3), (P5,p=0.5), (P6, p=0.5)]
  124. // We'll first start by the subunits with minimal pressure, which are at
  125. // the beginning of the sorted array. In this example there is one (P2).
  126. // The subunit with second-to-minimal pressure is the next one in the
  127. // array (P1). So we distribute 0.1 pressure to P2, and remove 0.1 cycles
  128. // from the budget.
// Subunits = [(P2,p=0.3), (P1,p=0.3), (P5,p=0.5), (P6,p=0.5)]
  130. // RemainingPressure = 1.9
  131. // We repeat this process: distribute 0.2 pressure on each of the minimal
  132. // P2 and P1, decrease budget by 2*0.2:
// Subunits = [(P2,p=0.5), (P1,p=0.5), (P5,p=0.5), (P6,p=0.5)]
  134. // RemainingPressure = 1.5
  135. // There are no second-to-minimal subunits so we just share the remaining
  136. // budget (1.5 cycles) equally:
// Subunits = [(P2,p=0.875), (P1,p=0.875), (P5,p=0.875), (P6,p=0.875)]
  138. // RemainingPressure = 0.0
  139. // We stop as there is no remaining budget to distribute.
  140. static void distributePressure(float RemainingPressure,
  141. SmallVector<uint16_t, 32> Subunits,
  142. SmallVector<float, 32> &DensePressure) {
  143. // Find the number of subunits with minimal pressure (they are at the
  144. // front).
  145. sort(Subunits, [&DensePressure](const uint16_t A, const uint16_t B) {
  146. return DensePressure[A] < DensePressure[B];
  147. });
  148. const auto getPressureForSubunit = [&DensePressure,
  149. &Subunits](size_t I) -> float & {
  150. return DensePressure[Subunits[I]];
  151. };
  152. size_t NumMinimalSU = 1;
  153. while (NumMinimalSU < Subunits.size() &&
  154. getPressureForSubunit(NumMinimalSU) == getPressureForSubunit(0)) {
  155. ++NumMinimalSU;
  156. }
  157. while (RemainingPressure > 0.0f) {
  158. if (NumMinimalSU == Subunits.size()) {
  159. // All units are minimal, just distribute evenly and be done.
  160. for (size_t I = 0; I < NumMinimalSU; ++I) {
  161. getPressureForSubunit(I) += RemainingPressure / NumMinimalSU;
  162. }
  163. return;
  164. }
  165. // Distribute the remaining pressure equally.
  166. const float MinimalPressure = getPressureForSubunit(NumMinimalSU - 1);
  167. const float SecondToMinimalPressure = getPressureForSubunit(NumMinimalSU);
  168. assert(MinimalPressure < SecondToMinimalPressure);
  169. const float Increment = SecondToMinimalPressure - MinimalPressure;
  170. if (RemainingPressure <= NumMinimalSU * Increment) {
  171. // There is not enough remaining pressure.
  172. for (size_t I = 0; I < NumMinimalSU; ++I) {
  173. getPressureForSubunit(I) += RemainingPressure / NumMinimalSU;
  174. }
  175. return;
  176. }
  177. // Bump all minimal pressure subunits to `SecondToMinimalPressure`.
  178. for (size_t I = 0; I < NumMinimalSU; ++I) {
  179. getPressureForSubunit(I) = SecondToMinimalPressure;
  180. RemainingPressure -= SecondToMinimalPressure;
  181. }
  182. while (NumMinimalSU < Subunits.size() &&
  183. getPressureForSubunit(NumMinimalSU) == SecondToMinimalPressure) {
  184. ++NumMinimalSU;
  185. }
  186. }
  187. }
  188. std::vector<std::pair<uint16_t, float>>
  189. computeIdealizedProcResPressure(const MCSchedModel &SM,
  190. SmallVector<MCWriteProcResEntry, 8> WPRS) {
  191. // DensePressure[I] is the port pressure for Proc Resource I.
  192. SmallVector<float, 32> DensePressure(SM.getNumProcResourceKinds());
  193. sort(WPRS, [](const MCWriteProcResEntry &A, const MCWriteProcResEntry &B) {
  194. return A.ProcResourceIdx < B.ProcResourceIdx;
  195. });
  196. for (const MCWriteProcResEntry &WPR : WPRS) {
  197. // Get units for the entry.
  198. const MCProcResourceDesc *const ProcResDesc =
  199. SM.getProcResource(WPR.ProcResourceIdx);
  200. if (ProcResDesc->SubUnitsIdxBegin == nullptr) {
  201. // This is a ProcResUnit.
  202. DensePressure[WPR.ProcResourceIdx] += WPR.Cycles;
  203. } else {
  204. // This is a ProcResGroup.
  205. SmallVector<uint16_t, 32> Subunits(ProcResDesc->SubUnitsIdxBegin,
  206. ProcResDesc->SubUnitsIdxBegin +
  207. ProcResDesc->NumUnits);
  208. distributePressure(WPR.Cycles, Subunits, DensePressure);
  209. }
  210. }
  211. // Turn dense pressure into sparse pressure by removing zero entries.
  212. std::vector<std::pair<uint16_t, float>> Pressure;
  213. for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
  214. if (DensePressure[I] > 0.0f)
  215. Pressure.emplace_back(I, DensePressure[I]);
  216. }
  217. return Pressure;
  218. }
// Builds a resolved sched class: looks up the class descriptor and
// precomputes the non-redundant WriteProcRes list and the idealized port
// pressure for it.
ResolvedSchedClass::ResolvedSchedClass(const MCSubtargetInfo &STI,
                                       unsigned ResolvedSchedClassId,
                                       bool WasVariant)
    : SchedClassId(ResolvedSchedClassId),
      SCDesc(STI.getSchedModel().getSchedClassDesc(ResolvedSchedClassId)),
      WasVariant(WasVariant),
      // NOTE: init order matters — IdealizedProcResPressure consumes
      // NonRedundantWriteProcRes, which must be initialized first.
      NonRedundantWriteProcRes(getNonRedundantWriteProcRes(*SCDesc, STI)),
      IdealizedProcResPressure(computeIdealizedProcResPressure(
          STI.getSchedModel(), NonRedundantWriteProcRes)) {
  // Variant classes must be resolved (resolveSchedClassId) before
  // construction; the precomputed data would be meaningless otherwise.
  assert((SCDesc == nullptr || !SCDesc->isVariant()) &&
         "ResolvedSchedClass should never be variant");
}
  231. static unsigned ResolveVariantSchedClassId(const MCSubtargetInfo &STI,
  232. const MCInstrInfo &InstrInfo,
  233. unsigned SchedClassId,
  234. const MCInst &MCI) {
  235. const auto &SM = STI.getSchedModel();
  236. while (SchedClassId && SM.getSchedClassDesc(SchedClassId)->isVariant()) {
  237. SchedClassId = STI.resolveVariantSchedClass(SchedClassId, &MCI, &InstrInfo,
  238. SM.getProcessorID());
  239. }
  240. return SchedClassId;
  241. }
  242. std::pair<unsigned /*SchedClassId*/, bool /*WasVariant*/>
  243. ResolvedSchedClass::resolveSchedClassId(const MCSubtargetInfo &SubtargetInfo,
  244. const MCInstrInfo &InstrInfo,
  245. const MCInst &MCI) {
  246. unsigned SchedClassId = InstrInfo.get(MCI.getOpcode()).getSchedClass();
  247. const bool WasVariant = SchedClassId && SubtargetInfo.getSchedModel()
  248. .getSchedClassDesc(SchedClassId)
  249. ->isVariant();
  250. SchedClassId =
  251. ResolveVariantSchedClassId(SubtargetInfo, InstrInfo, SchedClassId, MCI);
  252. return std::make_pair(SchedClassId, WasVariant);
  253. }
// Returns a ProcResIdx by id or name.
  255. static unsigned findProcResIdx(const MCSubtargetInfo &STI,
  256. const StringRef NameOrId) {
  257. // Interpret the key as an ProcResIdx.
  258. unsigned ProcResIdx = 0;
  259. if (to_integer(NameOrId, ProcResIdx, 10))
  260. return ProcResIdx;
  261. // Interpret the key as a ProcRes name.
  262. const auto &SchedModel = STI.getSchedModel();
  263. for (int I = 0, E = SchedModel.getNumProcResourceKinds(); I < E; ++I) {
  264. if (NameOrId == SchedModel.getProcResource(I)->Name)
  265. return I;
  266. }
  267. return 0;
  268. }
// Converts this sched class's model-predicted values into a point of
// BenchmarkMeasures matching the shape of `Representative` (the measured
// stats), so the two can be compared in the given `Mode`. Returns an empty
// vector on error (unknown measurement key).
std::vector<BenchmarkMeasure> ResolvedSchedClass::getAsPoint(
    InstructionBenchmark::ModeE Mode, const MCSubtargetInfo &STI,
    ArrayRef<PerInstructionStats> Representative) const {
  const size_t NumMeasurements = Representative.size();
  std::vector<BenchmarkMeasure> SchedClassPoint(NumMeasurements);
  if (Mode == InstructionBenchmark::Latency) {
    assert(NumMeasurements == 1 && "Latency is a single measure.");
    BenchmarkMeasure &LatencyMeasure = SchedClassPoint[0];
    // Find the latency: the max cycle count over all write latency entries.
    LatencyMeasure.PerInstructionValue = 0.0;
    for (unsigned I = 0; I < SCDesc->NumWriteLatencyEntries; ++I) {
      const MCWriteLatencyEntry *const WLE =
          STI.getWriteLatencyEntry(SCDesc, I);
      LatencyMeasure.PerInstructionValue =
          std::max<double>(LatencyMeasure.PerInstructionValue, WLE->Cycles);
    }
  } else if (Mode == InstructionBenchmark::Uops) {
    // One measure per representative stat; the stat's key selects what the
    // model-side value is (a ProcRes pressure or the uop count).
    for (auto I : zip(SchedClassPoint, Representative)) {
      BenchmarkMeasure &Measure = std::get<0>(I);
      const PerInstructionStats &Stats = std::get<1>(I);
      StringRef Key = Stats.key();
      uint16_t ProcResIdx = findProcResIdx(STI, Key);
      if (ProcResIdx > 0) {
        // Find the pressure on ProcResIdx `Key`.
        const auto ProcResPressureIt =
            llvm::find_if(IdealizedProcResPressure,
                          [ProcResIdx](const std::pair<uint16_t, float> &WPR) {
                            return WPR.first == ProcResIdx;
                          });
        // Resources absent from the sparse pressure list have zero pressure.
        Measure.PerInstructionValue =
            ProcResPressureIt == IdealizedProcResPressure.end()
                ? 0.0
                : ProcResPressureIt->second;
      } else if (Key == "NumMicroOps") {
        Measure.PerInstructionValue = SCDesc->NumMicroOps;
      } else {
        // Unknown key: report and bail out with an empty point.
        errs() << "expected `key` to be either a ProcResIdx or a ProcRes "
                  "name, got "
               << Key << "\n";
        return {};
      }
    }
  } else if (Mode == InstructionBenchmark::InverseThroughput) {
    assert(NumMeasurements == 1 && "Inverse Throughput is a single measure.");
    BenchmarkMeasure &RThroughputMeasure = SchedClassPoint[0];
    RThroughputMeasure.PerInstructionValue =
        MCSchedModel::getReciprocalThroughput(STI, *SCDesc);
  } else {
    llvm_unreachable("unimplemented measurement matching mode");
  }
  return SchedClassPoint;
}
  321. } // namespace exegesis
  322. } // namespace llvm