SchedClassResolution.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321
  1. //===-- SchedClassResolution.cpp --------------------------------*- C++ -*-===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. #include "SchedClassResolution.h"
  9. #include "BenchmarkResult.h"
  10. #include "llvm/ADT/STLExtras.h"
  11. #include "llvm/MC/MCAsmInfo.h"
  12. #include "llvm/Support/FormatVariadic.h"
  13. #include <limits>
  14. #include <unordered_set>
  15. #include <vector>
  16. namespace llvm {
  17. namespace exegesis {
  18. // Return the non-redundant list of WriteProcRes used by the given sched class.
  19. // The scheduling model for LLVM is such that each instruction has a certain
  20. // number of uops which consume resources which are described by WriteProcRes
  21. // entries. Each entry describe how many cycles are spent on a specific ProcRes
  22. // kind.
  23. // For example, an instruction might have 3 uOps, one dispatching on P0
  24. // (ProcResIdx=1) and two on P06 (ProcResIdx = 7).
  25. // Note that LLVM additionally denormalizes resource consumption to include
  26. // usage of super resources by subresources. So in practice if there exists a
  27. // P016 (ProcResIdx=10), then the cycles consumed by P0 are also consumed by
  28. // P06 (ProcResIdx = 7) and P016 (ProcResIdx = 10), and the resources consumed
  29. // by P06 are also consumed by P016. In the figure below, parenthesized cycles
  30. // denote implied usage of superresources by subresources:
  31. // P0 P06 P016
  32. // uOp1 1 (1) (1)
  33. // uOp2 1 (1)
  34. // uOp3 1 (1)
  35. // =============================
  36. // 1 3 3
  37. // Eventually we end up with three entries for the WriteProcRes of the
  38. // instruction:
  39. // {ProcResIdx=1, Cycles=1} // P0
  40. // {ProcResIdx=7, Cycles=3} // P06
  41. // {ProcResIdx=10, Cycles=3} // P016
  42. //
  43. // Note that in this case, P016 does not contribute any cycles, so it would
  44. // be removed by this function.
  45. // FIXME: Move this to MCSubtargetInfo and use it in llvm-mca.
  46. static SmallVector<MCWriteProcResEntry, 8>
  47. getNonRedundantWriteProcRes(const MCSchedClassDesc &SCDesc,
  48. const MCSubtargetInfo &STI) {
  49. SmallVector<MCWriteProcResEntry, 8> Result;
  50. const auto &SM = STI.getSchedModel();
  51. const unsigned NumProcRes = SM.getNumProcResourceKinds();
  52. // This assumes that the ProcResDescs are sorted in topological order, which
  53. // is guaranteed by the tablegen backend.
  54. SmallVector<float, 32> ProcResUnitUsage(NumProcRes);
  55. for (const auto *WPR = STI.getWriteProcResBegin(&SCDesc),
  56. *const WPREnd = STI.getWriteProcResEnd(&SCDesc);
  57. WPR != WPREnd; ++WPR) {
  58. const MCProcResourceDesc *const ProcResDesc =
  59. SM.getProcResource(WPR->ProcResourceIdx);
  60. if (ProcResDesc->SubUnitsIdxBegin == nullptr) {
  61. // This is a ProcResUnit.
  62. Result.push_back({WPR->ProcResourceIdx, WPR->Cycles});
  63. ProcResUnitUsage[WPR->ProcResourceIdx] += WPR->Cycles;
  64. } else {
  65. // This is a ProcResGroup. First see if it contributes any cycles or if
  66. // it has cycles just from subunits.
  67. float RemainingCycles = WPR->Cycles;
  68. for (const auto *SubResIdx = ProcResDesc->SubUnitsIdxBegin;
  69. SubResIdx != ProcResDesc->SubUnitsIdxBegin + ProcResDesc->NumUnits;
  70. ++SubResIdx) {
  71. RemainingCycles -= ProcResUnitUsage[*SubResIdx];
  72. }
  73. if (RemainingCycles < 0.01f) {
  74. // The ProcResGroup contributes no cycles of its own.
  75. continue;
  76. }
  77. // The ProcResGroup contributes `RemainingCycles` cycles of its own.
  78. Result.push_back({WPR->ProcResourceIdx,
  79. static_cast<uint16_t>(std::round(RemainingCycles))});
  80. // Spread the remaining cycles over all subunits.
  81. for (const auto *SubResIdx = ProcResDesc->SubUnitsIdxBegin;
  82. SubResIdx != ProcResDesc->SubUnitsIdxBegin + ProcResDesc->NumUnits;
  83. ++SubResIdx) {
  84. ProcResUnitUsage[*SubResIdx] += RemainingCycles / ProcResDesc->NumUnits;
  85. }
  86. }
  87. }
  88. return Result;
  89. }
  90. // Distributes a pressure budget as evenly as possible on the provided subunits
  91. // given the already existing port pressure distribution.
  92. //
  93. // The algorithm is as follows: while there is remaining pressure to
  94. // distribute, find the subunits with minimal pressure, and distribute
  95. // remaining pressure equally up to the pressure of the unit with
  96. // second-to-minimal pressure.
  97. // For example, let's assume we want to distribute 2*P1256
  98. // (Subunits = [P1,P2,P5,P6]), and the starting DensePressure is:
  99. // DensePressure = P0 P1 P2 P3 P4 P5 P6 P7
  100. // 0.1 0.3 0.2 0.0 0.0 0.5 0.5 0.5
  101. // RemainingPressure = 2.0
  102. // We sort the subunits by pressure:
  103. // Subunits = [(P2,p=0.2), (P1,p=0.3), (P5,p=0.5), (P6, p=0.5)]
  104. // We'll first start by the subunits with minimal pressure, which are at
  105. // the beginning of the sorted array. In this example there is one (P2).
  106. // The subunit with second-to-minimal pressure is the next one in the
  107. // array (P1). So we distribute 0.1 pressure to P2, and remove 0.1 cycles
  108. // from the budget.
  109. // Subunits = [(P2,p=0.3), (P1,p=0.3), (P5,p=0.5), (P5,p=0.5)]
  110. // RemainingPressure = 1.9
  111. // We repeat this process: distribute 0.2 pressure on each of the minimal
  112. // P2 and P1, decrease budget by 2*0.2:
  113. // Subunits = [(P2,p=0.5), (P1,p=0.5), (P5,p=0.5), (P5,p=0.5)]
  114. // RemainingPressure = 1.5
  115. // There are no second-to-minimal subunits so we just share the remaining
  116. // budget (1.5 cycles) equally:
  117. // Subunits = [(P2,p=0.875), (P1,p=0.875), (P5,p=0.875), (P5,p=0.875)]
  118. // RemainingPressure = 0.0
  119. // We stop as there is no remaining budget to distribute.
  120. static void distributePressure(float RemainingPressure,
  121. SmallVector<uint16_t, 32> Subunits,
  122. SmallVector<float, 32> &DensePressure) {
  123. // Find the number of subunits with minimal pressure (they are at the
  124. // front).
  125. sort(Subunits, [&DensePressure](const uint16_t A, const uint16_t B) {
  126. return DensePressure[A] < DensePressure[B];
  127. });
  128. const auto getPressureForSubunit = [&DensePressure,
  129. &Subunits](size_t I) -> float & {
  130. return DensePressure[Subunits[I]];
  131. };
  132. size_t NumMinimalSU = 1;
  133. while (NumMinimalSU < Subunits.size() &&
  134. getPressureForSubunit(NumMinimalSU) == getPressureForSubunit(0)) {
  135. ++NumMinimalSU;
  136. }
  137. while (RemainingPressure > 0.0f) {
  138. if (NumMinimalSU == Subunits.size()) {
  139. // All units are minimal, just distribute evenly and be done.
  140. for (size_t I = 0; I < NumMinimalSU; ++I) {
  141. getPressureForSubunit(I) += RemainingPressure / NumMinimalSU;
  142. }
  143. return;
  144. }
  145. // Distribute the remaining pressure equally.
  146. const float MinimalPressure = getPressureForSubunit(NumMinimalSU - 1);
  147. const float SecondToMinimalPressure = getPressureForSubunit(NumMinimalSU);
  148. assert(MinimalPressure < SecondToMinimalPressure);
  149. const float Increment = SecondToMinimalPressure - MinimalPressure;
  150. if (RemainingPressure <= NumMinimalSU * Increment) {
  151. // There is not enough remaining pressure.
  152. for (size_t I = 0; I < NumMinimalSU; ++I) {
  153. getPressureForSubunit(I) += RemainingPressure / NumMinimalSU;
  154. }
  155. return;
  156. }
  157. // Bump all minimal pressure subunits to `SecondToMinimalPressure`.
  158. for (size_t I = 0; I < NumMinimalSU; ++I) {
  159. getPressureForSubunit(I) = SecondToMinimalPressure;
  160. RemainingPressure -= SecondToMinimalPressure;
  161. }
  162. while (NumMinimalSU < Subunits.size() &&
  163. getPressureForSubunit(NumMinimalSU) == SecondToMinimalPressure) {
  164. ++NumMinimalSU;
  165. }
  166. }
  167. }
  168. std::vector<std::pair<uint16_t, float>>
  169. computeIdealizedProcResPressure(const MCSchedModel &SM,
  170. SmallVector<MCWriteProcResEntry, 8> WPRS) {
  171. // DensePressure[I] is the port pressure for Proc Resource I.
  172. SmallVector<float, 32> DensePressure(SM.getNumProcResourceKinds());
  173. sort(WPRS, [](const MCWriteProcResEntry &A, const MCWriteProcResEntry &B) {
  174. return A.ProcResourceIdx < B.ProcResourceIdx;
  175. });
  176. for (const MCWriteProcResEntry &WPR : WPRS) {
  177. // Get units for the entry.
  178. const MCProcResourceDesc *const ProcResDesc =
  179. SM.getProcResource(WPR.ProcResourceIdx);
  180. if (ProcResDesc->SubUnitsIdxBegin == nullptr) {
  181. // This is a ProcResUnit.
  182. DensePressure[WPR.ProcResourceIdx] += WPR.Cycles;
  183. } else {
  184. // This is a ProcResGroup.
  185. SmallVector<uint16_t, 32> Subunits(ProcResDesc->SubUnitsIdxBegin,
  186. ProcResDesc->SubUnitsIdxBegin +
  187. ProcResDesc->NumUnits);
  188. distributePressure(WPR.Cycles, Subunits, DensePressure);
  189. }
  190. }
  191. // Turn dense pressure into sparse pressure by removing zero entries.
  192. std::vector<std::pair<uint16_t, float>> Pressure;
  193. for (unsigned I = 0, E = SM.getNumProcResourceKinds(); I < E; ++I) {
  194. if (DensePressure[I] > 0.0f)
  195. Pressure.emplace_back(I, DensePressure[I]);
  196. }
  197. return Pressure;
  198. }
  199. ResolvedSchedClass::ResolvedSchedClass(const MCSubtargetInfo &STI,
  200. unsigned ResolvedSchedClassId,
  201. bool WasVariant)
  202. : SchedClassId(ResolvedSchedClassId),
  203. SCDesc(STI.getSchedModel().getSchedClassDesc(ResolvedSchedClassId)),
  204. WasVariant(WasVariant),
  205. NonRedundantWriteProcRes(getNonRedundantWriteProcRes(*SCDesc, STI)),
  206. IdealizedProcResPressure(computeIdealizedProcResPressure(
  207. STI.getSchedModel(), NonRedundantWriteProcRes)) {
  208. assert((SCDesc == nullptr || !SCDesc->isVariant()) &&
  209. "ResolvedSchedClass should never be variant");
  210. }
  211. static unsigned ResolveVariantSchedClassId(const MCSubtargetInfo &STI,
  212. const MCInstrInfo &InstrInfo,
  213. unsigned SchedClassId,
  214. const MCInst &MCI) {
  215. const auto &SM = STI.getSchedModel();
  216. while (SchedClassId && SM.getSchedClassDesc(SchedClassId)->isVariant()) {
  217. SchedClassId = STI.resolveVariantSchedClass(SchedClassId, &MCI, &InstrInfo,
  218. SM.getProcessorID());
  219. }
  220. return SchedClassId;
  221. }
  222. std::pair<unsigned /*SchedClassId*/, bool /*WasVariant*/>
  223. ResolvedSchedClass::resolveSchedClassId(const MCSubtargetInfo &SubtargetInfo,
  224. const MCInstrInfo &InstrInfo,
  225. const MCInst &MCI) {
  226. unsigned SchedClassId = InstrInfo.get(MCI.getOpcode()).getSchedClass();
  227. const bool WasVariant = SchedClassId && SubtargetInfo.getSchedModel()
  228. .getSchedClassDesc(SchedClassId)
  229. ->isVariant();
  230. SchedClassId =
  231. ResolveVariantSchedClassId(SubtargetInfo, InstrInfo, SchedClassId, MCI);
  232. return std::make_pair(SchedClassId, WasVariant);
  233. }
  234. // Returns a ProxResIdx by id or name.
  235. static unsigned findProcResIdx(const MCSubtargetInfo &STI,
  236. const StringRef NameOrId) {
  237. // Interpret the key as an ProcResIdx.
  238. unsigned ProcResIdx = 0;
  239. if (to_integer(NameOrId, ProcResIdx, 10))
  240. return ProcResIdx;
  241. // Interpret the key as a ProcRes name.
  242. const auto &SchedModel = STI.getSchedModel();
  243. for (int I = 0, E = SchedModel.getNumProcResourceKinds(); I < E; ++I) {
  244. if (NameOrId == SchedModel.getProcResource(I)->Name)
  245. return I;
  246. }
  247. return 0;
  248. }
  249. std::vector<BenchmarkMeasure> ResolvedSchedClass::getAsPoint(
  250. InstructionBenchmark::ModeE Mode, const MCSubtargetInfo &STI,
  251. ArrayRef<PerInstructionStats> Representative) const {
  252. const size_t NumMeasurements = Representative.size();
  253. std::vector<BenchmarkMeasure> SchedClassPoint(NumMeasurements);
  254. if (Mode == InstructionBenchmark::Latency) {
  255. assert(NumMeasurements == 1 && "Latency is a single measure.");
  256. BenchmarkMeasure &LatencyMeasure = SchedClassPoint[0];
  257. // Find the latency.
  258. LatencyMeasure.PerInstructionValue = 0.0;
  259. for (unsigned I = 0; I < SCDesc->NumWriteLatencyEntries; ++I) {
  260. const MCWriteLatencyEntry *const WLE =
  261. STI.getWriteLatencyEntry(SCDesc, I);
  262. LatencyMeasure.PerInstructionValue =
  263. std::max<double>(LatencyMeasure.PerInstructionValue, WLE->Cycles);
  264. }
  265. } else if (Mode == InstructionBenchmark::Uops) {
  266. for (auto I : zip(SchedClassPoint, Representative)) {
  267. BenchmarkMeasure &Measure = std::get<0>(I);
  268. const PerInstructionStats &Stats = std::get<1>(I);
  269. StringRef Key = Stats.key();
  270. uint16_t ProcResIdx = findProcResIdx(STI, Key);
  271. if (ProcResIdx > 0) {
  272. // Find the pressure on ProcResIdx `Key`.
  273. const auto ProcResPressureIt =
  274. llvm::find_if(IdealizedProcResPressure,
  275. [ProcResIdx](const std::pair<uint16_t, float> &WPR) {
  276. return WPR.first == ProcResIdx;
  277. });
  278. Measure.PerInstructionValue =
  279. ProcResPressureIt == IdealizedProcResPressure.end()
  280. ? 0.0
  281. : ProcResPressureIt->second;
  282. } else if (Key == "NumMicroOps") {
  283. Measure.PerInstructionValue = SCDesc->NumMicroOps;
  284. } else {
  285. errs() << "expected `key` to be either a ProcResIdx or a ProcRes "
  286. "name, got "
  287. << Key << "\n";
  288. return {};
  289. }
  290. }
  291. } else if (Mode == InstructionBenchmark::InverseThroughput) {
  292. assert(NumMeasurements == 1 && "Inverse Throughput is a single measure.");
  293. BenchmarkMeasure &RThroughputMeasure = SchedClassPoint[0];
  294. RThroughputMeasure.PerInstructionValue =
  295. MCSchedModel::getReciprocalThroughput(STI, *SCDesc);
  296. } else {
  297. llvm_unreachable("unimplemented measurement matching mode");
  298. }
  299. return SchedClassPoint;
  300. }
  301. } // namespace exegesis
  302. } // namespace llvm