//===-- X86Counter.cpp ------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "X86Counter.h"

#if defined(__linux__) && defined(HAVE_LIBPFM) &&                              \
    defined(LIBPFM_HAS_FIELD_CYCLES)

// FIXME: Use appropriate wrappers for poll.h and mman.h
// to support Windows and remove this linux-only guard.

#include "llvm/Support/Endian.h"
#include "llvm/Support/Errc.h"

#include <perfmon/perf_event.h>
#include <perfmon/pfmlib.h>
#include <perfmon/pfmlib_perf_event.h>

#include <atomic>
#include <cassert>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <limits>
#include <memory>
#include <vector>

#include <poll.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>

namespace llvm {
namespace exegesis {

// Number of entries in the LBR.
static constexpr int kLbrEntries = 16;
static constexpr size_t kBufferPages = 8;
static const size_t kDataBufferSize = kBufferPages * getpagesize();

// The first page is reserved for perf_event_mmap_page. The data buffer starts
// on the next page, so we allocate one extra page.
static const size_t kMappedBufferSize = (kBufferPages + 1) * getpagesize();

// Waits for the LBR perf events.
static int pollLbrPerfEvent(const int FileDescriptor) {
  struct pollfd PollFd;
  PollFd.fd = FileDescriptor;
  PollFd.events = POLLIN;
  PollFd.revents = 0;
  return poll(&PollFd, 1 /* num of fds */, 10000 /* timeout in ms */);
}

// Copies the data buffer into Buf, given the pointer to the mmapped region.
static void copyDataBuffer(void *MMappedBuffer, char *Buf, uint64_t Tail,
                           size_t DataSize) {
  // The first page is reserved for perf_event_mmap_page. The data buffer
  // starts on the next page.
  char *Start = reinterpret_cast<char *>(MMappedBuffer) + getpagesize();
  // The LBR buffer is cyclic, so linearize it into Buf: copy from the tail
  // offset up to the end of the buffer, then wrap around to the beginning.
  uint64_t Offset = Tail % kDataBufferSize;
  size_t CopySize = kDataBufferSize - Offset;
  memcpy(Buf, Start + Offset, CopySize);
  if (CopySize >= DataSize)
    return;

  memcpy(Buf + CopySize, Start, Offset);
}
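// A minimal sketch (hypothetical helper, not used anywhere in llvm-exegesis):
// the wrap-around arithmetic of copyDataBuffer replayed on a plain array so it
// can be checked in isolation. With 4 KiB pages, kDataBufferSize is 32768, so
// a Tail of 40000 gives Offset == 7232: the first memcpy linearizes bytes
// [7232, 32768) of the ring, and the second wraps around with bytes
// [0, 7232). The concrete numbers are assumptions for illustration only.
[[maybe_unused]] static void wrapCopySketch(const char *Ring, size_t RingSize,
                                            uint64_t Tail, char *Out) {
  const uint64_t Offset = Tail % RingSize;     // e.g. 40000 % 32768 == 7232
  const size_t FirstChunk = RingSize - Offset; // Bytes until the ring ends.
  memcpy(Out, Ring + Offset, FirstChunk);      // [Offset, RingSize)
  memcpy(Out + FirstChunk, Ring, Offset);      // Wrapped prefix [0, Offset)
}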
// Parses the given data buffer for PERF_RECORD_SAMPLE records and appends the
// cycle count of every LBR entry whose source and destination fall within
// [From, To] to CycleArray. Null bounds are treated as unbounded.
static llvm::Error parseDataBuffer(const char *DataBuf, size_t DataSize,
                                   const void *From, const void *To,
                                   llvm::SmallVector<int64_t, 4> *CycleArray) {
  const char *DataPtr = DataBuf;
  while (DataPtr < DataBuf + DataSize) {
    struct perf_event_header Header;
    memcpy(&Header, DataPtr, sizeof(struct perf_event_header));
    if (Header.type != PERF_RECORD_SAMPLE) {
      // Ignore non-sample records.
      DataPtr += Header.size;
      continue;
    }
    DataPtr += sizeof(Header);
    uint64_t Count = llvm::support::endian::read64(DataPtr, support::native);
    DataPtr += sizeof(Count);

    struct perf_branch_entry Entry;
    memcpy(&Entry, DataPtr, sizeof(struct perf_branch_entry));

    // Read the perf_branch_entry array.
    for (uint64_t i = 0; i < Count; ++i) {
      const uint64_t BlockStart =
          From == nullptr ? std::numeric_limits<uint64_t>::min()
                          : reinterpret_cast<uint64_t>(From);
      const uint64_t BlockEnd = To == nullptr
                                    ? std::numeric_limits<uint64_t>::max()
                                    : reinterpret_cast<uint64_t>(To);

      if (BlockStart <= Entry.from && BlockEnd >= Entry.to)
        CycleArray->push_back(Entry.cycles);

      if (i == Count - 1) // We've reached the last entry.
        return llvm::Error::success();

      // Advance to the next entry.
      DataPtr += sizeof(Entry);
      memcpy(&Entry, DataPtr, sizeof(struct perf_branch_entry));
    }
  }
  return llvm::make_error<llvm::StringError>("Unable to parse data buffer.",
                                             llvm::errc::io_error);
}

X86LbrPerfEvent::X86LbrPerfEvent(unsigned SamplingPeriod) {
  assert(SamplingPeriod > 0 && "SamplingPeriod must be positive");
  EventString = "BR_INST_RETIRED.NEAR_TAKEN";
  Attr = new perf_event_attr();
  Attr->size = sizeof(*Attr);
  Attr->type = PERF_TYPE_RAW;
  // FIXME: This is SKL's encoding. Not sure if it'll change.
  Attr->config = 0x20c4; // BR_INST_RETIRED.NEAR_TAKEN
  Attr->sample_type = PERF_SAMPLE_BRANCH_STACK;
  // No need to specify "USER" because HV and kernel samples are already
  // excluded below.
  Attr->branch_sample_type = PERF_SAMPLE_BRANCH_ANY;
  Attr->sample_period = SamplingPeriod;
  Attr->wakeup_events = 1; // We need this even when using ioctl REFRESH.
  Attr->disabled = 1;
  Attr->exclude_kernel = 1;
  Attr->exclude_hv = 1;
  Attr->read_format = PERF_FORMAT_GROUP;

  FullQualifiedEventString = EventString;
}

X86LbrCounter::X86LbrCounter(pfm::PerfEvent &&NewEvent)
    : Counter(std::move(NewEvent)) {
  MMappedBuffer = mmap(nullptr, kMappedBufferSize, PROT_READ | PROT_WRITE,
                       MAP_SHARED, FileDescriptor, 0);
  if (MMappedBuffer == MAP_FAILED)
    llvm::errs() << "Failed to mmap buffer.";
}

X86LbrCounter::~X86LbrCounter() {
  if (0 != munmap(MMappedBuffer, kMappedBufferSize))
    llvm::errs() << "Failed to munmap buffer.";
}

void X86LbrCounter::start() {
  ioctl(FileDescriptor, PERF_EVENT_IOC_REFRESH, 1024 /* kMaxPollsPerFd */);
}

llvm::Error X86LbrCounter::checkLbrSupport() {
  // Do a sample read and check if the results contain non-zero values.

  X86LbrCounter counter(X86LbrPerfEvent(123));
  counter.start();

  // Prevent the compiler from unrolling the loop and optimizing away the
  // branches. We need at least 16 iterations to fill the LBR.
  int Sum = 0;
  int V = 1;

  volatile int *P = &V;
  auto TimeLimit =
      std::chrono::high_resolution_clock::now() + std::chrono::microseconds(5);
  for (int I = 0;
       I < kLbrEntries || std::chrono::high_resolution_clock::now() < TimeLimit;
       ++I) {
    Sum += *P;
  }

  counter.stop();
  (void)Sum;

  auto ResultOrError = counter.doReadCounter(nullptr, nullptr);
  if (ResultOrError)
    if (!ResultOrError.get().empty())
      // If there is at least one non-zero entry, then LBR is supported.
      for (const int64_t &Value : ResultOrError.get())
        if (Value != 0)
          return Error::success();

  return llvm::make_error<llvm::StringError>(
      "LBR format with cycles is not supported on the host.",
      llvm::errc::not_supported);
}

llvm::Expected<llvm::SmallVector<int64_t, 4>>
X86LbrCounter::readOrError(StringRef FunctionBytes) const {
  // Disable the event before reading.
  ioctl(FileDescriptor, PERF_EVENT_IOC_DISABLE, 0);

  // Find the boundaries of the function so that we can filter the LBRs
  // and keep only the relevant records.
  if (FunctionBytes.empty())
    return llvm::make_error<llvm::StringError>("Empty function bytes",
                                               llvm::errc::invalid_argument);
  const void *From = reinterpret_cast<const void *>(FunctionBytes.data());
  const void *To = reinterpret_cast<const void *>(FunctionBytes.data() +
                                                  FunctionBytes.size());
  return doReadCounter(From, To);
}
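// A minimal sketch (hypothetical helper, not used anywhere in llvm-exegesis):
// the reader side of the perf mmap ring-buffer protocol that doReadCounter
// implements below. The kernel publishes data_head; the reader must issue a
// memory barrier after loading it and before touching the payload. Writing
// data_tail back would tell the kernel how much was consumed; the production
// path can skip that write-back because its callers stop the event before
// reading.
[[maybe_unused]] static size_t
snapshotRingExtent(const volatile perf_event_mmap_page &Page) {
  const uint64_t Head = Page.data_head;
  // Pairs with the kernel's store to data_head.
  std::atomic_thread_fence(std::memory_order_acquire);
  return static_cast<size_t>(Head - Page.data_tail);
}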
llvm::Expected<llvm::SmallVector<int64_t, 4>>
X86LbrCounter::doReadCounter(const void *From, const void *To) const {
  // The max number of time-outs/retries before we give up.
  static constexpr int kMaxTimeouts = 160;

  // Parse the LBR buffer and fill CycleArray with the sequence of cycle
  // counts from the buffer.
  llvm::SmallVector<int64_t, 4> CycleArray;
  auto DataBuf = std::make_unique<char[]>(kDataBufferSize);
  int NumTimeouts = 0;
  int PollResult = 0;

  while (PollResult <= 0) {
    PollResult = pollLbrPerfEvent(FileDescriptor);
    if (PollResult > 0)
      break;
    if (PollResult == -1)
      return llvm::make_error<llvm::StringError>("Cannot poll LBR perf event.",
                                                 llvm::errc::io_error);
    if (NumTimeouts++ >= kMaxTimeouts)
      return llvm::make_error<llvm::StringError>(
          "LBR polling still timed out after the max number of attempts.",
          llvm::errc::device_or_resource_busy);
  }

  struct perf_event_mmap_page Page;
  memcpy(&Page, MMappedBuffer, sizeof(struct perf_event_mmap_page));

  const uint64_t DataTail = Page.data_tail;
  const uint64_t DataHead = Page.data_head;
  // We're supposed to use a barrier after reading data_head.
  std::atomic_thread_fence(std::memory_order_acq_rel);
  const size_t DataSize = DataHead - DataTail;

  if (DataSize > kDataBufferSize)
    return llvm::make_error<llvm::StringError>(
        "DataSize larger than buffer size.", llvm::errc::invalid_argument);

  copyDataBuffer(MMappedBuffer, DataBuf.get(), DataTail, DataSize);
  llvm::Error error =
      parseDataBuffer(DataBuf.get(), DataSize, From, To, &CycleArray);
  if (!error)
    return CycleArray;
  return std::move(error);
}

} // namespace exegesis
} // namespace llvm

#endif // defined(__linux__) && defined(HAVE_LIBPFM) &&
       // defined(LIBPFM_HAS_FIELD_CYCLES)
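// A minimal usage sketch (hypothetical driver, not part of llvm-exegesis):
// how a client is expected to drive X86LbrCounter end to end. The
// SamplingPeriod of 1 and the helper name are assumptions for illustration;
// FunctionBytes must cover exactly the code RunBenchmark executes so that
// readOrError() can filter the LBR records down to the benchmarked region.
// Guarded the same way as the implementation above.
#if defined(__linux__) && defined(HAVE_LIBPFM) &&                              \
    defined(LIBPFM_HAS_FIELD_CYCLES)
namespace llvm {
namespace exegesis {
template <typename Callable>
static Expected<SmallVector<int64_t, 4>>
measureLbrCycles(StringRef FunctionBytes, Callable &&RunBenchmark) {
  X86LbrCounter Counter(X86LbrPerfEvent(/*SamplingPeriod=*/1));
  Counter.start(); // Arms the counter via PERF_EVENT_IOC_REFRESH.
  RunBenchmark();  // Runs the code under measurement.
  Counter.stop();  // Disables the event before the ring buffer is read.
  return Counter.readOrError(FunctionBytes);
}
} // namespace exegesis
} // namespace llvm
#endif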