1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681 |
- //===- MemCpyOptimizer.cpp - Optimize use of memcpy and friends -----------===//
- //
- // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- // See https://llvm.org/LICENSE.txt for license information.
- // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- //
- //===----------------------------------------------------------------------===//
- //
- // This pass performs various transformations related to eliminating memcpy
- // calls, or transforming sets of stores into memset's.
- //
- //===----------------------------------------------------------------------===//
- #include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
- #include "llvm/ADT/DenseSet.h"
- #include "llvm/ADT/None.h"
- #include "llvm/ADT/STLExtras.h"
- #include "llvm/ADT/SmallVector.h"
- #include "llvm/ADT/Statistic.h"
- #include "llvm/ADT/iterator_range.h"
- #include "llvm/Analysis/AliasAnalysis.h"
- #include "llvm/Analysis/AssumptionCache.h"
- #include "llvm/Analysis/CaptureTracking.h"
- #include "llvm/Analysis/GlobalsModRef.h"
- #include "llvm/Analysis/Loads.h"
- #include "llvm/Analysis/MemoryLocation.h"
- #include "llvm/Analysis/MemorySSA.h"
- #include "llvm/Analysis/MemorySSAUpdater.h"
- #include "llvm/Analysis/TargetLibraryInfo.h"
- #include "llvm/Analysis/ValueTracking.h"
- #include "llvm/IR/Argument.h"
- #include "llvm/IR/BasicBlock.h"
- #include "llvm/IR/Constants.h"
- #include "llvm/IR/DataLayout.h"
- #include "llvm/IR/DerivedTypes.h"
- #include "llvm/IR/Dominators.h"
- #include "llvm/IR/Function.h"
- #include "llvm/IR/GetElementPtrTypeIterator.h"
- #include "llvm/IR/GlobalVariable.h"
- #include "llvm/IR/IRBuilder.h"
- #include "llvm/IR/InstrTypes.h"
- #include "llvm/IR/Instruction.h"
- #include "llvm/IR/Instructions.h"
- #include "llvm/IR/IntrinsicInst.h"
- #include "llvm/IR/Intrinsics.h"
- #include "llvm/IR/LLVMContext.h"
- #include "llvm/IR/Module.h"
- #include "llvm/IR/Operator.h"
- #include "llvm/IR/PassManager.h"
- #include "llvm/IR/Type.h"
- #include "llvm/IR/User.h"
- #include "llvm/IR/Value.h"
- #include "llvm/InitializePasses.h"
- #include "llvm/Pass.h"
- #include "llvm/Support/Casting.h"
- #include "llvm/Support/Debug.h"
- #include "llvm/Support/MathExtras.h"
- #include "llvm/Support/raw_ostream.h"
- #include "llvm/Transforms/Scalar.h"
- #include "llvm/Transforms/Utils/Local.h"
- #include <algorithm>
- #include <cassert>
- #include <cstdint>
- #include <utility>
- using namespace llvm;
- #define DEBUG_TYPE "memcpyopt"
- static cl::opt<bool> EnableMemCpyOptWithoutLibcalls(
- "enable-memcpyopt-without-libcalls", cl::init(false), cl::Hidden,
- cl::ZeroOrMore,
- cl::desc("Enable memcpyopt even when libcalls are disabled"));
- STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
- STATISTIC(NumMemSetInfer, "Number of memsets inferred");
- STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
- STATISTIC(NumCpyToSet, "Number of memcpys converted to memset");
- STATISTIC(NumCallSlot, "Number of call slot optimizations performed");
- namespace {
- /// Represents a range of memset'd bytes with the ByteVal value.
- /// This allows us to analyze stores like:
- /// store 0 -> P+1
- /// store 0 -> P+0
- /// store 0 -> P+3
- /// store 0 -> P+2
- /// which sometimes happens with stores to arrays of structs etc. When we see
- /// the first store, we make a range [1, 2). The second store extends the range
- /// to [0, 2). The third makes a new range [2, 3). The fourth store joins the
- /// two ranges into [0, 3) which is memset'able.
- struct MemsetRange {
- // Start/End - A semi range that describes the span that this range covers.
- // The range is closed at the start and open at the end: [Start, End).
- int64_t Start, End;
- /// StartPtr - The getelementptr instruction that points to the start of the
- /// range.
- Value *StartPtr;
- /// Alignment - The known alignment of the first store.
- unsigned Alignment;
- /// TheStores - The actual stores that make up this range.
- SmallVector<Instruction*, 16> TheStores;
- bool isProfitableToUseMemset(const DataLayout &DL) const;
- };
- } // end anonymous namespace
- bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
- // If we found more than 4 stores to merge or 16 bytes, use memset.
- if (TheStores.size() >= 4 || End-Start >= 16) return true;
- // If there is nothing to merge, don't do anything.
- if (TheStores.size() < 2) return false;
- // If any of the stores are a memset, then it is always good to extend the
- // memset.
- for (Instruction *SI : TheStores)
- if (!isa<StoreInst>(SI))
- return true;
- // Assume that the code generator is capable of merging pairs of stores
- // together if it wants to.
- if (TheStores.size() == 2) return false;
- // If we have fewer than 8 stores, it can still be worthwhile to do this.
- // For example, merging 4 i8 stores into an i32 store is useful almost always.
- // However, merging 2 32-bit stores isn't useful on a 32-bit architecture (the
- // memset will be split into 2 32-bit stores anyway) and doing so can
- // pessimize the llvm optimizer.
- //
- // Since we don't have perfect knowledge here, make some assumptions: assume
- // the maximum GPR width is the same size as the largest legal integer
- // size. If so, check to see whether we will end up actually reducing the
- // number of stores used.
- unsigned Bytes = unsigned(End-Start);
- unsigned MaxIntSize = DL.getLargestLegalIntTypeSizeInBits() / 8;
- if (MaxIntSize == 0)
- MaxIntSize = 1;
- unsigned NumPointerStores = Bytes / MaxIntSize;
- // Assume the remaining bytes if any are done a byte at a time.
- unsigned NumByteStores = Bytes % MaxIntSize;
- // If we will reduce the # stores (according to this heuristic), do the
- // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
- // etc.
- return TheStores.size() > NumPointerStores+NumByteStores;
- }
- namespace {
- class MemsetRanges {
- using range_iterator = SmallVectorImpl<MemsetRange>::iterator;
- /// A sorted list of the memset ranges.
- SmallVector<MemsetRange, 8> Ranges;
- const DataLayout &DL;
- public:
- MemsetRanges(const DataLayout &DL) : DL(DL) {}
- using const_iterator = SmallVectorImpl<MemsetRange>::const_iterator;
- const_iterator begin() const { return Ranges.begin(); }
- const_iterator end() const { return Ranges.end(); }
- bool empty() const { return Ranges.empty(); }
- void addInst(int64_t OffsetFromFirst, Instruction *Inst) {
- if (auto *SI = dyn_cast<StoreInst>(Inst))
- addStore(OffsetFromFirst, SI);
- else
- addMemSet(OffsetFromFirst, cast<MemSetInst>(Inst));
- }
- void addStore(int64_t OffsetFromFirst, StoreInst *SI) {
- TypeSize StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType());
- assert(!StoreSize.isScalable() && "Can't track scalable-typed stores");
- addRange(OffsetFromFirst, StoreSize.getFixedSize(), SI->getPointerOperand(),
- SI->getAlign().value(), SI);
- }
- void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) {
- int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue();
- addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getDestAlignment(), MSI);
- }
- void addRange(int64_t Start, int64_t Size, Value *Ptr,
- unsigned Alignment, Instruction *Inst);
- };
- } // end anonymous namespace
- /// Add a new store to the MemsetRanges data structure. This adds a
- /// new range for the specified store at the specified offset, merging into
- /// existing ranges as appropriate.
- void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
- unsigned Alignment, Instruction *Inst) {
- int64_t End = Start+Size;
- range_iterator I = partition_point(
- Ranges, [=](const MemsetRange &O) { return O.End < Start; });
- // We now know that I == E, in which case we didn't find anything to merge
- // with, or that Start <= I->End. If End < I->Start or I == E, then we need
- // to insert a new range. Handle this now.
- if (I == Ranges.end() || End < I->Start) {
- MemsetRange &R = *Ranges.insert(I, MemsetRange());
- R.Start = Start;
- R.End = End;
- R.StartPtr = Ptr;
- R.Alignment = Alignment;
- R.TheStores.push_back(Inst);
- return;
- }
- // This store overlaps with I, add it.
- I->TheStores.push_back(Inst);
- // At this point, we may have an interval that completely contains our store.
- // If so, just add it to the interval and return.
- if (I->Start <= Start && I->End >= End)
- return;
- // Now we know that Start <= I->End and End >= I->Start so the range overlaps
- // but is not entirely contained within the range.
- // See if the range extends the start of the range. In this case, it couldn't
- // possibly cause it to join the prior range, because otherwise we would have
- // stopped on *it*.
- if (Start < I->Start) {
- I->Start = Start;
- I->StartPtr = Ptr;
- I->Alignment = Alignment;
- }
- // Now we know that Start <= I->End and Start >= I->Start (so the startpoint
- // is in or right at the end of I), and that End >= I->Start. Extend I out to
- // End.
- if (End > I->End) {
- I->End = End;
- range_iterator NextI = I;
- while (++NextI != Ranges.end() && End >= NextI->Start) {
- // Merge the range in.
- I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end());
- if (NextI->End > I->End)
- I->End = NextI->End;
- Ranges.erase(NextI);
- NextI = I;
- }
- }
- }
- //===----------------------------------------------------------------------===//
- // MemCpyOptLegacyPass Pass
- //===----------------------------------------------------------------------===//
- namespace {
- class MemCpyOptLegacyPass : public FunctionPass {
- MemCpyOptPass Impl;
- public:
- static char ID; // Pass identification, replacement for typeid
- MemCpyOptLegacyPass() : FunctionPass(ID) {
- initializeMemCpyOptLegacyPassPass(*PassRegistry::getPassRegistry());
- }
- bool runOnFunction(Function &F) override;
- private:
- // This transformation requires dominator postdominator info
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addRequired<MemorySSAWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
- }
- };
- } // end anonymous namespace
- char MemCpyOptLegacyPass::ID = 0;
- /// The public interface to this file...
- FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOptLegacyPass(); }
- INITIALIZE_PASS_BEGIN(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
- false, false)
- INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
- INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
- INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
- INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
- INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
- INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
- INITIALIZE_PASS_END(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
- false, false)
- // Check that V is either not accessible by the caller, or unwinding cannot
- // occur between Start and End.
- static bool mayBeVisibleThroughUnwinding(Value *V, Instruction *Start,
- Instruction *End) {
- assert(Start->getParent() == End->getParent() && "Must be in same block");
- // Function can't unwind, so it also can't be visible through unwinding.
- if (Start->getFunction()->doesNotThrow())
- return false;
- // Object is not visible on unwind.
- // TODO: Support RequiresNoCaptureBeforeUnwind case.
- bool RequiresNoCaptureBeforeUnwind;
- if (isNotVisibleOnUnwind(getUnderlyingObject(V),
- RequiresNoCaptureBeforeUnwind) &&
- !RequiresNoCaptureBeforeUnwind)
- return false;
- // Check whether there are any unwinding instructions in the range.
- return any_of(make_range(Start->getIterator(), End->getIterator()),
- [](const Instruction &I) { return I.mayThrow(); });
- }
- void MemCpyOptPass::eraseInstruction(Instruction *I) {
- MSSAU->removeMemoryAccess(I);
- I->eraseFromParent();
- }
- // Check for mod or ref of Loc between Start and End, excluding both boundaries.
- // Start and End must be in the same block
- static bool accessedBetween(AliasAnalysis &AA, MemoryLocation Loc,
- const MemoryUseOrDef *Start,
- const MemoryUseOrDef *End) {
- assert(Start->getBlock() == End->getBlock() && "Only local supported");
- for (const MemoryAccess &MA :
- make_range(++Start->getIterator(), End->getIterator())) {
- if (isModOrRefSet(AA.getModRefInfo(cast<MemoryUseOrDef>(MA).getMemoryInst(),
- Loc)))
- return true;
- }
- return false;
- }
- // Check for mod of Loc between Start and End, excluding both boundaries.
- // Start and End can be in different blocks.
- static bool writtenBetween(MemorySSA *MSSA, MemoryLocation Loc,
- const MemoryUseOrDef *Start,
- const MemoryUseOrDef *End) {
- // TODO: Only walk until we hit Start.
- MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
- End->getDefiningAccess(), Loc);
- return !MSSA->dominates(Clobber, Start);
- }
- /// When scanning forward over instructions, we look for some other patterns to
- /// fold away. In particular, this looks for stores to neighboring locations of
- /// memory. If it sees enough consecutive ones, it attempts to merge them
- /// together into a memcpy/memset.
- Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
- Value *StartPtr,
- Value *ByteVal) {
- const DataLayout &DL = StartInst->getModule()->getDataLayout();
- // We can't track scalable types
- if (auto *SI = dyn_cast<StoreInst>(StartInst))
- if (DL.getTypeStoreSize(SI->getOperand(0)->getType()).isScalable())
- return nullptr;
- // Okay, so we now have a single store that can be splatable. Scan to find
- // all subsequent stores of the same value to offset from the same pointer.
- // Join these together into ranges, so we can decide whether contiguous blocks
- // are stored.
- MemsetRanges Ranges(DL);
- BasicBlock::iterator BI(StartInst);
- // Keeps track of the last memory use or def before the insertion point for
- // the new memset. The new MemoryDef for the inserted memsets will be inserted
- // after MemInsertPoint. It points to either LastMemDef or to the last user
- // before the insertion point of the memset, if there are any such users.
- MemoryUseOrDef *MemInsertPoint = nullptr;
- // Keeps track of the last MemoryDef between StartInst and the insertion point
- // for the new memset. This will become the defining access of the inserted
- // memsets.
- MemoryDef *LastMemDef = nullptr;
- for (++BI; !BI->isTerminator(); ++BI) {
- auto *CurrentAcc = cast_or_null<MemoryUseOrDef>(
- MSSAU->getMemorySSA()->getMemoryAccess(&*BI));
- if (CurrentAcc) {
- MemInsertPoint = CurrentAcc;
- if (auto *CurrentDef = dyn_cast<MemoryDef>(CurrentAcc))
- LastMemDef = CurrentDef;
- }
- // Calls that only access inaccessible memory do not block merging
- // accessible stores.
- if (auto *CB = dyn_cast<CallBase>(BI)) {
- if (CB->onlyAccessesInaccessibleMemory())
- continue;
- }
- if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) {
- // If the instruction is readnone, ignore it, otherwise bail out. We
- // don't even allow readonly here because we don't want something like:
- // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A).
- if (BI->mayWriteToMemory() || BI->mayReadFromMemory())
- break;
- continue;
- }
- if (auto *NextStore = dyn_cast<StoreInst>(BI)) {
- // If this is a store, see if we can merge it in.
- if (!NextStore->isSimple()) break;
- Value *StoredVal = NextStore->getValueOperand();
- // Don't convert stores of non-integral pointer types to memsets (which
- // stores integers).
- if (DL.isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
- break;
- // We can't track ranges involving scalable types.
- if (DL.getTypeStoreSize(StoredVal->getType()).isScalable())
- break;
- // Check to see if this stored value is of the same byte-splattable value.
- Value *StoredByte = isBytewiseValue(StoredVal, DL);
- if (isa<UndefValue>(ByteVal) && StoredByte)
- ByteVal = StoredByte;
- if (ByteVal != StoredByte)
- break;
- // Check to see if this store is to a constant offset from the start ptr.
- Optional<int64_t> Offset =
- isPointerOffset(StartPtr, NextStore->getPointerOperand(), DL);
- if (!Offset)
- break;
- Ranges.addStore(*Offset, NextStore);
- } else {
- auto *MSI = cast<MemSetInst>(BI);
- if (MSI->isVolatile() || ByteVal != MSI->getValue() ||
- !isa<ConstantInt>(MSI->getLength()))
- break;
- // Check to see if this store is to a constant offset from the start ptr.
- Optional<int64_t> Offset = isPointerOffset(StartPtr, MSI->getDest(), DL);
- if (!Offset)
- break;
- Ranges.addMemSet(*Offset, MSI);
- }
- }
- // If we have no ranges, then we just had a single store with nothing that
- // could be merged in. This is a very common case of course.
- if (Ranges.empty())
- return nullptr;
- // If we had at least one store that could be merged in, add the starting
- // store as well. We try to avoid this unless there is at least something
- // interesting as a small compile-time optimization.
- Ranges.addInst(0, StartInst);
- // If we create any memsets, we put it right before the first instruction that
- // isn't part of the memset block. This ensure that the memset is dominated
- // by any addressing instruction needed by the start of the block.
- IRBuilder<> Builder(&*BI);
- // Now that we have full information about ranges, loop over the ranges and
- // emit memset's for anything big enough to be worthwhile.
- Instruction *AMemSet = nullptr;
- for (const MemsetRange &Range : Ranges) {
- if (Range.TheStores.size() == 1) continue;
- // If it is profitable to lower this range to memset, do so now.
- if (!Range.isProfitableToUseMemset(DL))
- continue;
- // Otherwise, we do want to transform this! Create a new memset.
- // Get the starting pointer of the block.
- StartPtr = Range.StartPtr;
- AMemSet = Builder.CreateMemSet(StartPtr, ByteVal, Range.End - Range.Start,
- MaybeAlign(Range.Alignment));
- LLVM_DEBUG(dbgs() << "Replace stores:\n"; for (Instruction *SI
- : Range.TheStores) dbgs()
- << *SI << '\n';
- dbgs() << "With: " << *AMemSet << '\n');
- if (!Range.TheStores.empty())
- AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());
- assert(LastMemDef && MemInsertPoint &&
- "Both LastMemDef and MemInsertPoint need to be set");
- auto *NewDef =
- cast<MemoryDef>(MemInsertPoint->getMemoryInst() == &*BI
- ? MSSAU->createMemoryAccessBefore(
- AMemSet, LastMemDef, MemInsertPoint)
- : MSSAU->createMemoryAccessAfter(
- AMemSet, LastMemDef, MemInsertPoint));
- MSSAU->insertDef(NewDef, /*RenameUses=*/true);
- LastMemDef = NewDef;
- MemInsertPoint = NewDef;
- // Zap all the stores.
- for (Instruction *SI : Range.TheStores)
- eraseInstruction(SI);
- ++NumMemSetInfer;
- }
- return AMemSet;
- }
- // This method try to lift a store instruction before position P.
- // It will lift the store and its argument + that anything that
- // may alias with these.
- // The method returns true if it was successful.
- bool MemCpyOptPass::moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI) {
- // If the store alias this position, early bail out.
- MemoryLocation StoreLoc = MemoryLocation::get(SI);
- if (isModOrRefSet(AA->getModRefInfo(P, StoreLoc)))
- return false;
- // Keep track of the arguments of all instruction we plan to lift
- // so we can make sure to lift them as well if appropriate.
- DenseSet<Instruction*> Args;
- if (auto *Ptr = dyn_cast<Instruction>(SI->getPointerOperand()))
- if (Ptr->getParent() == SI->getParent())
- Args.insert(Ptr);
- // Instruction to lift before P.
- SmallVector<Instruction *, 8> ToLift{SI};
- // Memory locations of lifted instructions.
- SmallVector<MemoryLocation, 8> MemLocs{StoreLoc};
- // Lifted calls.
- SmallVector<const CallBase *, 8> Calls;
- const MemoryLocation LoadLoc = MemoryLocation::get(LI);
- for (auto I = --SI->getIterator(), E = P->getIterator(); I != E; --I) {
- auto *C = &*I;
- // Make sure hoisting does not perform a store that was not guaranteed to
- // happen.
- if (!isGuaranteedToTransferExecutionToSuccessor(C))
- return false;
- bool MayAlias = isModOrRefSet(AA->getModRefInfo(C, None));
- bool NeedLift = false;
- if (Args.erase(C))
- NeedLift = true;
- else if (MayAlias) {
- NeedLift = llvm::any_of(MemLocs, [C, this](const MemoryLocation &ML) {
- return isModOrRefSet(AA->getModRefInfo(C, ML));
- });
- if (!NeedLift)
- NeedLift = llvm::any_of(Calls, [C, this](const CallBase *Call) {
- return isModOrRefSet(AA->getModRefInfo(C, Call));
- });
- }
- if (!NeedLift)
- continue;
- if (MayAlias) {
- // Since LI is implicitly moved downwards past the lifted instructions,
- // none of them may modify its source.
- if (isModSet(AA->getModRefInfo(C, LoadLoc)))
- return false;
- else if (const auto *Call = dyn_cast<CallBase>(C)) {
- // If we can't lift this before P, it's game over.
- if (isModOrRefSet(AA->getModRefInfo(P, Call)))
- return false;
- Calls.push_back(Call);
- } else if (isa<LoadInst>(C) || isa<StoreInst>(C) || isa<VAArgInst>(C)) {
- // If we can't lift this before P, it's game over.
- auto ML = MemoryLocation::get(C);
- if (isModOrRefSet(AA->getModRefInfo(P, ML)))
- return false;
- MemLocs.push_back(ML);
- } else
- // We don't know how to lift this instruction.
- return false;
- }
- ToLift.push_back(C);
- for (unsigned k = 0, e = C->getNumOperands(); k != e; ++k)
- if (auto *A = dyn_cast<Instruction>(C->getOperand(k))) {
- if (A->getParent() == SI->getParent()) {
- // Cannot hoist user of P above P
- if(A == P) return false;
- Args.insert(A);
- }
- }
- }
- // Find MSSA insertion point. Normally P will always have a corresponding
- // memory access before which we can insert. However, with non-standard AA
- // pipelines, there may be a mismatch between AA and MSSA, in which case we
- // will scan for a memory access before P. In either case, we know for sure
- // that at least the load will have a memory access.
- // TODO: Simplify this once P will be determined by MSSA, in which case the
- // discrepancy can no longer occur.
- MemoryUseOrDef *MemInsertPoint = nullptr;
- if (MemoryUseOrDef *MA = MSSAU->getMemorySSA()->getMemoryAccess(P)) {
- MemInsertPoint = cast<MemoryUseOrDef>(--MA->getIterator());
- } else {
- const Instruction *ConstP = P;
- for (const Instruction &I : make_range(++ConstP->getReverseIterator(),
- ++LI->getReverseIterator())) {
- if (MemoryUseOrDef *MA = MSSAU->getMemorySSA()->getMemoryAccess(&I)) {
- MemInsertPoint = MA;
- break;
- }
- }
- }
- // We made it, we need to lift.
- for (auto *I : llvm::reverse(ToLift)) {
- LLVM_DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n");
- I->moveBefore(P);
- assert(MemInsertPoint && "Must have found insert point");
- if (MemoryUseOrDef *MA = MSSAU->getMemorySSA()->getMemoryAccess(I)) {
- MSSAU->moveAfter(MA, MemInsertPoint);
- MemInsertPoint = MA;
- }
- }
- return true;
- }
- bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
- if (!SI->isSimple()) return false;
- // Avoid merging nontemporal stores since the resulting
- // memcpy/memset would not be able to preserve the nontemporal hint.
- // In theory we could teach how to propagate the !nontemporal metadata to
- // memset calls. However, that change would force the backend to
- // conservatively expand !nontemporal memset calls back to sequences of
- // store instructions (effectively undoing the merging).
- if (SI->getMetadata(LLVMContext::MD_nontemporal))
- return false;
- const DataLayout &DL = SI->getModule()->getDataLayout();
- Value *StoredVal = SI->getValueOperand();
- // Not all the transforms below are correct for non-integral pointers, bail
- // until we've audited the individual pieces.
- if (DL.isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
- return false;
- // Load to store forwarding can be interpreted as memcpy.
- if (auto *LI = dyn_cast<LoadInst>(StoredVal)) {
- if (LI->isSimple() && LI->hasOneUse() &&
- LI->getParent() == SI->getParent()) {
- auto *T = LI->getType();
- // Don't introduce calls to memcpy/memmove intrinsics out of thin air if
- // the corresponding libcalls are not available.
- // TODO: We should really distinguish between libcall availability and
- // our ability to introduce intrinsics.
- if (T->isAggregateType() &&
- (EnableMemCpyOptWithoutLibcalls ||
- (TLI->has(LibFunc_memcpy) && TLI->has(LibFunc_memmove)))) {
- MemoryLocation LoadLoc = MemoryLocation::get(LI);
- // We use alias analysis to check if an instruction may store to
- // the memory we load from in between the load and the store. If
- // such an instruction is found, we try to promote there instead
- // of at the store position.
- // TODO: Can use MSSA for this.
- Instruction *P = SI;
- for (auto &I : make_range(++LI->getIterator(), SI->getIterator())) {
- if (isModSet(AA->getModRefInfo(&I, LoadLoc))) {
- P = &I;
- break;
- }
- }
- // We found an instruction that may write to the loaded memory.
- // We can try to promote at this position instead of the store
- // position if nothing aliases the store memory after this and the store
- // destination is not in the range.
- if (P && P != SI) {
- if (!moveUp(SI, P, LI))
- P = nullptr;
- }
- // If a valid insertion position is found, then we can promote
- // the load/store pair to a memcpy.
- if (P) {
- // If we load from memory that may alias the memory we store to,
- // memmove must be used to preserve semantic. If not, memcpy can
- // be used. Also, if we load from constant memory, memcpy can be used
- // as the constant memory won't be modified.
- bool UseMemMove = false;
- if (isModSet(AA->getModRefInfo(SI, LoadLoc)))
- UseMemMove = true;
- uint64_t Size = DL.getTypeStoreSize(T);
- IRBuilder<> Builder(P);
- Instruction *M;
- if (UseMemMove)
- M = Builder.CreateMemMove(
- SI->getPointerOperand(), SI->getAlign(),
- LI->getPointerOperand(), LI->getAlign(), Size);
- else
- M = Builder.CreateMemCpy(
- SI->getPointerOperand(), SI->getAlign(),
- LI->getPointerOperand(), LI->getAlign(), Size);
- LLVM_DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI << " => "
- << *M << "\n");
- auto *LastDef =
- cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(SI));
- auto *NewAccess = MSSAU->createMemoryAccessAfter(M, LastDef, LastDef);
- MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
- eraseInstruction(SI);
- eraseInstruction(LI);
- ++NumMemCpyInstr;
- // Make sure we do not invalidate the iterator.
- BBI = M->getIterator();
- return true;
- }
- }
- // Detect cases where we're performing call slot forwarding, but
- // happen to be using a load-store pair to implement it, rather than
- // a memcpy.
- CallInst *C = nullptr;
- if (auto *LoadClobber = dyn_cast<MemoryUseOrDef>(
- MSSA->getWalker()->getClobberingMemoryAccess(LI))) {
- // The load most post-dom the call. Limit to the same block for now.
- // TODO: Support non-local call-slot optimization?
- if (LoadClobber->getBlock() == SI->getParent())
- C = dyn_cast_or_null<CallInst>(LoadClobber->getMemoryInst());
- }
- if (C) {
- // Check that nothing touches the dest of the "copy" between
- // the call and the store.
- MemoryLocation StoreLoc = MemoryLocation::get(SI);
- if (accessedBetween(*AA, StoreLoc, MSSA->getMemoryAccess(C),
- MSSA->getMemoryAccess(SI)))
- C = nullptr;
- }
- if (C) {
- bool changed = performCallSlotOptzn(
- LI, SI, SI->getPointerOperand()->stripPointerCasts(),
- LI->getPointerOperand()->stripPointerCasts(),
- DL.getTypeStoreSize(SI->getOperand(0)->getType()),
- commonAlignment(SI->getAlign(), LI->getAlign()), C);
- if (changed) {
- eraseInstruction(SI);
- eraseInstruction(LI);
- ++NumMemCpyInstr;
- return true;
- }
- }
- }
- }
- // The following code creates memset intrinsics out of thin air. Don't do
- // this if the corresponding libfunc is not available.
- // TODO: We should really distinguish between libcall availability and
- // our ability to introduce intrinsics.
- if (!(TLI->has(LibFunc_memset) || EnableMemCpyOptWithoutLibcalls))
- return false;
- // There are two cases that are interesting for this code to handle: memcpy
- // and memset. Right now we only handle memset.
- // Ensure that the value being stored is something that can be memset'able a
- // byte at a time like "0" or "-1" or any width, as well as things like
- // 0xA0A0A0A0 and 0.0.
- auto *V = SI->getOperand(0);
- if (Value *ByteVal = isBytewiseValue(V, DL)) {
- if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(),
- ByteVal)) {
- BBI = I->getIterator(); // Don't invalidate iterator.
- return true;
- }
- // If we have an aggregate, we try to promote it to memset regardless
- // of opportunity for merging as it can expose optimization opportunities
- // in subsequent passes.
- auto *T = V->getType();
- if (T->isAggregateType()) {
- uint64_t Size = DL.getTypeStoreSize(T);
- IRBuilder<> Builder(SI);
- auto *M = Builder.CreateMemSet(SI->getPointerOperand(), ByteVal, Size,
- SI->getAlign());
- LLVM_DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n");
- // The newly inserted memset is immediately overwritten by the original
- // store, so we do not need to rename uses.
- auto *StoreDef = cast<MemoryDef>(MSSA->getMemoryAccess(SI));
- auto *NewAccess = MSSAU->createMemoryAccessBefore(
- M, StoreDef->getDefiningAccess(), StoreDef);
- MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/false);
- eraseInstruction(SI);
- NumMemSetInfer++;
- // Make sure we do not invalidate the iterator.
- BBI = M->getIterator();
- return true;
- }
- }
- return false;
- }
- bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
- // See if there is another memset or store neighboring this memset which
- // allows us to widen out the memset to do a single larger store.
- if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile())
- if (Instruction *I = tryMergingIntoMemset(MSI, MSI->getDest(),
- MSI->getValue())) {
- BBI = I->getIterator(); // Don't invalidate iterator.
- return true;
- }
- return false;
- }
- /// Takes a memcpy and a call that it depends on,
- /// and checks for the possibility of a call slot optimization by having
- /// the call write its result directly into the destination of the memcpy.
- bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
- Instruction *cpyStore, Value *cpyDest,
- Value *cpySrc, TypeSize cpySize,
- Align cpyAlign, CallInst *C) {
- // The general transformation to keep in mind is
- //
- // call @func(..., src, ...)
- // memcpy(dest, src, ...)
- //
- // ->
- //
- // memcpy(dest, src, ...)
- // call @func(..., dest, ...)
- //
- // Since moving the memcpy is technically awkward, we additionally check that
- // src only holds uninitialized values at the moment of the call, meaning that
- // the memcpy can be discarded rather than moved.
- // We can't optimize scalable types.
- if (cpySize.isScalable())
- return false;
- // Lifetime marks shouldn't be operated on.
- if (Function *F = C->getCalledFunction())
- if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start)
- return false;
- // Require that src be an alloca. This simplifies the reasoning considerably.
- auto *srcAlloca = dyn_cast<AllocaInst>(cpySrc);
- if (!srcAlloca)
- return false;
- ConstantInt *srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize());
- if (!srcArraySize)
- return false;
- const DataLayout &DL = cpyLoad->getModule()->getDataLayout();
- uint64_t srcSize = DL.getTypeAllocSize(srcAlloca->getAllocatedType()) *
- srcArraySize->getZExtValue();
- if (cpySize < srcSize)
- return false;
- // Check that accessing the first srcSize bytes of dest will not cause a
- // trap. Otherwise the transform is invalid since it might cause a trap
- // to occur earlier than it otherwise would.
- if (!isDereferenceableAndAlignedPointer(cpyDest, Align(1), APInt(64, cpySize),
- DL, C, DT)) {
- LLVM_DEBUG(dbgs() << "Call Slot: Dest pointer not dereferenceable\n");
- return false;
- }
- // Make sure that nothing can observe cpyDest being written early. There are
- // a number of cases to consider:
- // 1. cpyDest cannot be accessed between C and cpyStore as a precondition of
- // the transform.
- // 2. C itself may not access cpyDest (prior to the transform). This is
- // checked further below.
- // 3. If cpyDest is accessible to the caller of this function (potentially
- // captured and not based on an alloca), we need to ensure that we cannot
- // unwind between C and cpyStore. This is checked here.
- // 4. If cpyDest is potentially captured, there may be accesses to it from
- // another thread. In this case, we need to check that cpyStore is
- // guaranteed to be executed if C is. As it is a non-atomic access, it
- // renders accesses from other threads undefined.
- // TODO: This is currently not checked.
- if (mayBeVisibleThroughUnwinding(cpyDest, C, cpyStore)) {
- LLVM_DEBUG(dbgs() << "Call Slot: Dest may be visible through unwinding");
- return false;
- }
- // Check that dest points to memory that is at least as aligned as src.
- Align srcAlign = srcAlloca->getAlign();
- bool isDestSufficientlyAligned = srcAlign <= cpyAlign;
- // If dest is not aligned enough and we can't increase its alignment then
- // bail out.
- if (!isDestSufficientlyAligned && !isa<AllocaInst>(cpyDest))
- return false;
- // Check that src is not accessed except via the call and the memcpy. This
- // guarantees that it holds only undefined values when passed in (so the final
- // memcpy can be dropped), that it is not read or written between the call and
- // the memcpy, and that writing beyond the end of it is undefined.
- SmallVector<User *, 8> srcUseList(srcAlloca->users());
- while (!srcUseList.empty()) {
- User *U = srcUseList.pop_back_val();
- if (isa<BitCastInst>(U) || isa<AddrSpaceCastInst>(U)) {
- append_range(srcUseList, U->users());
- continue;
- }
- if (const auto *G = dyn_cast<GetElementPtrInst>(U)) {
- if (!G->hasAllZeroIndices())
- return false;
- append_range(srcUseList, U->users());
- continue;
- }
- if (const auto *IT = dyn_cast<IntrinsicInst>(U))
- if (IT->isLifetimeStartOrEnd())
- continue;
- if (U != C && U != cpyLoad)
- return false;
- }
- // Check whether src is captured by the called function, in which case there
- // may be further indirect uses of src.
- bool SrcIsCaptured = any_of(C->args(), [&](Use &U) {
- return U->stripPointerCasts() == cpySrc &&
- !C->doesNotCapture(C->getArgOperandNo(&U));
- });
- // If src is captured, then check whether there are any potential uses of
- // src through the captured pointer before the lifetime of src ends, either
- // due to a lifetime.end or a return from the function.
- if (SrcIsCaptured) {
- // Check that dest is not captured before/at the call. We have already
- // checked that src is not captured before it. If either had been captured,
- // then the call might be comparing the argument against the captured dest
- // or src pointer.
- Value *DestObj = getUnderlyingObject(cpyDest);
- if (!isIdentifiedFunctionLocal(DestObj) ||
- PointerMayBeCapturedBefore(DestObj, /* ReturnCaptures */ true,
- /* StoreCaptures */ true, C, DT,
- /* IncludeI */ true))
- return false;
- MemoryLocation SrcLoc =
- MemoryLocation(srcAlloca, LocationSize::precise(srcSize));
- for (Instruction &I :
- make_range(++C->getIterator(), C->getParent()->end())) {
- // Lifetime of srcAlloca ends at lifetime.end.
- if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
- if (II->getIntrinsicID() == Intrinsic::lifetime_end &&
- II->getArgOperand(1)->stripPointerCasts() == srcAlloca &&
- cast<ConstantInt>(II->getArgOperand(0))->uge(srcSize))
- break;
- }
- // Lifetime of srcAlloca ends at return.
- if (isa<ReturnInst>(&I))
- break;
- // Ignore the direct read of src in the load.
- if (&I == cpyLoad)
- continue;
- // Check whether this instruction may mod/ref src through the captured
- // pointer (we have already any direct mod/refs in the loop above).
- // Also bail if we hit a terminator, as we don't want to scan into other
- // blocks.
- if (isModOrRefSet(AA->getModRefInfo(&I, SrcLoc)) || I.isTerminator())
- return false;
- }
- }
- // Since we're changing the parameter to the callsite, we need to make sure
- // that what would be the new parameter dominates the callsite.
- if (!DT->dominates(cpyDest, C)) {
- // Support moving a constant index GEP before the call.
- auto *GEP = dyn_cast<GetElementPtrInst>(cpyDest);
- if (GEP && GEP->hasAllConstantIndices() &&
- DT->dominates(GEP->getPointerOperand(), C))
- GEP->moveBefore(C);
- else
- return false;
- }
- // In addition to knowing that the call does not access src in some
- // unexpected manner, for example via a global, which we deduce from
- // the use analysis, we also need to know that it does not sneakily
- // access dest. We rely on AA to figure this out for us.
- ModRefInfo MR = AA->getModRefInfo(C, cpyDest, LocationSize::precise(srcSize));
- // If necessary, perform additional analysis.
- if (isModOrRefSet(MR))
- MR = AA->callCapturesBefore(C, cpyDest, LocationSize::precise(srcSize), DT);
- if (isModOrRefSet(MR))
- return false;
- // We can't create address space casts here because we don't know if they're
- // safe for the target.
- if (cpySrc->getType()->getPointerAddressSpace() !=
- cpyDest->getType()->getPointerAddressSpace())
- return false;
- for (unsigned ArgI = 0; ArgI < C->arg_size(); ++ArgI)
- if (C->getArgOperand(ArgI)->stripPointerCasts() == cpySrc &&
- cpySrc->getType()->getPointerAddressSpace() !=
- C->getArgOperand(ArgI)->getType()->getPointerAddressSpace())
- return false;
- // All the checks have passed, so do the transformation.
- bool changedArgument = false;
- for (unsigned ArgI = 0; ArgI < C->arg_size(); ++ArgI)
- if (C->getArgOperand(ArgI)->stripPointerCasts() == cpySrc) {
- Value *Dest = cpySrc->getType() == cpyDest->getType() ? cpyDest
- : CastInst::CreatePointerCast(cpyDest, cpySrc->getType(),
- cpyDest->getName(), C);
- changedArgument = true;
- if (C->getArgOperand(ArgI)->getType() == Dest->getType())
- C->setArgOperand(ArgI, Dest);
- else
- C->setArgOperand(ArgI, CastInst::CreatePointerCast(
- Dest, C->getArgOperand(ArgI)->getType(),
- Dest->getName(), C));
- }
- if (!changedArgument)
- return false;
- // If the destination wasn't sufficiently aligned then increase its alignment.
- if (!isDestSufficientlyAligned) {
- assert(isa<AllocaInst>(cpyDest) && "Can only increase alloca alignment!");
- cast<AllocaInst>(cpyDest)->setAlignment(srcAlign);
- }
- // Update AA metadata
- // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be
- // handled here, but combineMetadata doesn't support them yet
- unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
- LLVMContext::MD_noalias,
- LLVMContext::MD_invariant_group,
- LLVMContext::MD_access_group};
- combineMetadata(C, cpyLoad, KnownIDs, true);
- if (cpyLoad != cpyStore)
- combineMetadata(C, cpyStore, KnownIDs, true);
- ++NumCallSlot;
- return true;
- }
- /// We've found that the (upward scanning) memory dependence of memcpy 'M' is
- /// the memcpy 'MDep'. Try to simplify M to copy from MDep's input if we can.
- bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
- MemCpyInst *MDep) {
- // We can only transforms memcpy's where the dest of one is the source of the
- // other.
- if (M->getSource() != MDep->getDest() || MDep->isVolatile())
- return false;
- // If dep instruction is reading from our current input, then it is a noop
- // transfer and substituting the input won't change this instruction. Just
- // ignore the input and let someone else zap MDep. This handles cases like:
- // memcpy(a <- a)
- // memcpy(b <- a)
- if (M->getSource() == MDep->getSource())
- return false;
- // Second, the length of the memcpy's must be the same, or the preceding one
- // must be larger than the following one.
- if (MDep->getLength() != M->getLength()) {
- auto *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
- auto *MLen = dyn_cast<ConstantInt>(M->getLength());
- if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue())
- return false;
- }
- // Verify that the copied-from memory doesn't change in between the two
- // transfers. For example, in:
- // memcpy(a <- b)
- // *b = 42;
- // memcpy(c <- a)
- // It would be invalid to transform the second memcpy into memcpy(c <- b).
- //
- // TODO: If the code between M and MDep is transparent to the destination "c",
- // then we could still perform the xform by moving M up to the first memcpy.
- // TODO: It would be sufficient to check the MDep source up to the memcpy
- // size of M, rather than MDep.
- if (writtenBetween(MSSA, MemoryLocation::getForSource(MDep),
- MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(M)))
- return false;
- // If the dest of the second might alias the source of the first, then the
- // source and dest might overlap. In addition, if the source of the first
- // points to constant memory, they won't overlap by definition. Otherwise, we
- // still want to eliminate the intermediate value, but we have to generate a
- // memmove instead of memcpy.
- bool UseMemMove = false;
- if (isModSet(AA->getModRefInfo(M, MemoryLocation::getForSource(MDep))))
- UseMemMove = true;
- // If all checks passed, then we can transform M.
- LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy->memcpy src:\n"
- << *MDep << '\n' << *M << '\n');
- // TODO: Is this worth it if we're creating a less aligned memcpy? For
- // example we could be moving from movaps -> movq on x86.
- IRBuilder<> Builder(M);
- Instruction *NewM;
- if (UseMemMove)
- NewM = Builder.CreateMemMove(M->getRawDest(), M->getDestAlign(),
- MDep->getRawSource(), MDep->getSourceAlign(),
- M->getLength(), M->isVolatile());
- else if (isa<MemCpyInlineInst>(M)) {
- // llvm.memcpy may be promoted to llvm.memcpy.inline, but the converse is
- // never allowed since that would allow the latter to be lowered as a call
- // to an external function.
- NewM = Builder.CreateMemCpyInline(
- M->getRawDest(), M->getDestAlign(), MDep->getRawSource(),
- MDep->getSourceAlign(), M->getLength(), M->isVolatile());
- } else
- NewM = Builder.CreateMemCpy(M->getRawDest(), M->getDestAlign(),
- MDep->getRawSource(), MDep->getSourceAlign(),
- M->getLength(), M->isVolatile());
- assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M)));
- auto *LastDef = cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M));
- auto *NewAccess = MSSAU->createMemoryAccessAfter(NewM, LastDef, LastDef);
- MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
- // Remove the instruction we're replacing.
- eraseInstruction(M);
- ++NumMemCpyInstr;
- return true;
- }
- /// We've found that the (upward scanning) memory dependence of \p MemCpy is
- /// \p MemSet. Try to simplify \p MemSet to only set the trailing bytes that
- /// weren't copied over by \p MemCpy.
- ///
- /// In other words, transform:
- /// \code
- /// memset(dst, c, dst_size);
- /// memcpy(dst, src, src_size);
- /// \endcode
- /// into:
- /// \code
- /// memcpy(dst, src, src_size);
- /// memset(dst + src_size, c, dst_size <= src_size ? 0 : dst_size - src_size);
- /// \endcode
- bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
- MemSetInst *MemSet) {
- // We can only transform memset/memcpy with the same destination.
- if (!AA->isMustAlias(MemSet->getDest(), MemCpy->getDest()))
- return false;
- // Check that src and dst of the memcpy aren't the same. While memcpy
- // operands cannot partially overlap, exact equality is allowed.
- if (isModSet(AA->getModRefInfo(MemCpy, MemoryLocation::getForSource(MemCpy))))
- return false;
- // We know that dst up to src_size is not written. We now need to make sure
- // that dst up to dst_size is not accessed. (If we did not move the memset,
- // checking for reads would be sufficient.)
- if (accessedBetween(*AA, MemoryLocation::getForDest(MemSet),
- MSSA->getMemoryAccess(MemSet),
- MSSA->getMemoryAccess(MemCpy)))
- return false;
- // Use the same i8* dest as the memcpy, killing the memset dest if different.
- Value *Dest = MemCpy->getRawDest();
- Value *DestSize = MemSet->getLength();
- Value *SrcSize = MemCpy->getLength();
- if (mayBeVisibleThroughUnwinding(Dest, MemSet, MemCpy))
- return false;
- // If the sizes are the same, simply drop the memset instead of generating
- // a replacement with zero size.
- if (DestSize == SrcSize) {
- eraseInstruction(MemSet);
- return true;
- }
- // By default, create an unaligned memset.
- unsigned Align = 1;
- // If Dest is aligned, and SrcSize is constant, use the minimum alignment
- // of the sum.
- const unsigned DestAlign =
- std::max(MemSet->getDestAlignment(), MemCpy->getDestAlignment());
- if (DestAlign > 1)
- if (auto *SrcSizeC = dyn_cast<ConstantInt>(SrcSize))
- Align = MinAlign(SrcSizeC->getZExtValue(), DestAlign);
- IRBuilder<> Builder(MemCpy);
- // If the sizes have different types, zext the smaller one.
- if (DestSize->getType() != SrcSize->getType()) {
- if (DestSize->getType()->getIntegerBitWidth() >
- SrcSize->getType()->getIntegerBitWidth())
- SrcSize = Builder.CreateZExt(SrcSize, DestSize->getType());
- else
- DestSize = Builder.CreateZExt(DestSize, SrcSize->getType());
- }
- Value *Ule = Builder.CreateICmpULE(DestSize, SrcSize);
- Value *SizeDiff = Builder.CreateSub(DestSize, SrcSize);
- Value *MemsetLen = Builder.CreateSelect(
- Ule, ConstantInt::getNullValue(DestSize->getType()), SizeDiff);
- unsigned DestAS = Dest->getType()->getPointerAddressSpace();
- Instruction *NewMemSet = Builder.CreateMemSet(
- Builder.CreateGEP(Builder.getInt8Ty(),
- Builder.CreatePointerCast(Dest,
- Builder.getInt8PtrTy(DestAS)),
- SrcSize),
- MemSet->getOperand(1), MemsetLen, MaybeAlign(Align));
- assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy)) &&
- "MemCpy must be a MemoryDef");
- // The new memset is inserted after the memcpy, but it is known that its
- // defining access is the memset about to be removed which immediately
- // precedes the memcpy.
- auto *LastDef =
- cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy));
- auto *NewAccess = MSSAU->createMemoryAccessBefore(
- NewMemSet, LastDef->getDefiningAccess(), LastDef);
- MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
- eraseInstruction(MemSet);
- return true;
- }
- /// Determine whether the instruction has undefined content for the given Size,
- /// either because it was freshly alloca'd or started its lifetime.
- static bool hasUndefContents(MemorySSA *MSSA, AliasAnalysis *AA, Value *V,
- MemoryDef *Def, Value *Size) {
- if (MSSA->isLiveOnEntryDef(Def))
- return isa<AllocaInst>(getUnderlyingObject(V));
- if (auto *II = dyn_cast_or_null<IntrinsicInst>(Def->getMemoryInst())) {
- if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
- auto *LTSize = cast<ConstantInt>(II->getArgOperand(0));
- if (auto *CSize = dyn_cast<ConstantInt>(Size)) {
- if (AA->isMustAlias(V, II->getArgOperand(1)) &&
- LTSize->getZExtValue() >= CSize->getZExtValue())
- return true;
- }
- // If the lifetime.start covers a whole alloca (as it almost always
- // does) and we're querying a pointer based on that alloca, then we know
- // the memory is definitely undef, regardless of how exactly we alias.
- // The size also doesn't matter, as an out-of-bounds access would be UB.
- if (auto *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(V))) {
- if (getUnderlyingObject(II->getArgOperand(1)) == Alloca) {
- const DataLayout &DL = Alloca->getModule()->getDataLayout();
- if (Optional<TypeSize> AllocaSize =
- Alloca->getAllocationSizeInBits(DL))
- if (*AllocaSize == LTSize->getValue() * 8)
- return true;
- }
- }
- }
- }
- return false;
- }
- /// Transform memcpy to memset when its source was just memset.
- /// In other words, turn:
- /// \code
- /// memset(dst1, c, dst1_size);
- /// memcpy(dst2, dst1, dst2_size);
- /// \endcode
- /// into:
- /// \code
- /// memset(dst1, c, dst1_size);
- /// memset(dst2, c, dst2_size);
- /// \endcode
- /// When dst2_size <= dst1_size.
- bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
- MemSetInst *MemSet) {
- // Make sure that memcpy(..., memset(...), ...), that is we are memsetting and
- // memcpying from the same address. Otherwise it is hard to reason about.
- if (!AA->isMustAlias(MemSet->getRawDest(), MemCpy->getRawSource()))
- return false;
- Value *MemSetSize = MemSet->getLength();
- Value *CopySize = MemCpy->getLength();
- if (MemSetSize != CopySize) {
- // Make sure the memcpy doesn't read any more than what the memset wrote.
- // Don't worry about sizes larger than i64.
- // A known memset size is required.
- auto *CMemSetSize = dyn_cast<ConstantInt>(MemSetSize);
- if (!CMemSetSize)
- return false;
- // A known memcpy size is also required.
- auto *CCopySize = dyn_cast<ConstantInt>(CopySize);
- if (!CCopySize)
- return false;
- if (CCopySize->getZExtValue() > CMemSetSize->getZExtValue()) {
- // If the memcpy is larger than the memset, but the memory was undef prior
- // to the memset, we can just ignore the tail. Technically we're only
- // interested in the bytes from MemSetSize..CopySize here, but as we can't
- // easily represent this location, we use the full 0..CopySize range.
- MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MemCpy);
- bool CanReduceSize = false;
- MemoryUseOrDef *MemSetAccess = MSSA->getMemoryAccess(MemSet);
- MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
- MemSetAccess->getDefiningAccess(), MemCpyLoc);
- if (auto *MD = dyn_cast<MemoryDef>(Clobber))
- if (hasUndefContents(MSSA, AA, MemCpy->getSource(), MD, CopySize))
- CanReduceSize = true;
- if (!CanReduceSize)
- return false;
- CopySize = MemSetSize;
- }
- }
- IRBuilder<> Builder(MemCpy);
- Instruction *NewM =
- Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1),
- CopySize, MaybeAlign(MemCpy->getDestAlignment()));
- auto *LastDef =
- cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy));
- auto *NewAccess = MSSAU->createMemoryAccessAfter(NewM, LastDef, LastDef);
- MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
- return true;
- }
- /// Perform simplification of memcpy's. If we have memcpy A
- /// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
- /// B to be a memcpy from X to Z (or potentially a memmove, depending on
- /// circumstances). This allows later passes to remove the first memcpy
- /// altogether.
- bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
- // We can only optimize non-volatile memcpy's.
- if (M->isVolatile()) return false;
- // If the source and destination of the memcpy are the same, then zap it.
- if (M->getSource() == M->getDest()) {
- ++BBI;
- eraseInstruction(M);
- return true;
- }
- // If copying from a constant, try to turn the memcpy into a memset.
- if (auto *GV = dyn_cast<GlobalVariable>(M->getSource()))
- if (GV->isConstant() && GV->hasDefinitiveInitializer())
- if (Value *ByteVal = isBytewiseValue(GV->getInitializer(),
- M->getModule()->getDataLayout())) {
- IRBuilder<> Builder(M);
- Instruction *NewM =
- Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(),
- MaybeAlign(M->getDestAlignment()), false);
- auto *LastDef =
- cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M));
- auto *NewAccess =
- MSSAU->createMemoryAccessAfter(NewM, LastDef, LastDef);
- MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
- eraseInstruction(M);
- ++NumCpyToSet;
- return true;
- }
- MemoryUseOrDef *MA = MSSA->getMemoryAccess(M);
- MemoryAccess *AnyClobber = MSSA->getWalker()->getClobberingMemoryAccess(MA);
- MemoryLocation DestLoc = MemoryLocation::getForDest(M);
- const MemoryAccess *DestClobber =
- MSSA->getWalker()->getClobberingMemoryAccess(AnyClobber, DestLoc);
- // Try to turn a partially redundant memset + memcpy into
- // memcpy + smaller memset. We don't need the memcpy size for this.
- // The memcpy most post-dom the memset, so limit this to the same basic
- // block. A non-local generalization is likely not worthwhile.
- if (auto *MD = dyn_cast<MemoryDef>(DestClobber))
- if (auto *MDep = dyn_cast_or_null<MemSetInst>(MD->getMemoryInst()))
- if (DestClobber->getBlock() == M->getParent())
- if (processMemSetMemCpyDependence(M, MDep))
- return true;
- MemoryAccess *SrcClobber = MSSA->getWalker()->getClobberingMemoryAccess(
- AnyClobber, MemoryLocation::getForSource(M));
- // There are four possible optimizations we can do for memcpy:
- // a) memcpy-memcpy xform which exposes redundance for DSE.
- // b) call-memcpy xform for return slot optimization.
- // c) memcpy from freshly alloca'd space or space that has just started
- // its lifetime copies undefined data, and we can therefore eliminate
- // the memcpy in favor of the data that was already at the destination.
- // d) memcpy from a just-memset'd source can be turned into memset.
- if (auto *MD = dyn_cast<MemoryDef>(SrcClobber)) {
- if (Instruction *MI = MD->getMemoryInst()) {
- if (auto *CopySize = dyn_cast<ConstantInt>(M->getLength())) {
- if (auto *C = dyn_cast<CallInst>(MI)) {
- // The memcpy must post-dom the call. Limit to the same block for
- // now. Additionally, we need to ensure that there are no accesses
- // to dest between the call and the memcpy. Accesses to src will be
- // checked by performCallSlotOptzn().
- // TODO: Support non-local call-slot optimization?
- if (C->getParent() == M->getParent() &&
- !accessedBetween(*AA, DestLoc, MD, MA)) {
- // FIXME: Can we pass in either of dest/src alignment here instead
- // of conservatively taking the minimum?
- Align Alignment = std::min(M->getDestAlign().valueOrOne(),
- M->getSourceAlign().valueOrOne());
- if (performCallSlotOptzn(
- M, M, M->getDest(), M->getSource(),
- TypeSize::getFixed(CopySize->getZExtValue()), Alignment,
- C)) {
- LLVM_DEBUG(dbgs() << "Performed call slot optimization:\n"
- << " call: " << *C << "\n"
- << " memcpy: " << *M << "\n");
- eraseInstruction(M);
- ++NumMemCpyInstr;
- return true;
- }
- }
- }
- }
- if (auto *MDep = dyn_cast<MemCpyInst>(MI))
- return processMemCpyMemCpyDependence(M, MDep);
- if (auto *MDep = dyn_cast<MemSetInst>(MI)) {
- if (performMemCpyToMemSetOptzn(M, MDep)) {
- LLVM_DEBUG(dbgs() << "Converted memcpy to memset\n");
- eraseInstruction(M);
- ++NumCpyToSet;
- return true;
- }
- }
- }
- if (hasUndefContents(MSSA, AA, M->getSource(), MD, M->getLength())) {
- LLVM_DEBUG(dbgs() << "Removed memcpy from undef\n");
- eraseInstruction(M);
- ++NumMemCpyInstr;
- return true;
- }
- }
- return false;
- }
- /// Transforms memmove calls to memcpy calls when the src/dst are guaranteed
- /// not to alias.
- bool MemCpyOptPass::processMemMove(MemMoveInst *M) {
- // See if the source could be modified by this memmove potentially.
- if (isModSet(AA->getModRefInfo(M, MemoryLocation::getForSource(M))))
- return false;
- LLVM_DEBUG(dbgs() << "MemCpyOptPass: Optimizing memmove -> memcpy: " << *M
- << "\n");
- // If not, then we know we can transform this.
- Type *ArgTys[3] = { M->getRawDest()->getType(),
- M->getRawSource()->getType(),
- M->getLength()->getType() };
- M->setCalledFunction(Intrinsic::getDeclaration(M->getModule(),
- Intrinsic::memcpy, ArgTys));
- // For MemorySSA nothing really changes (except that memcpy may imply stricter
- // aliasing guarantees).
- ++NumMoveToCpy;
- return true;
- }
- /// This is called on every byval argument in call sites.
- bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
- const DataLayout &DL = CB.getCaller()->getParent()->getDataLayout();
- // Find out what feeds this byval argument.
- Value *ByValArg = CB.getArgOperand(ArgNo);
- Type *ByValTy = CB.getParamByValType(ArgNo);
- TypeSize ByValSize = DL.getTypeAllocSize(ByValTy);
- MemoryLocation Loc(ByValArg, LocationSize::precise(ByValSize));
- MemoryUseOrDef *CallAccess = MSSA->getMemoryAccess(&CB);
- if (!CallAccess)
- return false;
- MemCpyInst *MDep = nullptr;
- MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
- CallAccess->getDefiningAccess(), Loc);
- if (auto *MD = dyn_cast<MemoryDef>(Clobber))
- MDep = dyn_cast_or_null<MemCpyInst>(MD->getMemoryInst());
- // If the byval argument isn't fed by a memcpy, ignore it. If it is fed by
- // a memcpy, see if we can byval from the source of the memcpy instead of the
- // result.
- if (!MDep || MDep->isVolatile() ||
- ByValArg->stripPointerCasts() != MDep->getDest())
- return false;
- // The length of the memcpy must be larger or equal to the size of the byval.
- auto *C1 = dyn_cast<ConstantInt>(MDep->getLength());
- if (!C1 || !TypeSize::isKnownGE(
- TypeSize::getFixed(C1->getValue().getZExtValue()), ByValSize))
- return false;
- // Get the alignment of the byval. If the call doesn't specify the alignment,
- // then it is some target specific value that we can't know.
- MaybeAlign ByValAlign = CB.getParamAlign(ArgNo);
- if (!ByValAlign) return false;
- // If it is greater than the memcpy, then we check to see if we can force the
- // source of the memcpy to the alignment we need. If we fail, we bail out.
- MaybeAlign MemDepAlign = MDep->getSourceAlign();
- if ((!MemDepAlign || *MemDepAlign < *ByValAlign) &&
- getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL, &CB, AC,
- DT) < *ByValAlign)
- return false;
- // The address space of the memcpy source must match the byval argument
- if (MDep->getSource()->getType()->getPointerAddressSpace() !=
- ByValArg->getType()->getPointerAddressSpace())
- return false;
- // Verify that the copied-from memory doesn't change in between the memcpy and
- // the byval call.
- // memcpy(a <- b)
- // *b = 42;
- // foo(*a)
- // It would be invalid to transform the second memcpy into foo(*b).
- if (writtenBetween(MSSA, MemoryLocation::getForSource(MDep),
- MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(&CB)))
- return false;
- Value *TmpCast = MDep->getSource();
- if (MDep->getSource()->getType() != ByValArg->getType()) {
- BitCastInst *TmpBitCast = new BitCastInst(MDep->getSource(), ByValArg->getType(),
- "tmpcast", &CB);
- // Set the tmpcast's DebugLoc to MDep's
- TmpBitCast->setDebugLoc(MDep->getDebugLoc());
- TmpCast = TmpBitCast;
- }
- LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval:\n"
- << " " << *MDep << "\n"
- << " " << CB << "\n");
- // Otherwise we're good! Update the byval argument.
- CB.setArgOperand(ArgNo, TmpCast);
- ++NumMemCpyInstr;
- return true;
- }
- /// Executes one iteration of MemCpyOptPass.
- bool MemCpyOptPass::iterateOnFunction(Function &F) {
- bool MadeChange = false;
- // Walk all instruction in the function.
- for (BasicBlock &BB : F) {
- // Skip unreachable blocks. For example processStore assumes that an
- // instruction in a BB can't be dominated by a later instruction in the
- // same BB (which is a scenario that can happen for an unreachable BB that
- // has itself as a predecessor).
- if (!DT->isReachableFromEntry(&BB))
- continue;
- for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
- // Avoid invalidating the iterator.
- Instruction *I = &*BI++;
- bool RepeatInstruction = false;
- if (auto *SI = dyn_cast<StoreInst>(I))
- MadeChange |= processStore(SI, BI);
- else if (auto *M = dyn_cast<MemSetInst>(I))
- RepeatInstruction = processMemSet(M, BI);
- else if (auto *M = dyn_cast<MemCpyInst>(I))
- RepeatInstruction = processMemCpy(M, BI);
- else if (auto *M = dyn_cast<MemMoveInst>(I))
- RepeatInstruction = processMemMove(M);
- else if (auto *CB = dyn_cast<CallBase>(I)) {
- for (unsigned i = 0, e = CB->arg_size(); i != e; ++i)
- if (CB->isByValArgument(i))
- MadeChange |= processByValArgument(*CB, i);
- }
- // Reprocess the instruction if desired.
- if (RepeatInstruction) {
- if (BI != BB.begin())
- --BI;
- MadeChange = true;
- }
- }
- }
- return MadeChange;
- }
- PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto *AA = &AM.getResult<AAManager>(F);
- auto *AC = &AM.getResult<AssumptionAnalysis>(F);
- auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
- auto *MSSA = &AM.getResult<MemorySSAAnalysis>(F);
- bool MadeChange = runImpl(F, &TLI, AA, AC, DT, &MSSA->getMSSA());
- if (!MadeChange)
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<MemorySSAAnalysis>();
- return PA;
- }
- bool MemCpyOptPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
- AliasAnalysis *AA_, AssumptionCache *AC_,
- DominatorTree *DT_, MemorySSA *MSSA_) {
- bool MadeChange = false;
- TLI = TLI_;
- AA = AA_;
- AC = AC_;
- DT = DT_;
- MSSA = MSSA_;
- MemorySSAUpdater MSSAU_(MSSA_);
- MSSAU = &MSSAU_;
- while (true) {
- if (!iterateOnFunction(F))
- break;
- MadeChange = true;
- }
- if (VerifyMemorySSA)
- MSSA_->verifyMemorySSA();
- return MadeChange;
- }
- /// This is the main transformation entry point for a function.
- bool MemCpyOptLegacyPass::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
- auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
- return Impl.runImpl(F, TLI, AA, AC, DT, MSSA);
- }
|