Skip to content

Commit

Permalink
Add patch D28786
Browse files Browse the repository at this point in the history
  • Loading branch information
Keno committed Jan 16, 2017
1 parent 840b10f commit d006cc4
Show file tree
Hide file tree
Showing 2 changed files with 318 additions and 0 deletions.
1 change: 1 addition & 0 deletions deps/llvm.mk
Original file line number Diff line number Diff line change
Expand Up @@ -511,6 +511,7 @@ $(eval $(call LLVM_PATCH,llvm-PR277939)) # Issue #19976, Remove for 4.0
$(eval $(call LLVM_PATCH,llvm-PR278321)) # Issue #19976, Remove for 4.0
$(eval $(call LLVM_PATCH,llvm-PR278923)) # Issue #19976, Remove for 4.0
$(eval $(call LLVM_PATCH,llvm-D28759-loopclearance))
$(eval $(call LLVM_PATCH,llvm-D28786-callclearance))
endif # LLVM_VER

ifeq ($(LLVM_VER),3.7.1)
Expand Down
317 changes: 317 additions & 0 deletions deps/patches/llvm-D28786-callclearance.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,317 @@
From 33f8c8a6e85e395b2ffcf8fda199be151976d02f Mon Sep 17 00:00:00 2001
From: Keno Fischer <keno@juliacomputing.com>
Date: Mon, 16 Jan 2017 18:40:28 -0500
Subject: [PATCH] [ExecutionDepsFix] Kill clearance at function calls

Summary:
This is a follow up to D28759 and together with that commit fixes
almost all (maybe all, pending another look at the benchmarks) of
the benchmarks that regressed due to rL278321 (while keeping the
performance enhancements in cases where rL278321 was beneficial).

Prior to this commit, the analysis would simply ignore any function
calls for the clearance calulation, causing incorrect results after
any function call (for the benchmarks that regressed rL278321 just
happened to pick a register that was worse than the xmm0 default).
With this patch, we kill clearance for all registers when a function
call occurs. This is obviously more pessimistic than reality in a lot
of cases (if the callee function doesn't def some of the registers),
but in most cases having to insert an extra xorps even if unnecessary
is better than taking the 3-5x performance penalty of picking the wrong
register.

As an added optimization, also make sure to only insert one dependency
breaking instruction if there's multiple undef reads from the same
register.

Reviewers: MatzeB, myatsina, mkuper, atrick

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D28786
---
lib/CodeGen/ExecutionDepsFix.cpp | 125 +++++++++++++++++++++++++++---------
test/CodeGen/X86/break-false-dep.ll | 33 ++++++++++
test/CodeGen/X86/half.ll | 1 +
3 files changed, 127 insertions(+), 32 deletions(-)

diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp
index 6ac1db4..18c5eab 100644
--- a/lib/CodeGen/ExecutionDepsFix.cpp
+++ b/lib/CodeGen/ExecutionDepsFix.cpp
@@ -164,7 +164,8 @@ class ExeDepsFix : public MachineFunctionPass {
MBBInfoMap MBBInfos;

/// List of undefined register reads in this block in forward order.
- std::vector<std::pair<MachineInstr*, unsigned> > UndefReads;
+ std::vector<std::tuple<MachineInstr *, unsigned, unsigned, unsigned>>
+ UndefReads;

/// Storage for register unit liveness.
LivePhysRegs LiveRegSet;
@@ -214,12 +215,12 @@ private:
bool isBlockDone(MachineBasicBlock *);
void processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass, bool Done);
void updateSuccessors(MachineBasicBlock *MBB, bool Primary, bool Done);
- bool visitInstr(MachineInstr *);
+ bool visitInstr(MachineInstr *, bool PrimaryPass);
void processDefs(MachineInstr *, bool BlockDone, bool Kill);
void visitSoftInstr(MachineInstr*, unsigned mask);
void visitHardInstr(MachineInstr*, unsigned domain);
- void pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
- unsigned Pref);
+ unsigned pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
+ unsigned Pref, bool &TrueDependency);
bool shouldBreakDependence(MachineInstr*, unsigned OpIdx, unsigned Pref);
void processUndefReads(MachineBasicBlock*);
};
@@ -470,24 +471,37 @@ void ExeDepsFix::leaveBasicBlock(MachineBasicBlock *MBB) {
LiveRegs = nullptr;
}

-bool ExeDepsFix::visitInstr(MachineInstr *MI) {
- // Update instructions with explicit execution domains.
- std::pair<uint16_t, uint16_t> DomP = TII->getExecutionDomain(*MI);
- if (DomP.first) {
- if (DomP.second)
- visitSoftInstr(MI, DomP.second);
- else
- visitHardInstr(MI, DomP.first);
+bool ExeDepsFix::visitInstr(MachineInstr *MI, bool PrimaryPass) {
+ bool Kill = false;
+
+ if (PrimaryPass) {
+ // Update instructions with explicit execution domains.
+ std::pair<uint16_t, uint16_t> DomP = TII->getExecutionDomain(*MI);
+ if (DomP.first) {
+ if (DomP.second)
+ visitSoftInstr(MI, DomP.second);
+ else
+ visitHardInstr(MI, DomP.first);
+ }
+ Kill = !DomP.first;
+ }
+
+ // If this is a call, pretend all registers we are considering are def'd here.
+ // We have no idea which registers the callee may use.
+ if (MI->isCall()) {
+ for (unsigned i = 0, e = NumRegs; i != e; ++i)
+ LiveRegs[i].Def = CurInstr;
}

- return !DomP.first;
+ return Kill;
}

/// \brief Helps avoid false dependencies on undef registers by updating the
/// machine instructions' undef operand to use a register that the instruction
/// is truly dependent on, or use a register with clearance higher than Pref.
-void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
- unsigned Pref) {
+unsigned ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
+ unsigned Pref,
+ bool &TrueDependency) {
MachineOperand &MO = MI->getOperand(OpIdx);
assert(MO.isUndef() && "Expected undef machine operand");

@@ -495,7 +509,7 @@ void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,

// Update only undef operands that are mapped to one register.
if (AliasMap[OriginalReg].size() != 1)
- return;
+ return (unsigned)-1;

// Get the undef operand's register class
const TargetRegisterClass *OpRC =
@@ -510,7 +524,8 @@ void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
// We found a true dependency - replace the undef register with the true
// dependency.
MO.setReg(CurrMO.getReg());
- return;
+ TrueDependency = true;
+ return (unsigned)-1;
}

// Go over all registers in the register class and find the register with
@@ -535,6 +550,8 @@ void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
// Update the operand if we found a register with better clearance.
if (MaxClearanceReg != OriginalReg)
MO.setReg(MaxClearanceReg);
+
+ return MaxClearance;
}

/// \brief Return true to if it makes sense to break dependence on a partial def
@@ -571,9 +588,16 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool BlockDone, bool Kill) {
if (BlockDone) {
unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI);
if (Pref) {
- pickBestRegisterForUndef(MI, OpNum, Pref);
- if (shouldBreakDependence(MI, OpNum, Pref))
- UndefReads.push_back(std::make_pair(MI, OpNum));
+ bool TrueDependency = false;
+ unsigned MaxClearance =
+ pickBestRegisterForUndef(MI, OpNum, Pref, TrueDependency);
+ // Don't bother adding true dependencies to UndefReads. All we'd find out
+ // is that the register is live (since this very instruction depends on
+ // it), so we can't do anything.
+ if (!TrueDependency && shouldBreakDependence(MI, OpNum, Pref)) {
+ UndefReads.push_back(
+ std::make_tuple(MI, OpNum, MaxClearance, CurInstr));
+ }
}
}
const MCInstrDesc &MCID = MI->getDesc();
@@ -619,31 +643,70 @@ void ExeDepsFix::processUndefReads(MachineBasicBlock *MBB) {
if (UndefReads.empty())
return;

+ // We will make two passes through the list of undef regs. The first
+ // (backward) to determine legality of inserting the dependency breaking
+ // instruction, the second (forward) to do the actual insertion. This allows
+ // us to avoid inserting the dependency breaking instruction twice, if an
+ // earlier such instruction also breaks the dependency for the latter (e.g.
+ // because there's multiple undef reads from the same register in close
+ // succession).
+
+ //// First pass
// Collect this block's live out register units.
LiveRegSet.init(TRI);
// We do not need to care about pristine registers as they are just preserved
// but not actually used in the function.
LiveRegSet.addLiveOutsNoPristines(*MBB);

- MachineInstr *UndefMI = UndefReads.back().first;
- unsigned OpIdx = UndefReads.back().second;
+ size_t i = UndefReads.size();
+ MachineInstr *UndefMI = std::get<0>(UndefReads[i]);

for (MachineInstr &I : make_range(MBB->rbegin(), MBB->rend())) {
// Update liveness, including the current instruction's defs.
LiveRegSet.stepBackward(I);

if (UndefMI == &I) {
- if (!LiveRegSet.contains(UndefMI->getOperand(OpIdx).getReg()))
- TII->breakPartialRegDependency(*UndefMI, OpIdx, TRI);
+ unsigned OpIdx = std::get<1>(UndefReads[i]);
+ if (LiveRegSet.contains(UndefMI->getOperand(OpIdx).getReg())) {
+ // This is illegal. Null out the Instruction, so we now to skip this
+ // on the next pass
+ std::get<0>(UndefReads[i]) = nullptr;
+ }

- UndefReads.pop_back();
- if (UndefReads.empty())
- return;
+ if (i == 0)
+ break;
+ UndefMI = std::get<0>(UndefReads[--i]);
+ }
+ }

- UndefMI = UndefReads.back().first;
- OpIdx = UndefReads.back().second;
+ //// Second pass
+ // Keep track of the instruction number at which we broke the instruction.
+ // If the clearance for a later instruction is larger than the
+ // difference between that instruction and where we broke the dependence,
+ // then the first insertion also breaks the dependence for the second
+ // instruction.
+ unsigned *BrokeDependenceAt = new unsigned[NumRegs];
+ memset(BrokeDependenceAt, (unsigned)-1, sizeof(unsigned) * NumRegs);
+ for (auto ToBreak : UndefReads) {
+ MachineInstr *MI = std::get<0>(ToBreak);
+ unsigned OpIdx = std::get<1>(ToBreak);
+ unsigned Clearance = std::get<2>(ToBreak);
+ unsigned Here = std::get<3>(ToBreak);
+ if (!MI)
+ continue;
+ for (int rx : regIndices(MI->getOperand(OpIdx).getReg())) {
+ if (BrokeDependenceAt[rx] != (unsigned)-1 && Clearance != (unsigned)-1) {
+ unsigned There = BrokeDependenceAt[rx];
+ if (Clearance > Here - There)
+ continue;
+ }
+ TII->breakPartialRegDependency(*MI, OpIdx, TRI);
+ BrokeDependenceAt[rx] = Here;
}
}
+ delete[] BrokeDependenceAt;
+
+ UndefReads.clear();
}

// A hard instruction only works in one domain. All input registers will be
@@ -793,9 +856,7 @@ void ExeDepsFix::processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass,
enterBasicBlock(MBB);
for (MachineInstr &MI : *MBB) {
if (!MI.isDebugValue()) {
- bool Kill = false;
- if (PrimaryPass)
- Kill = visitInstr(&MI);
+ bool Kill = visitInstr(&MI, PrimaryPass);
processDefs(&MI, isBlockDone(MBB), Kill);
}
}
diff --git a/test/CodeGen/X86/break-false-dep.ll b/test/CodeGen/X86/break-false-dep.ll
index 0ba1825..859de63 100644
--- a/test/CodeGen/X86/break-false-dep.ll
+++ b/test/CodeGen/X86/break-false-dep.ll
@@ -310,6 +310,7 @@ inner_loop:
loop_end:
%nexti = add i64 %phi_i, 1
%nextj = add i64 %phi_j, 1
+;AVX-LABEL:@loopclearance2
; Register use, plus us clobbering 7-15 above, basically forces xmm7 here as
; the only reasonable choice. The primary thing we care about is that it's
; not one of the registers used in the loop (e.g. not the output reg here)
@@ -334,3 +335,35 @@ loop_end:
loopdone:
ret void
}
+
+; Make sure that calls kill register clearance and that a we don't insert
+; an extra dependency-breaking instruction if one suffices.
+declare double @sin(double %x)
+define void @callclearance(double *%x, i64 *%y, i64 *%z) {
+entry:
+ br label %loop
+
+loop:
+ %idx = phi i32 [0, %entry], [%idx, %loop]
+ %valptr = getelementptr i64, i64* %y, i32 %idx
+ %valptr2 = getelementptr i64, i64* %z, i32 %idx
+ %outptr = getelementptr double, double* %x, i32 %idx
+;AVX-LABEL:@callclearance
+;AVX: vxorps [[THEXMM:%xmm[0-9]+]], [[THEXMM]], [[THEXMM]]
+;AVX: vcvtsi2sdq {{.*}}, [[THEXMM]], {{%xmm[0-9]+}}
+;AVX-NOT: vxorps
+;AVX: vcvtsi2sdq {{.*}}, [[THEXMM]], {{%xmm[0-9]+}}
+ %val = load i64, i64 *%valptr
+ %val_f = sitofp i64 %val to double
+ %val2 = load i64, i64 *%valptr2
+ %val2_f = sitofp i64 %val2 to double
+ %sined = call double @sin(double %val_f)
+ %sined2 = call double @sin(double %val2_f)
+ %sum = fadd double %sined, %sined2
+ store double %sum, double *%x
+ %done = icmp sgt i32 %idx, 10000
+ br i1 %done, label %end, label %loop
+
+end:
+ ret void
+}
diff --git a/test/CodeGen/X86/half.ll b/test/CodeGen/X86/half.ll
index 4c8003f..4669d67 100644
--- a/test/CodeGen/X86/half.ll
+++ b/test/CodeGen/X86/half.ll
@@ -287,6 +287,7 @@ define half @test_f80trunc_nodagcombine() #0 {
; CHECK-LIBCALL-NEXT: movzwl (%rsi), %edi
; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
; CHECK-LIBCALL-NEXT: movss %xmm0, 12(%rsp)
+; CHECK-LIBCALL-NEXT: xorps %xmm0, %xmm0
; CHECK-LIBCALL-NEXT: cvtsi2ssl %ebx, %xmm0
; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
; CHECK-LIBCALL-NEXT: movzwl %ax, %edi
--
2.9.3

0 comments on commit d006cc4

Please sign in to comment.