Skip to content

Commit

Permalink
added NandBootstrap, using memcpy, 8 nands
Browse files Browse the repository at this point in the history
  • Loading branch information
Wei Dai committed May 22, 2018
1 parent 10a5390 commit 7c30949
Show file tree
Hide file tree
Showing 4 changed files with 94 additions and 24 deletions.
7 changes: 7 additions & 0 deletions cufhe/include/bootstrap_gpu.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,11 @@ void Bootstrap(LWESample* out,
Torus mu,
cudaStream_t st = 0);

// Fused bootstrapped NAND gate: out = NAND(in0, in1), computed as the gate
// bootstrap of the linear combination (fix - in0 - in1) in a single kernel
// launch (test-vector rotation, blind rotation, and key switch).
//   mu  - test-vector amplitude (typically ModSwitchToTorus(1, 8)).
//   fix - gate offset added before bootstrapping (typically the same value).
//   st  - CUDA stream to launch on; defaults to the legacy default stream,
//         matching the Bootstrap declaration above.
void NandBootstrap(LWESample* out,
                   LWESample* in0,
                   LWESample* in1,
                   Torus mu,
                   Torus fix,
                   cudaStream_t st = 0);

} // namespace cufhe
60 changes: 60 additions & 0 deletions cufhe/lib/bootstrap_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,53 @@ void __Bootstrap__(Torus* out, Torus* in, Torus mu,
KeySwitch<lwe_n, tlwe_n, ks_bits, ks_size>(out, tlwe, ksk);
}

// Fused gate-bootstrapping kernel for a homomorphic NAND.
// Bootstraps the linear combination (fix - in0 - in1): test-vector rotation,
// 500 blind-rotation (Accumulate) steps, and the final key switch all run in
// one launch. Launched by the NandBootstrap host wrapper with a single block
// (the whole kernel is one cooperating thread block sharing 'sh').
//
// Shared-memory layout (6 * 1024 FFP words, statically allocated):
//   sh[0..4095]                    : 4 NTT working buffers (sh_acc_ntt)
//   sh[0..1023] and sh[4096..5119] : 2 result buffers (sh_res_ntt);
//                                    sh_res_ntt[0] aliases sh_acc_ntt[0]
//   sh[5120.. ]                    : accumulator TLWE sample, viewed as Torus
__global__
void __NandBootstrap__(Torus* out, Torus* in0, Torus* in1, Torus mu, Torus fix,
                       FFP* bk,
                       Torus* ksk,
                       CuNTTHandler<> ntt) {
  // Assert(bk.k() == 1);
  // Assert(bk.l() == 2);
  // Assert(bk.n() == 1024);
  __shared__ FFP sh[6 * 1024];
  FFP* sh_acc_ntt[4] = { sh, sh + 1024, sh + 2048, sh + 3072 };
  FFP* sh_res_ntt[2] = { sh, sh + 4096 };
  Torus* tlwe = (Torus*)&sh[5120];

  // test vector
  // acc.a = 0; acc.b = vec(mu) * x ^ (in.b()/2048)
  // NOTE(review): index 500 is presumably the LWE 'b' component
  // (mask length lwe_n == 500) -- confirm against the LWESample layout.
  //register int32_t bar = 2048 - ModSwitch2048(in[500]);
  register int32_t bar = 2048 - ModSwitch2048(fix - in0[500] - in1[500]);
  register uint32_t tid = ThisThreadRankInBlock();
  register uint32_t bdim = ThisBlockSize();
  register uint32_t cmp, neg, pos;
  // Initialize the accumulator to X^bar * (0, vec(mu)). The cmp/neg/pos bit
  // tricks select +mu or -mu branchlessly: coefficients rotated past the
  // degree-1024 boundary pick up a sign flip (bar >> 10 is the wrap parity).
  #pragma unroll
  for (int i = tid; i < 1024; i += bdim) {
    tlwe[i] = 0; // part a
    if (bar == 2048)
      tlwe[i + 1024] = mu;
    else {
      cmp = (uint32_t)(i < (bar & 1023));
      neg = -(cmp ^ (bar >> 10));
      pos = -((1 - cmp) ^ (bar >> 10));
      tlwe[i + 1024] = (mu & pos) + ((-mu) & neg); // part b
    }
  }
  // All 2048 accumulator words must be written before any thread starts the
  // blind rotation, which reads the whole accumulator.
  __syncthreads();
  // accumulate
  // Blind rotation over the 500 mask coefficients of (0 - in0 - in1); each
  // step consumes one 8192-word (i << 13) slice of the bootstrapping key.
  // NOTE(review): #pragma unroll on a 500-trip loop requests full unrolling;
  // verify the compiler handles this sensibly (code size / compile time).
  #pragma unroll
  for (int i = 0; i < 500; i ++) { // 500 iterations
    bar = ModSwitch2048(0 - in0[i] - in1[i]);
    Accumulate(tlwe, sh_acc_ntt, sh_res_ntt, bar, bk + (i << 13), ntt);
  }

  // Key switch the accumulator (TLWE, n = 1024) back down to an LWE sample
  // under the original key (n = 500), writing the result to 'out'.
  static const uint32_t lwe_n = 500;
  static const uint32_t tlwe_n = 1024;
  static const uint32_t ks_bits = 2;
  static const uint32_t ks_size = 8;
  KeySwitch<lwe_n, tlwe_n, ks_bits, ks_size>(out, tlwe, ksk);
}

void Bootstrap(LWESample* out,
LWESample* in,
Torus mu,
Expand All @@ -294,4 +341,17 @@ void Bootstrap(LWESample* out,
CuCheckError();
}

// Host-side launcher for the fused NAND bootstrap kernel.
// Runs __NandBootstrap__ on stream 'st' as a single 512-thread block and
// checks for launch errors. All pointers must reference device memory.
void NandBootstrap(LWESample* out,
                   LWESample* in0,
                   LWESample* in1,
                   Torus mu,
                   Torus fix,
                   cudaStream_t st) {
  // One block of 512 threads; no dynamic shared memory (the kernel's
  // scratch space is statically allocated).
  __NandBootstrap__<<<1, 512, 0, st>>>(
      out->data(), in0->data(), in1->data(), mu, fix,
      bk_ntt->data(), ksk_dev->data(), *ntt_handler);
  CuCheckError();
}

} // namespace cufhe
45 changes: 24 additions & 21 deletions cufhe/lib/cufhe_gates_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -36,37 +36,40 @@ void CleanUp() {
DeleteKeySwitchingKey();
}

//void Initialize(PubKey pub_key);
//void And (Ctxt& out, const Ctxt& in0, const Ctxt& in1, const PubKey& pub_key);
//void Or (Ctxt& out, const Ctxt& in0, const Ctxt& in1, const PubKey& pub_key);
//void Xor (Ctxt& out, const Ctxt& in0, const Ctxt& in1, const PubKey& pub_key);
// Asynchronously uploads a ciphertext's host-side LWE sample to its
// device-side buffer on stream 'st'. Returns before the copy completes.
inline void CtxtCopyH2D(const Ctxt& c, Stream st) {
  void* dst = c.lwe_sample_device_->data();
  const void* src = c.lwe_sample_->data();
  size_t bytes = c.lwe_sample_->SizeData();
  cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, st.st());
}

// Asynchronously downloads a ciphertext's device-side LWE sample back to its
// host-side buffer on stream 'st'. Returns before the copy completes.
inline void CtxtCopyD2H(const Ctxt& c, Stream st) {
  void* dst = c.lwe_sample_->data();
  const void* src = c.lwe_sample_device_->data();
  size_t bytes = c.lwe_sample_->SizeData();
  cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, st.st());
}

// Homomorphic NAND: out = NAND(in0, in1), evaluated on the GPU.
// Uploads both operands, runs the fused NAND bootstrap on stream 'st', and
// starts the download of the result. All copies and the kernel are
// asynchronous on 'st'; the caller must synchronize the stream before
// reading out's host-side data.
// (Dead commented-out host-side pre-add / event-sync experiments removed;
// the fused kernel performs the linear combination on-device.)
void Nand(Ctxt& out,
          const Ctxt& in0,
          const Ctxt& in1,
          Stream st) {
  // Gate constants: test-vector amplitude and NAND offset, both 1/8 of the
  // torus (fix - in0 - in1 encodes the NAND truth table).
  static const Torus mu = ModSwitchToTorus(1, 8);
  static const Torus fix = ModSwitchToTorus(1, 8);
  CtxtCopyH2D(in0, st);
  CtxtCopyH2D(in1, st);
  NandBootstrap(out.lwe_sample_device_, in0.lwe_sample_device_,
                in1.lwe_sample_device_, mu, fix, st.st());
  CtxtCopyD2H(out, st);
}

void Or(Ctxt& out,
Expand Down
6 changes: 3 additions & 3 deletions cufhe/test/test_api_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ int main() {
cudaGetDeviceProperties(&prop, 0);
uint32_t kNumSMs = prop.multiProcessorCount;
uint32_t kNumTests = kNumSMs * 8;
uint32_t kNumLevels = 8;
uint32_t kNumLevels = 2;

SetSeed(); // set random seed

Expand Down Expand Up @@ -107,8 +107,8 @@ int main() {
cudaEventRecord(start, 0);

// Here, pass streams to gates for parallel gates.
for (int i = 0; i < kNumTests; i ++)
for (int j = 0; j < kNumLevels; j ++)
for (int j = 0; j < kNumLevels; j ++)
for (int i = 0; i < kNumTests; i ++)
Nand(ct[i], ct[i], ct[i + kNumTests], st[i % kNumSMs]);
Synchronize();

Expand Down

0 comments on commit 7c30949

Please sign in to comment.