Commit: all gates

WeiDaiWD committed Jun 1, 2018
1 parent 34c1642 commit 6031f62
Showing 4 changed files with 271 additions and 83 deletions.
42 changes: 35 additions & 7 deletions cufhe/include/bootstrap_gpu.cuh
@@ -31,17 +31,45 @@ void BootstrappingKeyToNTT(const BootstrappingKey* bk);
void KeySwitchingKeyToDevice(const KeySwitchingKey* ksk);
void DeleteBootstrappingKeyNTT();
void DeleteKeySwitchingKey();

void Bootstrap(LWESample* out,
               LWESample* in,
               Torus mu,
               cudaStream_t st = 0);

void NandBootstrap(LWESample* out,
                   LWESample* in0,
                   LWESample* in1,
                   Torus mu,
                   Torus fix,
                   cudaStream_t st);
void OrBootstrap(LWESample* out,
                 LWESample* in0,
                 LWESample* in1,
                 Torus mu,
                 Torus fix,
                 cudaStream_t st);
void AndBootstrap(LWESample* out,
                  LWESample* in0,
                  LWESample* in1,
                  Torus mu,
                  Torus fix,
                  cudaStream_t st);
void NorBootstrap(LWESample* out,
                  LWESample* in0,
                  LWESample* in1,
                  Torus mu,
                  Torus fix,
                  cudaStream_t st);
void XorBootstrap(LWESample* out,
                  LWESample* in0,
                  LWESample* in1,
                  Torus mu,
                  Torus fix,
                  cudaStream_t st);
void XnorBootstrap(LWESample* out,
                   LWESample* in0,
                   LWESample* in1,
                   Torus mu,
                   Torus fix,
                   cudaStream_t st);

} // namespace cufhe
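
All six gate entry points share the same signature: two input LWE ciphertexts, one output ciphertext, the encoding constant mu, a gate-specific offset fix, and a CUDA stream. A minimal host-side sketch of how these declarations fit together follows; it assumes keys have been generated elsewhere, LWESample allocation is not shown in this diff, and the wrapper name NandOnDevice is illustrative, not part of this commit:

// Sketch only: key generation and LWESample allocation are not part of
// this diff; NandOnDevice is a hypothetical helper.
void NandOnDevice(LWESample* out, LWESample* in0, LWESample* in1,
                  const BootstrappingKey* bk, const KeySwitchingKey* ksk,
                  Torus mu, Torus fix) {
  BootstrappingKeyToNTT(bk);     // one-time: move bootstrapping key to GPU in NTT form
  KeySwitchingKeyToDevice(ksk);  // one-time: move key-switching key to GPU
  cudaStream_t st;
  cudaStreamCreate(&st);
  NandBootstrap(out, in0, in1, mu, fix, st);  // homomorphic NAND of in0, in1
  cudaStreamSynchronize(st);
  cudaStreamDestroy(st);
  DeleteBootstrappingKeyNTT();   // release device copies when done
  DeleteKeySwitchingKey();
}
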
233 changes: 198 additions & 35 deletions cufhe/lib/bootstrap_gpu.cu
@@ -286,20 +286,106 @@ void __Bootstrap__(Torus* out, Torus* in, Torus mu,

__global__
void __NandBootstrap__(Torus* out, Torus* in0, Torus* in1, Torus mu, Torus fix,
                       FFP* bk, Torus* ksk, CuNTTHandler<> ntt) {
  __shared__ FFP sh[6 * 1024];
  // FFP* sh_acc_ntt[4] = { sh, sh + 1024, sh + 2048, sh + 3072 };
  // FFP* sh_res_ntt[2] = { sh, sh + 4096 };
  Torus* tlwe = (Torus*)&sh[5120];
  // test vector: acc.a = 0; acc.b = vec(mu) * x ^ (in.b()/2048)
  register int32_t bar = 2048 - ModSwitch2048(fix - in0[500] - in1[500]);
  register uint32_t tid = ThisThreadRankInBlock();
  register uint32_t bdim = ThisBlockSize();
  register uint32_t cmp, neg, pos;
#pragma unroll
  for (int i = tid; i < 1024; i += bdim) {
    tlwe[i] = 0; // part a
    if (bar == 2048)
      tlwe[i + 1024] = mu;
    else {
      cmp = (uint32_t)(i < (bar & 1023));
      neg = -(cmp ^ (bar >> 10));
      pos = -((1 - cmp) ^ (bar >> 10));
      tlwe[i + 1024] = (mu & pos) + ((-mu) & neg); // part b
    }
  }
  __syncthreads();
  // accumulate
#pragma unroll
  for (int i = 0; i < 500; i++) { // 500 iterations
    bar = ModSwitch2048(0 - in0[i] - in1[i]);
    Accumulate(tlwe, sh, sh, bar, bk + (i << 13), ntt);
  }
  KeySwitch<500, 1024, 2, 8>(out, tlwe, ksk);
}
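
Each gate kernel follows the same three-phase pattern: (1) initialize the accumulator tlwe in shared memory with the test vector of mu's rotated by bar, where bar is derived from the b term (index 500) of the gate's linear combination of the two input ciphertexts; (2) run 500 Accumulate iterations, one per LWE coefficient, each consuming an 8192-element slice of the NTT-form bootstrapping key (hence bk + (i << 13)); (3) key-switch the result back to a 500-dimension LWE sample. The 2048 Torus accumulator coefficients live in the tail of the 6x1024-FFP shared buffer starting at slot 5120, which fits if FFP is 64-bit and Torus is 32-bit. The branch-free rotation in the init loop can be read as below; this host-side restatement is for clarity only and is not part of the commit:

// Reference restatement of the negacyclic test-vector rotation (N = 1024,
// rotations indexed mod 2N = 2048). Coefficient i of X^bar * v, where v has
// all N coefficients equal to mu, is +mu or -mu depending on which half of
// the 2N-cycle the rotated position falls in.
static inline int32_t RotatedTestVectorCoeff(int32_t bar, int32_t i, int32_t mu) {
  if (bar == 2048) return mu;                   // bar == 2N acts as identity
  uint32_t cmp = (uint32_t)(i < (bar & 1023));  // is i below bar mod N?
  uint32_t neg = -(cmp ^ (uint32_t)(bar >> 10));        // all-ones mask selects -mu
  uint32_t pos = -((1u - cmp) ^ (uint32_t)(bar >> 10)); // all-ones mask selects +mu
  return (int32_t)(((uint32_t)mu & pos) + ((uint32_t)(-mu) & neg));
}
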

__global__
void __OrBootstrap__(Torus* out, Torus* in0, Torus* in1, Torus mu, Torus fix,
                     FFP* bk, Torus* ksk, CuNTTHandler<> ntt) {
  __shared__ FFP sh[6 * 1024];
  Torus* tlwe = (Torus*)&sh[5120];
  // test vector: acc.a = 0; acc.b = vec(mu) * x ^ (in.b()/2048)
  register int32_t bar = 2048 - ModSwitch2048(fix + in0[500] + in1[500]);
  register uint32_t tid = ThisThreadRankInBlock();
  register uint32_t bdim = ThisBlockSize();
  register uint32_t cmp, neg, pos;
#pragma unroll
  for (int i = tid; i < 1024; i += bdim) {
    tlwe[i] = 0; // part a
    if (bar == 2048)
      tlwe[i + 1024] = mu;
    else {
      cmp = (uint32_t)(i < (bar & 1023));
      neg = -(cmp ^ (bar >> 10));
      pos = -((1 - cmp) ^ (bar >> 10));
      tlwe[i + 1024] = (mu & pos) + ((-mu) & neg); // part b
    }
  }
  __syncthreads();
  // accumulate
#pragma unroll
  for (int i = 0; i < 500; i++) { // 500 iterations
    bar = ModSwitch2048(0 + in0[i] + in1[i]);
    Accumulate(tlwe, sh, sh, bar, bk + (i << 13), ntt);
  }
  KeySwitch<500, 1024, 2, 8>(out, tlwe, ksk);
}

__global__
void __AndBootstrap__(Torus* out, Torus* in0, Torus* in1, Torus mu, Torus fix,
                      FFP* bk, Torus* ksk, CuNTTHandler<> ntt) {
  __shared__ FFP sh[6 * 1024];
  Torus* tlwe = (Torus*)&sh[5120];
  // test vector: acc.a = 0; acc.b = vec(mu) * x ^ (in.b()/2048)
  register int32_t bar = 2048 - ModSwitch2048(fix + in0[500] + in1[500]);
  register uint32_t tid = ThisThreadRankInBlock();
  register uint32_t bdim = ThisBlockSize();
  register uint32_t cmp, neg, pos;
#pragma unroll
  for (int i = tid; i < 1024; i += bdim) {
    tlwe[i] = 0; // part a
    if (bar == 2048)
      tlwe[i + 1024] = mu;
    else {
      cmp = (uint32_t)(i < (bar & 1023));
      neg = -(cmp ^ (bar >> 10));
      pos = -((1 - cmp) ^ (bar >> 10));
      tlwe[i + 1024] = (mu & pos) + ((-mu) & neg); // part b
    }
  }
  __syncthreads();
  // accumulate
#pragma unroll
  for (int i = 0; i < 500; i++) { // 500 iterations
    bar = ModSwitch2048(0 + in0[i] + in1[i]);
    Accumulate(tlwe, sh, sh, bar, bk + (i << 13), ntt);
  }
  KeySwitch<500, 1024, 2, 8>(out, tlwe, ksk);
}

__global__
void __NorBootstrap__(Torus* out, Torus* in0, Torus* in1, Torus mu, Torus fix,
                      FFP* bk, Torus* ksk, CuNTTHandler<> ntt) {
  __shared__ FFP sh[6 * 1024];
  Torus* tlwe = (Torus*)&sh[5120];
  // test vector: acc.a = 0; acc.b = vec(mu) * x ^ (in.b()/2048)
  register int32_t bar = 2048 - ModSwitch2048(fix - in0[500] - in1[500]);
  register uint32_t tid = ThisThreadRankInBlock();
  register uint32_t bdim = ThisBlockSize();
  register uint32_t cmp, neg, pos;
#pragma unroll
  for (int i = tid; i < 1024; i += bdim) {
    tlwe[i] = 0; // part a
    if (bar == 2048)
      tlwe[i + 1024] = mu;
    else {
      cmp = (uint32_t)(i < (bar & 1023));
      neg = -(cmp ^ (bar >> 10));
      pos = -((1 - cmp) ^ (bar >> 10));
      tlwe[i + 1024] = (mu & pos) + ((-mu) & neg); // part b
    }
  }
  __syncthreads();
  // accumulate
#pragma unroll
  for (int i = 0; i < 500; i++) { // 500 iterations
    bar = ModSwitch2048(0 - in0[i] - in1[i]);
    Accumulate(tlwe, sh, sh, bar, bk + (i << 13), ntt);
  }
  KeySwitch<500, 1024, 2, 8>(out, tlwe, ksk);
}

__global__
void __XorBootstrap__(Torus* out, Torus* in0, Torus* in1, Torus mu, Torus fix,
                      FFP* bk, Torus* ksk, CuNTTHandler<> ntt) {
  __shared__ FFP sh[6 * 1024];
  Torus* tlwe = (Torus*)&sh[5120];
  // test vector: acc.a = 0; acc.b = vec(mu) * x ^ (in.b()/2048)
  register int32_t bar = 2048 - ModSwitch2048(fix + 2*in0[500] + 2*in1[500]);
  register uint32_t tid = ThisThreadRankInBlock();
  register uint32_t bdim = ThisBlockSize();
  register uint32_t cmp, neg, pos;
#pragma unroll
  for (int i = tid; i < 1024; i += bdim) {
    tlwe[i] = 0; // part a
    if (bar == 2048)
      tlwe[i + 1024] = mu;
    else {
      cmp = (uint32_t)(i < (bar & 1023));
      neg = -(cmp ^ (bar >> 10));
      pos = -((1 - cmp) ^ (bar >> 10));
      tlwe[i + 1024] = (mu & pos) + ((-mu) & neg); // part b
    }
  }
  __syncthreads();
  // accumulate
#pragma unroll
  for (int i = 0; i < 500; i++) { // 500 iterations
    bar = ModSwitch2048(0 + 2*in0[i] + 2*in1[i]);
    Accumulate(tlwe, sh, sh, bar, bk + (i << 13), ntt);
  }
  KeySwitch<500, 1024, 2, 8>(out, tlwe, ksk);
}

__global__
void __XnorBootstrap__(Torus* out, Torus* in0, Torus* in1, Torus mu, Torus fix,
                       FFP* bk, Torus* ksk, CuNTTHandler<> ntt) {
  __shared__ FFP sh[6 * 1024];
  Torus* tlwe = (Torus*)&sh[5120];
  // test vector: acc.a = 0; acc.b = vec(mu) * x ^ (in.b()/2048)
  register int32_t bar = 2048 - ModSwitch2048(fix - 2*in0[500] - 2*in1[500]);
  register uint32_t tid = ThisThreadRankInBlock();
  register uint32_t bdim = ThisBlockSize();
  register uint32_t cmp, neg, pos;
#pragma unroll
  for (int i = tid; i < 1024; i += bdim) {
    tlwe[i] = 0; // part a
    if (bar == 2048)
      tlwe[i + 1024] = mu;
    else {
      cmp = (uint32_t)(i < (bar & 1023));
      neg = -(cmp ^ (bar >> 10));
      pos = -((1 - cmp) ^ (bar >> 10));
      tlwe[i + 1024] = (mu & pos) + ((-mu) & neg); // part b
    }
  }
  __syncthreads();
  // accumulate
#pragma unroll
  for (int i = 0; i < 500; i++) { // 500 iterations
    bar = ModSwitch2048(0 - 2*in0[i] - 2*in1[i]);
    Accumulate(tlwe, sh, sh, bar, bk + (i << 13), ntt);
  }
  KeySwitch<500, 1024, 2, 8>(out, tlwe, ksk);
}
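
The six kernels are identical except for the linear combination of inputs fed into ModSwitch2048, both in the test-vector phase (index 500, the b term) and in the 500 accumulation steps:

  Gate  | combination before mod-switch
  ------+------------------------------
  NAND  | fix - in0 - in1
  OR    | fix + in0 + in1
  AND   | fix + in0 + in1
  NOR   | fix - in0 - in1
  XOR   | fix + 2*in0 + 2*in1
  XNOR  | fix - 2*in0 - 2*in1

AND shares OR's combination and NOR shares NAND's; those pairs are distinguished only by the mu and fix values supplied by the caller, which this diff does not show. The doubled inputs for XOR/XNOR are consistent with the standard TFHE gate-bootstrapping recipe, where XOR is bootstrapped from 2*(c0 + c1) plus a constant offset.
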

void Bootstrap(LWESample* out,
               LWESample* in,
               Torus mu,
               cudaStream_t st) {
  __Bootstrap__<<<1, 512, 0, st>>>(out->data(), in->data(), mu,
      bk_ntt->data(), ksk_dev->data(), *ntt_handler);
  CuCheckError();
}

void NandBootstrap(LWESample* out, LWESample* in0, LWESample* in1,
                   Torus mu, Torus fix, cudaStream_t st) {
  __NandBootstrap__<<<1, 512, 0, st>>>(out->data(), in0->data(),
      in1->data(), mu, fix, bk_ntt->data(), ksk_dev->data(), *ntt_handler);
  CuCheckError();
}

void OrBootstrap(LWESample* out, LWESample* in0, LWESample* in1,
                 Torus mu, Torus fix, cudaStream_t st) {
  __OrBootstrap__<<<1, 512, 0, st>>>(out->data(), in0->data(),
      in1->data(), mu, fix, bk_ntt->data(), ksk_dev->data(), *ntt_handler);
  CuCheckError();
}

void AndBootstrap(LWESample* out, LWESample* in0, LWESample* in1,
                  Torus mu, Torus fix, cudaStream_t st) {
  __AndBootstrap__<<<1, 512, 0, st>>>(out->data(), in0->data(),
      in1->data(), mu, fix, bk_ntt->data(), ksk_dev->data(), *ntt_handler);
  CuCheckError();
}

void NorBootstrap(LWESample* out, LWESample* in0, LWESample* in1,
                  Torus mu, Torus fix, cudaStream_t st) {
  __NorBootstrap__<<<1, 512, 0, st>>>(out->data(), in0->data(),
      in1->data(), mu, fix, bk_ntt->data(), ksk_dev->data(), *ntt_handler);
  CuCheckError();
}

void XorBootstrap(LWESample* out, LWESample* in0, LWESample* in1,
                  Torus mu, Torus fix, cudaStream_t st) {
  __XorBootstrap__<<<1, 512, 0, st>>>(out->data(), in0->data(),
      in1->data(), mu, fix, bk_ntt->data(), ksk_dev->data(), *ntt_handler);
  CuCheckError();
}

void XnorBootstrap(LWESample* out, LWESample* in0, LWESample* in1,
                   Torus mu, Torus fix, cudaStream_t st) {
  __XnorBootstrap__<<<1, 512, 0, st>>>(out->data(), in0->data(),
      in1->data(), mu, fix, bk_ntt->data(), ksk_dev->data(), *ntt_handler);
  CuCheckError();
}
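
Every wrapper launches its kernel as a single 512-thread block (<<<1, 512, 0, st>>>), so one gate occupies only a fraction of the GPU; independent gates can therefore be overlapped by issuing them on different streams. A sketch under that assumption, with placeholder sample names and gate constants prepared by the caller:

// Sketch: two independent gates issued on separate streams; sample
// pointers and the mu/fix constants are placeholders, not part of this diff.
void TwoGatesConcurrently(LWESample* nand_out, LWESample* xor_out,
                          LWESample* a, LWESample* b,
                          LWESample* c, LWESample* d,
                          Torus mu, Torus fix_nand, Torus fix_xor) {
  cudaStream_t s0, s1;
  cudaStreamCreate(&s0);
  cudaStreamCreate(&s1);
  NandBootstrap(nand_out, a, b, mu, fix_nand, s0);  // async launch on s0
  XorBootstrap(xor_out, c, d, mu, fix_xor, s1);     // may overlap with s0
  cudaStreamSynchronize(s0);
  cudaStreamSynchronize(s1);
  cudaStreamDestroy(s0);
  cudaStreamDestroy(s1);
}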