Skip to content

Commit

Permalink
added NandBootstrap, using memcpy, 8 nands
Browse files Browse the repository at this point in the history
  • Loading branch information
Wei Dai committed May 22, 2018
1 parent 10a5390 commit 7c30949
Show file tree
Hide file tree
Showing 4 changed files with 94 additions and 24 deletions.
7 changes: 7 additions & 0 deletions cufhe/include/bootstrap_gpu.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,11 @@ void Bootstrap(LWESample* out,
Torus mu,
cudaStream_t st = 0);

// Fused bootstrapped NAND gate: out = NAND(in0, in1), computed as the gate
// bootstrap of the linear combination (fix - in0 - in1) in a single kernel
// launch (test-vector rotation, blind rotation, and key switch).
//   mu  - test-vector amplitude (typically ModSwitchToTorus(1, 8)).
//   fix - gate offset added before bootstrapping (typically the same value).
//   st  - CUDA stream to launch on; defaults to the legacy default stream,
//         matching the Bootstrap declaration above.
void NandBootstrap(LWESample* out,
                   LWESample* in0,
                   LWESample* in1,
                   Torus mu,
                   Torus fix,
                   cudaStream_t st = 0);

} // namespace cufhe
60 changes: 60 additions & 0 deletions cufhe/lib/bootstrap_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,53 @@ void __Bootstrap__(Torus* out, Torus* in, Torus mu,
KeySwitch<lwe_n, tlwe_n, ks_bits, ks_size>(out, tlwe, ksk);
}

// Fused gate-bootstrapping kernel for a homomorphic NAND.
// Bootstraps the linear combination (fix - in0 - in1): test-vector rotation,
// 500 blind-rotation (Accumulate) steps, and the final key switch all run in
// one launch. Launched by the NandBootstrap host wrapper with a single block
// (the whole kernel is one cooperating thread block sharing 'sh').
//
// Shared-memory layout (6 * 1024 FFP words, statically allocated):
//   sh[0..4095]                    : 4 NTT working buffers (sh_acc_ntt)
//   sh[0..1023] and sh[4096..5119] : 2 result buffers (sh_res_ntt);
//                                    sh_res_ntt[0] aliases sh_acc_ntt[0]
//   sh[5120.. ]                    : accumulator TLWE sample, viewed as Torus
__global__
void __NandBootstrap__(Torus* out, Torus* in0, Torus* in1, Torus mu, Torus fix,
                       FFP* bk,
                       Torus* ksk,
                       CuNTTHandler<> ntt) {
  // Assert(bk.k() == 1);
  // Assert(bk.l() == 2);
  // Assert(bk.n() == 1024);
  __shared__ FFP sh[6 * 1024];
  FFP* sh_acc_ntt[4] = { sh, sh + 1024, sh + 2048, sh + 3072 };
  FFP* sh_res_ntt[2] = { sh, sh + 4096 };
  Torus* tlwe = (Torus*)&sh[5120];

  // test vector
  // acc.a = 0; acc.b = vec(mu) * x ^ (in.b()/2048)
  // NOTE(review): index 500 is presumably the LWE 'b' component
  // (mask length lwe_n == 500) -- confirm against the LWESample layout.
  //register int32_t bar = 2048 - ModSwitch2048(in[500]);
  register int32_t bar = 2048 - ModSwitch2048(fix - in0[500] - in1[500]);
  register uint32_t tid = ThisThreadRankInBlock();
  register uint32_t bdim = ThisBlockSize();
  register uint32_t cmp, neg, pos;
  // Initialize the accumulator to X^bar * (0, vec(mu)). The cmp/neg/pos bit
  // tricks select +mu or -mu branchlessly: coefficients rotated past the
  // degree-1024 boundary pick up a sign flip (bar >> 10 is the wrap parity).
  #pragma unroll
  for (int i = tid; i < 1024; i += bdim) {
    tlwe[i] = 0; // part a
    if (bar == 2048)
      tlwe[i + 1024] = mu;
    else {
      cmp = (uint32_t)(i < (bar & 1023));
      neg = -(cmp ^ (bar >> 10));
      pos = -((1 - cmp) ^ (bar >> 10));
      tlwe[i + 1024] = (mu & pos) + ((-mu) & neg); // part b
    }
  }
  // All 2048 accumulator words must be written before any thread starts the
  // blind rotation, which reads the whole accumulator.
  __syncthreads();
  // accumulate
  // Blind rotation over the 500 mask coefficients of (0 - in0 - in1); each
  // step consumes one 8192-word (i << 13) slice of the bootstrapping key.
  // NOTE(review): #pragma unroll on a 500-trip loop requests full unrolling;
  // verify the compiler handles this sensibly (code size / compile time).
  #pragma unroll
  for (int i = 0; i < 500; i ++) { // 500 iterations
    bar = ModSwitch2048(0 - in0[i] - in1[i]);
    Accumulate(tlwe, sh_acc_ntt, sh_res_ntt, bar, bk + (i << 13), ntt);
  }

  // Key switch the accumulator (TLWE, n = 1024) back down to an LWE sample
  // under the original key (n = 500), writing the result to 'out'.
  static const uint32_t lwe_n = 500;
  static const uint32_t tlwe_n = 1024;
  static const uint32_t ks_bits = 2;
  static const uint32_t ks_size = 8;
  KeySwitch<lwe_n, tlwe_n, ks_bits, ks_size>(out, tlwe, ksk);
}

void Bootstrap(LWESample* out,
LWESample* in,
Torus mu,
Expand All @@ -294,4 +341,17 @@ void Bootstrap(LWESample* out,
CuCheckError();
}

// Host-side launcher for the fused NAND bootstrap kernel.
// Runs __NandBootstrap__ on stream 'st' as a single 512-thread block and
// checks for launch errors. All pointers must reference device memory.
void NandBootstrap(LWESample* out,
                   LWESample* in0,
                   LWESample* in1,
                   Torus mu,
                   Torus fix,
                   cudaStream_t st) {
  // One block of 512 threads; no dynamic shared memory (the kernel's
  // scratch space is statically allocated).
  __NandBootstrap__<<<1, 512, 0, st>>>(
      out->data(), in0->data(), in1->data(), mu, fix,
      bk_ntt->data(), ksk_dev->data(), *ntt_handler);
  CuCheckError();
}

} // namespace cufhe
45 changes: 24 additions & 21 deletions cufhe/lib/cufhe_gates_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -36,37 +36,40 @@ void CleanUp() {
DeleteKeySwitchingKey();
}

//void Initialize(PubKey pub_key);
//void And (Ctxt& out, const Ctxt& in0, const Ctxt& in1, const PubKey& pub_key);
//void Or (Ctxt& out, const Ctxt& in0, const Ctxt& in1, const PubKey& pub_key);
//void Xor (Ctxt& out, const Ctxt& in0, const Ctxt& in1, const PubKey& pub_key);
// Asynchronously uploads a ciphertext's host-side LWE sample to its
// device-side buffer on stream 'st'. Returns before the copy completes.
inline void CtxtCopyH2D(const Ctxt& c, Stream st) {
  void* dst = c.lwe_sample_device_->data();
  const void* src = c.lwe_sample_->data();
  size_t bytes = c.lwe_sample_->SizeData();
  cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, st.st());
}

// Asynchronously downloads a ciphertext's device-side LWE sample back to its
// host-side buffer on stream 'st'. Returns before the copy completes.
inline void CtxtCopyD2H(const Ctxt& c, Stream st) {
  void* dst = c.lwe_sample_->data();
  const void* src = c.lwe_sample_device_->data();
  size_t bytes = c.lwe_sample_->SizeData();
  cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToHost, st.st());
}

// Homomorphic NAND: out = NAND(in0, in1), evaluated on the GPU.
// Uploads both operands, runs the fused NAND bootstrap on stream 'st', and
// starts the download of the result. All copies and the kernel are
// asynchronous on 'st'; the caller must synchronize the stream before
// reading out's host-side data.
// (Dead commented-out host-side pre-add / event-sync experiments removed;
// the fused kernel performs the linear combination on-device.)
void Nand(Ctxt& out,
          const Ctxt& in0,
          const Ctxt& in1,
          Stream st) {
  // Gate constants: test-vector amplitude and NAND offset, both 1/8 of the
  // torus (fix - in0 - in1 encodes the NAND truth table).
  static const Torus mu = ModSwitchToTorus(1, 8);
  static const Torus fix = ModSwitchToTorus(1, 8);
  CtxtCopyH2D(in0, st);
  CtxtCopyH2D(in1, st);
  NandBootstrap(out.lwe_sample_device_, in0.lwe_sample_device_,
                in1.lwe_sample_device_, mu, fix, st.st());
  CtxtCopyD2H(out, st);
}

void Or(Ctxt& out,
Expand Down
6 changes: 3 additions & 3 deletions cufhe/test/test_api_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ int main() {
cudaGetDeviceProperties(&prop, 0);
uint32_t kNumSMs = prop.multiProcessorCount;
uint32_t kNumTests = kNumSMs * 8;
uint32_t kNumLevels = 8;
uint32_t kNumLevels = 2;

SetSeed(); // set random seed

Expand Down Expand Up @@ -107,8 +107,8 @@ int main() {
cudaEventRecord(start, 0);

// Here, pass streams to gates for parallel gates.
for (int i = 0; i < kNumTests; i ++)
for (int j = 0; j < kNumLevels; j ++)
for (int j = 0; j < kNumLevels; j ++)
for (int i = 0; i < kNumTests; i ++)
Nand(ct[i], ct[i], ct[i + kNumTests], st[i % kNumSMs]);
Synchronize();

Expand Down

0 comments on commit 7c30949

Please sign in to comment.