Merge pull request #1 from vernamlab/master

Sync with cuFHE
vernamlab · Sep 19, 2018 · 4178b74 · 4178b74
2 parents 6cb1976 + f75f734
commit 4178b74
Show file tree

Hide file tree

Showing 20 changed files with 785 additions and 115 deletions.
diff --git a/README.md b/README.md
@@ -8,7 +8,7 @@ The cuFHE library is an open-source library for Fully Homomorphic Encryption (FH
 
 | [TFHE lib](https://github.com/tfhe/tfhe) | cuFHE | Speedup |
 |---|---|---|
-| 13 ms | 0.6 ms | 22 times |
+| 13 ms | **0.5 ms** | 26 times |
 
 ### System Requirements
 **The library has been tested on Ubuntu Desktop 16.04 only.**

diff --git a/cufhe/Makefile b/cufhe/Makefile
@@ -1,9 +1,10 @@
 ## Compilers
 CC = g++
-CU = /usr/local/cuda/bin/nvcc
+CU = nvcc
 ## Flags
 FLAGS = -std=c++11 -O3 -w
-CU_FLAGS = -Wno-deprecated-gpu-targets
+CU_FLAGS = #-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_70,code=compute_70
+##-Wno-deprecated-gpu-targets
 INC = -I./
 ## Python header files
 PYTHON_VERSION = 2.7
@@ -60,20 +61,20 @@ $(DIR_OBJ)/test/test_api_cpu.o: test/test_api_cpu.cc
 
 $(DIR_BIN)/test_api_gpu: $(DIR_OBJ)/test/test_api_gpu.o
 	$(dir_guard)
-	$(CU) $(FLAGS) -o $@ $(DIR_OBJ)/test/test_api_gpu.o -L$(DIR_BIN) -lcufhe_gpu
+	$(CU) $(FLAGS) $(CU_FLAGS) -o $@ $(DIR_OBJ)/test/test_api_gpu.o -L$(DIR_BIN) -lcufhe_gpu
 
 $(DIR_OBJ)/test/test_api_gpu.o: test/test_api_gpu.cu
 	$(dir_guard)
-	$(CU) $(FLAGS) $(INC) -M -o $(@:%.o=%.d) $<
-	$(CU) $(FLAGS) $(INC) -c -o $@ $<
+	$(CU) $(FLAGS) $(CU_FLAGS) $(INC) -M -o $(@:%.o=%.d) $<
+	$(CU) $(FLAGS) $(CU_FLAGS) $(INC) -c -o $@ $<
 
 $(DIR_BIN)/libcufhe_cpu.so: $(CC_OBJ)
 	$(dir_guard)
-	$(CU) $(FLAGS) -shared -o $@ $(CC_OBJ)
+	$(CU) $(FLAGS) $(CU_FLAGS) -shared -o $@ $(CC_OBJ)
 
 $(DIR_BIN)/libcufhe_gpu.so: $(CU_OBJ) $(DIR_OBJ)/cufhe.o $(DIR_OBJ)/cufhe_io.o
 	$(dir_guard)
-	$(CU) $(FLAGS) -shared -o $@ $(CU_OBJ) $(DIR_OBJ)/cufhe.o $(DIR_OBJ)/cufhe_io.o
+	$(CU) $(FLAGS) $(CU_FLAGS) -shared -o $@ $(CU_OBJ) $(DIR_OBJ)/cufhe.o $(DIR_OBJ)/cufhe_io.o
 
 $(CC_OBJ): $(CC_SRC)
 	$(dir_guard)
@@ -84,9 +85,9 @@ $(CC_OBJ): $(CC_SRC)
 
 $(CU_OBJ): $(CU_SRC)
 	$(dir_guard)
-	$(CU) $(FLAGS) $(INC) -M -o $(@:%.o=%.d) \
+	$(CU) $(FLAGS) $(CU_FLAGS) $(INC) -M -o $(@:%.o=%.d) \
 			$(patsubst $(DIR_OBJ)/%,$(DIR_SRC)/%,$(@:%.o=%.cu))
-	$(CU) $(FLAGS) $(INC) -c -o $@ \
+	$(CU) $(FLAGS) $(CU_FLAGS) $(INC) -c -o $@ \
 			$(patsubst $(DIR_OBJ)/%,$(DIR_SRC)/%,$(@:%.o=%.cu)) -Xcompiler '-fPIC'
 
 $(DIR_PY)/fhepy_cpu.so: $(DIR_PY)/fhepy_cpu.o
@@ -96,8 +97,7 @@ $(DIR_PY)/fhepy_cpu.o: $(DIR_PY)/fhepy.cpp
 	$(CC) $(INC) $(PY_INC) -fPIC -c $(DIR_PY)/fhepy.cpp $(FLAGS) -o $@
 
 $(DIR_PY)/fhepy_gpu.so: $(DIR_PY)/fhepy_gpu.o
-	$(CU) -shared $(DIR_PY)/fhepy_gpu.o $(PY_LIB) -L$(DIR_BIN) -lcufhe_gpu -o $@
+	$(CU) $(CU_FLAGS) -shared $(DIR_PY)/fhepy_gpu.o $(PY_LIB) -L$(DIR_BIN) -lcufhe_gpu -o $@
 
 $(DIR_PY)/fhepy_gpu.o: $(DIR_PY)/fhepy.cu
-	$(CU) $(INC) $(PY_INC) -Xcompiler '-fPIC' -c $(DIR_PY)/fhepy.cu $(FLAGS) -o $@
-
+	$(CU) $(CU_FLAGS) $(INC) $(PY_INC) -Xcompiler '-fPIC' -c $(DIR_PY)/fhepy.cu $(FLAGS) -o $@
diff --git a/cufhe/include/bootstrap_gpu.cuh b/cufhe/include/bootstrap_gpu.cuh
@@ -31,10 +31,45 @@ void BootstrappingKeyToNTT(const BootstrappingKey* bk);
 void KeySwitchingKeyToDevice(const KeySwitchingKey* ksk);
 void DeleteBootstrappingKeyNTT();
 void DeleteKeySwitchingKey();
-
 void Bootstrap(LWESample* out,
                LWESample* in,
                Torus mu,
                cudaStream_t st = 0);
+void NandBootstrap(LWESample* out,
+                   LWESample* in0,
+                   LWESample* in1,
+                   Torus mu,
+                   Torus fix,
+                   cudaStream_t st);
+void OrBootstrap(LWESample* out,
+                   LWESample* in0,
+                   LWESample* in1,
+                   Torus mu,
+                   Torus fix,
+                   cudaStream_t st);
+void AndBootstrap(LWESample* out,
+                   LWESample* in0,
+                   LWESample* in1,
+                   Torus mu,
+                   Torus fix,
+                   cudaStream_t st);
+void NorBootstrap(LWESample* out,
+                   LWESample* in0,
+                   LWESample* in1,
+                   Torus mu,
+                   Torus fix,
+                   cudaStream_t st);
+void XorBootstrap(LWESample* out,
+                   LWESample* in0,
+                   LWESample* in1,
+                   Torus mu,
+                   Torus fix,
+                   cudaStream_t st);
+void XnorBootstrap(LWESample* out,
+                  LWESample* in0,
+                  LWESample* in1,
+                  Torus mu,
+                  Torus fix,
+                  cudaStream_t st);
 
 } // namespace cufhe
diff --git a/cufhe/include/cufhe.h b/cufhe/include/cufhe.h
@@ -108,6 +108,8 @@ struct Ctxt {
   ~Ctxt();
   LWESample* lwe_sample_;
   MemoryDeleter lwe_sample_deleter_;
+  LWESample* lwe_sample_device_;
+  MemoryDeleter lwe_sample_device_deleter_;
 };
 
 /** Plaintext is in {0, 1}. */

diff --git a/cufhe/include/cufhe_gpu.cuh b/cufhe/include/cufhe_gpu.cuh
@@ -59,7 +59,7 @@ public:
   inline Stream() {}
   inline Stream(int id) { Assert(id == 0); st_ = 0; }
   inline ~Stream() {}
-  inline void Create() { cudaStreamCreateWithFlags(&this->st_, cudaStreamDefault); }
+  inline void Create() { cudaStreamCreateWithFlags(&this->st_, cudaStreamNonBlocking); }
   inline void Destroy() { cudaStreamDestroy(this->st_); }
   inline cudaStream_t st() { return st_; };
 private:

diff --git a/cufhe/include/ntt_gpu/ntt.cuh b/cufhe/include/ntt_gpu/ntt.cuh
@@ -27,6 +27,7 @@
 #include "ntt_1024_device.cuh"
 #include "ntt_1024_twiddle.cuh"
 #include <include/details/math.h>
+#include <include/details/error_gpu.cuh>
 
 namespace cufhe {
 
@@ -41,6 +42,18 @@ public:
   __host__ __device__ inline
   ~CuNTTHandler() {};
 
+  inline
+  void CreateConstant() {
+    CuSafeCall(cudaMemcpyToSymbol(con_twd, this->twd_,
+        sizeof(FFP) * 1024, 0,  cudaMemcpyDeviceToDevice));
+    CuSafeCall(cudaMemcpyToSymbol(con_twd_inv, this->twd_inv_,
+        sizeof(FFP) * 1024, 0, cudaMemcpyDeviceToDevice));
+    CuSafeCall(cudaMemcpyToSymbol(con_twd_sqrt, this->twd_sqrt_,
+        sizeof(FFP) * 1024, 0,  cudaMemcpyDeviceToDevice));
+    CuSafeCall(cudaMemcpyToSymbol(con_twd_sqrt_inv, this->twd_sqrt_inv_,
+        sizeof(FFP) * 1024, 0, cudaMemcpyDeviceToDevice));
+  }
+
   template <typename T>
   __device__ inline
   void NTT(FFP* out,

diff --git a/cufhe/include/ntt_gpu/ntt_1024_device.cuh b/cufhe/include/ntt_gpu/ntt_1024_device.cuh
@@ -29,6 +29,11 @@
 
 namespace cufhe {
 
+__constant__ FFP con_twd[1024];
+__constant__ FFP con_twd_inv[1024];
+__constant__ FFP con_twd_sqrt[1024];
+__constant__ FFP con_twd_sqrt_inv[1024];
+
 __device__ inline
 void NTT1024Core(FFP* r,
                  FFP* s,
@@ -37,10 +42,10 @@ void NTT1024Core(FFP* r,
                  const uint32_t& t1d,
                  const uint3& t3d) {
   FFP *ptr = nullptr;
-  __syncthreads();
   #pragma unroll
   for (int i = 0; i < 8; i ++)
-    r[i] *= twd_sqrt[(i << 7) | t1d]; // mult twiddle sqrt
+    //r[i] *= twd_sqrt[(i << 7) | t1d]; // mult twiddle sqrt
+    r[i] *= con_twd_sqrt[(i << 7) | t1d]; // mult twiddle sqrt
   NTT8(r);
   NTT8x2Lsh(r, t3d.z); // if (t1d >= 64) NTT8x2<1>(r);
   ptr = &s[(t3d.y << 7) | (t3d.z << 6) | (t3d.x << 2)];
@@ -65,7 +70,8 @@ void NTT1024Core(FFP* r,
   ptr = &s[t1d];
   #pragma unroll
   for (int i = 0; i < 8; i ++)
-    r[i] = ptr[i << 7] * twd[(i << 7) | t1d]; // mult twiddle
+    //r[i] = ptr[i << 7] * twd[(i << 7) | t1d]; // mult twiddle
+    r[i] = ptr[i << 7] * con_twd[(i << 7) | t1d]; // mult twiddle
   NTT8(r);
   #pragma unroll
   for (int i = 0; i < 8; i ++)
@@ -89,7 +95,6 @@ void NTTInv1024Core(FFP* r,
                     const uint3& t3d) {
 
   FFP *ptr = nullptr;
-  __syncthreads();
   NTTInv8(r);
   NTTInv8x2Lsh(r, t3d.z); // if (t1d >= 64) NTT8x2<1>(r);
   ptr = &s[(t3d.y << 7) | (t3d.z << 6) | (t3d.x << 2)];
@@ -114,7 +119,8 @@ void NTTInv1024Core(FFP* r,
   ptr = &s[t1d];
   #pragma unroll
   for (int i = 0; i < 8; i ++)
-    r[i] = ptr[i << 7] * twd_inv[(i << 7) | t1d]; // mult twiddle
+    //r[i] = ptr[i << 7] * twd_inv[(i << 7) | t1d]; // mult twiddle
+    r[i] = ptr[i << 7] * con_twd_inv[(i << 7) | t1d]; // mult twiddle
   NTTInv8(r);
   #pragma unroll
   for (int i = 0; i < 8; i ++)
@@ -129,7 +135,8 @@ void NTTInv1024Core(FFP* r,
   NTTInv8(r);
   #pragma unroll
   for (int i = 0; i < 8; i ++)
-    r[i] *= twd_sqrt_inv[(i << 7) | t1d]; // mult twiddle sqrt
+    //r[i] *= twd_sqrt_inv[(i << 7) | t1d]; // mult twiddle sqrt
+    r[i] *= con_twd_sqrt_inv[(i << 7) | t1d]; // mult twiddle sqrt
 }
 
 template <typename T>
@@ -147,6 +154,7 @@ void NTT1024(FFP* out,
   #pragma unroll
   for (int i = 0; i < 8; i ++)
     r[i] = FFP((T)in[(i << 7) | t1d]);
+  __syncthreads();
   NTT1024Core(r, temp_shared, twd, twd_sqrt, t1d, t3d);
   #pragma unroll
   for (int i = 0; i < 8; i ++)
@@ -171,6 +179,7 @@ void NTT1024Decomp(FFP* out,
   #pragma unroll
   for (int i = 0; i < 8; i ++)
     r[i] = FFP(((in[(i << 7) | t1d] >> rsh_bits) & mask) - offset);
+  __syncthreads();
   NTT1024Core(r, temp_shared, twd, twd_sqrt, t1d, t3d);
   #pragma unroll
   for (int i = 0; i < 8; i ++)
@@ -192,6 +201,7 @@ void NTTInv1024(T* out,
   #pragma unroll
   for (int i = 0; i < 8; i ++)
     r[i] = in[(i << 7) | t1d];
+  __syncthreads();
   NTTInv1024Core(r, temp_shared, twd_inv, twd_sqrt_inv, t1d, t3d);
   // mod 2^32 specifically
   uint64_t med = FFP::kModulus() / 2;
@@ -215,6 +225,7 @@ void NTTInv1024Add(T* out,
   #pragma unroll
   for (int i = 0; i < 8; i ++)
     r[i] = in[(i << 7) | t1d];
+  __syncthreads();
   NTTInv1024Core(r, temp_shared, twd_inv, twd_sqrt_inv, t1d, t3d);
   // mod 2^32 specifically
   uint64_t med = FFP::kModulus() / 2;

diff --git a/cufhe/lib/bootstrap_cpu.cc b/cufhe/lib/bootstrap_cpu.cc
@@ -158,6 +158,7 @@ void Bootstrap(LWESample* out,
     }
   }
 
+  delete ksk_entry;
   for (int i = 0; i < kpl; i ++)
     delete [] decomp[i];
   delete [] decomp;