Skip to content

Commit

Permalink
Merge pull request #1 from vernamlab/master
Browse files Browse the repository at this point in the history
Sync with cuFHE
  • Loading branch information
WeiDaiWD authored Sep 19, 2018
2 parents 6cb1976 + f75f734 commit 4178b74
Show file tree
Hide file tree
Showing 20 changed files with 785 additions and 115 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ The cuFHE library is an open-source library for Fully Homomorphic Encryption (FH

| [TFHE lib](https://github.com/tfhe/tfhe) | cuFHE | Speedup |
|---|---|---|
| 13 ms | 0.6 ms | 22 times |
| 13 ms | **0.5 ms** | 26 times |

### System Requirements
**The library has been tested on Ubuntu Desktop 16.04 only.**
Expand Down
24 changes: 12 additions & 12 deletions cufhe/Makefile
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
## Compilers
CC = g++
CU = /usr/local/cuda/bin/nvcc
CU = nvcc
## Flags
FLAGS = -std=c++11 -O3 -w
CU_FLAGS = -Wno-deprecated-gpu-targets
CU_FLAGS = #-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_70,code=compute_70
##-Wno-deprecated-gpu-targets
INC = -I./
## Python header files
PYTHON_VERSION = 2.7
Expand Down Expand Up @@ -60,20 +61,20 @@ $(DIR_OBJ)/test/test_api_cpu.o: test/test_api_cpu.cc

$(DIR_BIN)/test_api_gpu: $(DIR_OBJ)/test/test_api_gpu.o
$(dir_guard)
$(CU) $(FLAGS) -o $@ $(DIR_OBJ)/test/test_api_gpu.o -L$(DIR_BIN) -lcufhe_gpu
$(CU) $(FLAGS) $(CU_FLAGS) -o $@ $(DIR_OBJ)/test/test_api_gpu.o -L$(DIR_BIN) -lcufhe_gpu

$(DIR_OBJ)/test/test_api_gpu.o: test/test_api_gpu.cu
$(dir_guard)
$(CU) $(FLAGS) $(INC) -M -o $(@:%.o=%.d) $<
$(CU) $(FLAGS) $(INC) -c -o $@ $<
$(CU) $(FLAGS) $(CU_FLAGS) $(INC) -M -o $(@:%.o=%.d) $<
$(CU) $(FLAGS) $(CU_FLAGS) $(INC) -c -o $@ $<

$(DIR_BIN)/libcufhe_cpu.so: $(CC_OBJ)
$(dir_guard)
$(CU) $(FLAGS) -shared -o $@ $(CC_OBJ)
$(CU) $(FLAGS) $(CU_FLAGS) -shared -o $@ $(CC_OBJ)

$(DIR_BIN)/libcufhe_gpu.so: $(CU_OBJ) $(DIR_OBJ)/cufhe.o $(DIR_OBJ)/cufhe_io.o
$(dir_guard)
$(CU) $(FLAGS) -shared -o $@ $(CU_OBJ) $(DIR_OBJ)/cufhe.o $(DIR_OBJ)/cufhe_io.o
$(CU) $(FLAGS) $(CU_FLAGS) -shared -o $@ $(CU_OBJ) $(DIR_OBJ)/cufhe.o $(DIR_OBJ)/cufhe_io.o

$(CC_OBJ): $(CC_SRC)
$(dir_guard)
Expand All @@ -84,9 +85,9 @@ $(CC_OBJ): $(CC_SRC)

$(CU_OBJ): $(CU_SRC)
$(dir_guard)
$(CU) $(FLAGS) $(INC) -M -o $(@:%.o=%.d) \
$(CU) $(FLAGS) $(CU_FLAGS) $(INC) -M -o $(@:%.o=%.d) \
$(patsubst $(DIR_OBJ)/%,$(DIR_SRC)/%,$(@:%.o=%.cu))
$(CU) $(FLAGS) $(INC) -c -o $@ \
$(CU) $(FLAGS) $(CU_FLAGS) $(INC) -c -o $@ \
$(patsubst $(DIR_OBJ)/%,$(DIR_SRC)/%,$(@:%.o=%.cu)) -Xcompiler '-fPIC'

$(DIR_PY)/fhepy_cpu.so: $(DIR_PY)/fhepy_cpu.o
Expand All @@ -96,8 +97,7 @@ $(DIR_PY)/fhepy_cpu.o: $(DIR_PY)/fhepy.cpp
$(CC) $(INC) $(PY_INC) -fPIC -c $(DIR_PY)/fhepy.cpp $(FLAGS) -o $@

$(DIR_PY)/fhepy_gpu.so: $(DIR_PY)/fhepy_gpu.o
$(CU) -shared $(DIR_PY)/fhepy_gpu.o $(PY_LIB) -L$(DIR_BIN) -lcufhe_gpu -o $@
$(CU) $(CU_FLAGS) -shared $(DIR_PY)/fhepy_gpu.o $(PY_LIB) -L$(DIR_BIN) -lcufhe_gpu -o $@

$(DIR_PY)/fhepy_gpu.o: $(DIR_PY)/fhepy.cu
$(CU) $(INC) $(PY_INC) -Xcompiler '-fPIC' -c $(DIR_PY)/fhepy.cu $(FLAGS) -o $@

$(CU) $(CU_FLAGS) $(INC) $(PY_INC) -Xcompiler '-fPIC' -c $(DIR_PY)/fhepy.cu $(FLAGS) -o $@
37 changes: 36 additions & 1 deletion cufhe/include/bootstrap_gpu.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,45 @@ void BootstrappingKeyToNTT(const BootstrappingKey* bk);
void KeySwitchingKeyToDevice(const KeySwitchingKey* ksk);
void DeleteBootstrappingKeyNTT();
void DeleteKeySwitchingKey();

void Bootstrap(LWESample* out,
LWESample* in,
Torus mu,
cudaStream_t st = 0);
void NandBootstrap(LWESample* out,
LWESample* in0,
LWESample* in1,
Torus mu,
Torus fix,
cudaStream_t st);
void OrBootstrap(LWESample* out,
LWESample* in0,
LWESample* in1,
Torus mu,
Torus fix,
cudaStream_t st);
void AndBootstrap(LWESample* out,
LWESample* in0,
LWESample* in1,
Torus mu,
Torus fix,
cudaStream_t st);
void NorBootstrap(LWESample* out,
LWESample* in0,
LWESample* in1,
Torus mu,
Torus fix,
cudaStream_t st);
void XorBootstrap(LWESample* out,
LWESample* in0,
LWESample* in1,
Torus mu,
Torus fix,
cudaStream_t st);
void XnorBootstrap(LWESample* out,
LWESample* in0,
LWESample* in1,
Torus mu,
Torus fix,
cudaStream_t st);

} // namespace cufhe
2 changes: 2 additions & 0 deletions cufhe/include/cufhe.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ struct Ctxt {
~Ctxt();
LWESample* lwe_sample_;
MemoryDeleter lwe_sample_deleter_;
LWESample* lwe_sample_device_;
MemoryDeleter lwe_sample_device_deleter_;
};

/** Plaintext is in {0, 1}. */
Expand Down
2 changes: 1 addition & 1 deletion cufhe/include/cufhe_gpu.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ public:
inline Stream() {}
inline Stream(int id) { Assert(id == 0); st_ = 0; }
inline ~Stream() {}
inline void Create() { cudaStreamCreateWithFlags(&this->st_, cudaStreamDefault); }
inline void Create() { cudaStreamCreateWithFlags(&this->st_, cudaStreamNonBlocking); }
inline void Destroy() { cudaStreamDestroy(this->st_); }
inline cudaStream_t st() { return st_; };
private:
Expand Down
13 changes: 13 additions & 0 deletions cufhe/include/ntt_gpu/ntt.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "ntt_1024_device.cuh"
#include "ntt_1024_twiddle.cuh"
#include <include/details/math.h>
#include <include/details/error_gpu.cuh>

namespace cufhe {

Expand All @@ -41,6 +42,18 @@ public:
__host__ __device__ inline
~CuNTTHandler() {};

inline
void CreateConstant() {
CuSafeCall(cudaMemcpyToSymbol(con_twd, this->twd_,
sizeof(FFP) * 1024, 0, cudaMemcpyDeviceToDevice));
CuSafeCall(cudaMemcpyToSymbol(con_twd_inv, this->twd_inv_,
sizeof(FFP) * 1024, 0, cudaMemcpyDeviceToDevice));
CuSafeCall(cudaMemcpyToSymbol(con_twd_sqrt, this->twd_sqrt_,
sizeof(FFP) * 1024, 0, cudaMemcpyDeviceToDevice));
CuSafeCall(cudaMemcpyToSymbol(con_twd_sqrt_inv, this->twd_sqrt_inv_,
sizeof(FFP) * 1024, 0, cudaMemcpyDeviceToDevice));
}

template <typename T>
__device__ inline
void NTT(FFP* out,
Expand Down
23 changes: 17 additions & 6 deletions cufhe/include/ntt_gpu/ntt_1024_device.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@

namespace cufhe {

__constant__ FFP con_twd[1024];
__constant__ FFP con_twd_inv[1024];
__constant__ FFP con_twd_sqrt[1024];
__constant__ FFP con_twd_sqrt_inv[1024];

__device__ inline
void NTT1024Core(FFP* r,
FFP* s,
Expand All @@ -37,10 +42,10 @@ void NTT1024Core(FFP* r,
const uint32_t& t1d,
const uint3& t3d) {
FFP *ptr = nullptr;
__syncthreads();
#pragma unroll
for (int i = 0; i < 8; i ++)
r[i] *= twd_sqrt[(i << 7) | t1d]; // mult twiddle sqrt
//r[i] *= twd_sqrt[(i << 7) | t1d]; // mult twiddle sqrt
r[i] *= con_twd_sqrt[(i << 7) | t1d]; // mult twiddle sqrt
NTT8(r);
NTT8x2Lsh(r, t3d.z); // if (t1d >= 64) NTT8x2<1>(r);
ptr = &s[(t3d.y << 7) | (t3d.z << 6) | (t3d.x << 2)];
Expand All @@ -65,7 +70,8 @@ void NTT1024Core(FFP* r,
ptr = &s[t1d];
#pragma unroll
for (int i = 0; i < 8; i ++)
r[i] = ptr[i << 7] * twd[(i << 7) | t1d]; // mult twiddle
//r[i] = ptr[i << 7] * twd[(i << 7) | t1d]; // mult twiddle
r[i] = ptr[i << 7] * con_twd[(i << 7) | t1d]; // mult twiddle
NTT8(r);
#pragma unroll
for (int i = 0; i < 8; i ++)
Expand All @@ -89,7 +95,6 @@ void NTTInv1024Core(FFP* r,
const uint3& t3d) {

FFP *ptr = nullptr;
__syncthreads();
NTTInv8(r);
NTTInv8x2Lsh(r, t3d.z); // if (t1d >= 64) NTT8x2<1>(r);
ptr = &s[(t3d.y << 7) | (t3d.z << 6) | (t3d.x << 2)];
Expand All @@ -114,7 +119,8 @@ void NTTInv1024Core(FFP* r,
ptr = &s[t1d];
#pragma unroll
for (int i = 0; i < 8; i ++)
r[i] = ptr[i << 7] * twd_inv[(i << 7) | t1d]; // mult twiddle
//r[i] = ptr[i << 7] * twd_inv[(i << 7) | t1d]; // mult twiddle
r[i] = ptr[i << 7] * con_twd_inv[(i << 7) | t1d]; // mult twiddle
NTTInv8(r);
#pragma unroll
for (int i = 0; i < 8; i ++)
Expand All @@ -129,7 +135,8 @@ void NTTInv1024Core(FFP* r,
NTTInv8(r);
#pragma unroll
for (int i = 0; i < 8; i ++)
r[i] *= twd_sqrt_inv[(i << 7) | t1d]; // mult twiddle sqrt
//r[i] *= twd_sqrt_inv[(i << 7) | t1d]; // mult twiddle sqrt
r[i] *= con_twd_sqrt_inv[(i << 7) | t1d]; // mult twiddle sqrt
}

template <typename T>
Expand All @@ -147,6 +154,7 @@ void NTT1024(FFP* out,
#pragma unroll
for (int i = 0; i < 8; i ++)
r[i] = FFP((T)in[(i << 7) | t1d]);
__syncthreads();
NTT1024Core(r, temp_shared, twd, twd_sqrt, t1d, t3d);
#pragma unroll
for (int i = 0; i < 8; i ++)
Expand All @@ -171,6 +179,7 @@ void NTT1024Decomp(FFP* out,
#pragma unroll
for (int i = 0; i < 8; i ++)
r[i] = FFP(((in[(i << 7) | t1d] >> rsh_bits) & mask) - offset);
__syncthreads();
NTT1024Core(r, temp_shared, twd, twd_sqrt, t1d, t3d);
#pragma unroll
for (int i = 0; i < 8; i ++)
Expand All @@ -192,6 +201,7 @@ void NTTInv1024(T* out,
#pragma unroll
for (int i = 0; i < 8; i ++)
r[i] = in[(i << 7) | t1d];
__syncthreads();
NTTInv1024Core(r, temp_shared, twd_inv, twd_sqrt_inv, t1d, t3d);
// mod 2^32 specifically
uint64_t med = FFP::kModulus() / 2;
Expand All @@ -215,6 +225,7 @@ void NTTInv1024Add(T* out,
#pragma unroll
for (int i = 0; i < 8; i ++)
r[i] = in[(i << 7) | t1d];
__syncthreads();
NTTInv1024Core(r, temp_shared, twd_inv, twd_sqrt_inv, t1d, t3d);
// mod 2^32 specifically
uint64_t med = FFP::kModulus() / 2;
Expand Down
1 change: 1 addition & 0 deletions cufhe/lib/bootstrap_cpu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ void Bootstrap(LWESample* out,
}
}

delete ksk_entry;
for (int i = 0; i < kpl; i ++)
delete [] decomp[i];
delete [] decomp;
Expand Down
Loading

0 comments on commit 4178b74

Please sign in to comment.