Code cleanup

-GPU memory was being allocated from the host side but only used on the GPU side. Allocate this memory on the GPU side instead of passing the pointer every time.
Slyfrosty · Jun 28, 2018 · ae21e15 · ae21e15
1 parent f3a9a7d
commit ae21e15
Show file tree

Hide file tree

Showing 6 changed files with 94 additions and 49 deletions.
diff --git a/KeyFinderLib/KeyFinder.cpp b/KeyFinderLib/KeyFinder.cpp
@@ -79,6 +79,7 @@ KeyFinder::KeyFinder(int device, const secp256k1::uint256 &start, unsigned long
 KeyFinder::~KeyFinder()
 {
 	cleanupTargets();
+	cleanupChainBuf();
 
 	if(_devCtx) {
 		delete _devCtx;
@@ -138,6 +139,8 @@ void KeyFinder::init()
 
 	setTargetHashes();
 
+	allocateChainBuf(_numThreads * _numBlocks * _pointsPerThread);
+
 	// Set the incrementor
 	secp256k1::ecpoint g = secp256k1::G();
 	secp256k1::ecpoint p = secp256k1::multiplyPoint(secp256k1::uint256(_numThreads * _numBlocks * _pointsPerThread), g);

diff --git a/KeyFinderLib/KeyFinder.cu b/KeyFinderLib/KeyFinder.cu
@@ -23,9 +23,11 @@ __constant__ unsigned int *_BLOOM_FILTER[1];
 __constant__ unsigned int _USE_BLOOM_FILTER[1];
 __constant__ unsigned int _INC_X[8];
 __constant__ unsigned int _INC_Y[8];
+__constant__ unsigned int *_CHAIN[1];
 
 static bool _useBloomFilter = false;
 static unsigned int *_bloomFilterPtr = NULL;
+static unsigned int *_chainBufferPtr = NULL;
 
 static const unsigned int _RIPEMD160_IV_HOST[5] = {
 	0x67452301,
@@ -40,28 +42,27 @@ static unsigned int swp(unsigned int x)
 	return (x << 24) | ((x << 8) & 0x00ff0000) | ((x >> 8) & 0x0000ff00) | (x >> 24);
 }
 
-void cleanupTargets()
+
+static void undoRMD160FinalRound(const unsigned int hIn[5], unsigned int hOut[5])
 {
-	if(_useBloomFilter) {
-		if(_bloomFilterPtr != NULL) {
-			cudaFree(_bloomFilterPtr);
-			_bloomFilterPtr = NULL;
-		}
+	for(int i = 0; i < 5; i++) {
+		hOut[i] = swp(hIn[i]) - _RIPEMD160_IV_HOST[(i + 1) % 5];
 	}
 }
 
-cudaError_t setTargetConstantMemory(const std::vector<struct hash160> &targets)
+
+/**
+ Copies the target hashes to constant memory
+ */
+static cudaError_t setTargetConstantMemory(const std::vector<struct hash160> &targets)
 {
 	unsigned int count = targets.size();
 
 
-	for(int i = 0; i < count; i++) {
+	for(unsigned int i = 0; i < count; i++) {
 		unsigned int h[5];
 
-		// Undo the final round of RIPEMD160 and endian swap to save some computation
-		for(int j = 0; j < 5; j++) {
-			h[j] = swp(targets[i].h[j]) - _RIPEMD160_IV_HOST[(j + 1) % 5];
-		}
+		undoRMD160FinalRound(targets[i].h, h);
 
 		cudaError_t err = cudaMemcpyToSymbol(_TARGET_HASH, h, sizeof(unsigned int) * 5, i * sizeof(unsigned int) * 5);
 
@@ -85,7 +86,10 @@ cudaError_t setTargetConstantMemory(const std::vector<struct hash160> &targets)
 	return cudaSuccess;
 }
 
-cudaError_t setTargetBloomFilter(const std::vector<struct hash160> &targets)
+/**
+Populates the bloom filter with the target hashes
+*/
+static cudaError_t setTargetBloomFilter(const std::vector<struct hash160> &targets)
 {
 	unsigned int filter[BLOOM_FILTER_SIZE_WORDS];
 
@@ -96,20 +100,22 @@ cudaError_t setTargetBloomFilter(const std::vector<struct hash160> &targets)
 	}
 
 	memset(filter, 0, sizeof(unsigned int) * BLOOM_FILTER_SIZE_WORDS);
-	
+
 	// Use the low 16 bits of each word in the hash as the index into the bloom filter
-	for(int i = 0; i < targets.size(); i++) {
-
+	for(unsigned int i = 0; i < targets.size(); i++) {
+
+		unsigned int h[5];
+
+		undoRMD160FinalRound(targets[i].h, h);
+
 		for(int j = 0; j < 5; j++) {
-			// Undo the final round of RIPEMD160 and endian swap to save some computation
-			unsigned int h = swp(targets[i].h[j]) - _RIPEMD160_IV_HOST[(j + 1) % 5];
-
-			unsigned int idx = h & 0xffff;
-
+			unsigned int idx = h[i] & 0xffff;
+
 			filter[idx / 32] |= (0x01 << (idx % 32));
 		}
+
 	}
-	
+
 	// Copy to device
 	err = cudaMemcpy(_bloomFilterPtr, filter, sizeof(unsigned int) * BLOOM_FILTER_SIZE_WORDS, cudaMemcpyHostToDevice);
 	if(err) {
@@ -132,6 +138,19 @@ cudaError_t setTargetBloomFilter(const std::vector<struct hash160> &targets)
 	return err;
 }
 
+
+void cleanupTargets()
+{
+	if(_useBloomFilter && _bloomFilterPtr != NULL) {
+		cudaFree(_bloomFilterPtr);
+		_bloomFilterPtr = NULL;
+	}
+}
+
+/**
+ *Copies the target hashes to either constant memory, or the bloom filter depending
+ on how many targets there are
+ */
 cudaError_t setTargetHash(const std::vector<struct hash160> &targets)
 {
 	cleanupTargets();
@@ -144,7 +163,39 @@ cudaError_t setTargetHash(const std::vector<struct hash160> &targets)
 }
 
 
+/**
+ * Allocates device memory for storing the multiplication chain used in
+ the batch inversion operation
+ */
+cudaError_t allocateChainBuf(unsigned int count)
+{
+	cudaError_t err = cudaMalloc(&_chainBufferPtr, count * sizeof(unsigned int) * 8);
+
+	if(err) {
+		return err;
+	}
+
+	err = cudaMemcpyToSymbol(_CHAIN, &_chainBufferPtr, sizeof(unsigned int *));
+	if(err) {
+		cudaFree(_chainBufferPtr);
+	}
+
+	return err;
+}
 
+void cleanupChainBuf()
+{
+	if(_chainBufferPtr != NULL) {
+		cudaFree(_chainBufferPtr);
+		_chainBufferPtr = NULL;
+	}
+}
+
+
+
+/**
+ *Sets the EC point which all points will be incremented by
+ */
 cudaError_t setIncrementorPoint(const secp256k1::uint256 &x, const secp256k1::uint256 &y)
 {
 	unsigned int xWords[8];
@@ -161,6 +212,8 @@ cudaError_t setIncrementorPoint(const secp256k1::uint256 &x, const secp256k1::ui
 	return cudaMemcpyToSymbol(_INC_Y, yWords, sizeof(unsigned int) * 8);
 }
 
+
+
 __device__ void hashPublicKey(const unsigned int *x, const unsigned int *y, unsigned int *digestOut)
 {
 	unsigned int hash[8];
@@ -243,8 +296,10 @@ __device__ bool checkHash(unsigned int hash[5])
 	return foundMatch;
 }
 
-__device__ void doIteration(unsigned int *xPtr, unsigned int *yPtr, unsigned int *chain, int pointsPerThread, unsigned int *numResults, void *results, int compression)
+__device__ void doIteration(unsigned int *xPtr, unsigned int *yPtr, int pointsPerThread, unsigned int *numResults, void *results, int compression)
 {
+	unsigned int *chain = _CHAIN[0];
+
 	// Multiply together all (_Gx - x) and then invert
 	unsigned int inverse[8] = { 0,0,0,0,0,0,0,1 };
 	for(int i = 0; i < pointsPerThread; i++) {
@@ -254,7 +309,6 @@ __device__ void doIteration(unsigned int *xPtr, unsigned int *yPtr, unsigned int
 
 		readInt(xPtr, i, x);
 
-
 		if(compression == PointCompressionType::UNCOMPRESSED || compression == PointCompressionType::BOTH) {
 			unsigned int y[8];
 			readInt(yPtr, i, y);
@@ -293,8 +347,10 @@ __device__ void doIteration(unsigned int *xPtr, unsigned int *yPtr, unsigned int
 	}
 }
 
-__device__ void doIterationWithDouble(unsigned int *xPtr, unsigned int *yPtr, unsigned int *chain, int pointsPerThread, unsigned int *numResults, void *results, int compression)
+__device__ void doIterationWithDouble(unsigned int *xPtr, unsigned int *yPtr, int pointsPerThread, unsigned int *numResults, void *results, int compression)
 {
+	unsigned int *chain = _CHAIN[0];
+
 	// Multiply together all (_Gx - x) and then invert
 	unsigned int inverse[8] = { 0,0,0,0,0,0,0,1 };
 	for(int i = 0; i < pointsPerThread; i++) {
@@ -348,12 +404,12 @@ __device__ void doIterationWithDouble(unsigned int *xPtr, unsigned int *yPtr, un
 /**
 * Performs a single iteration
 */
-__global__ void keyFinderKernel(int points, unsigned int *x, unsigned int *y, unsigned int *chain, unsigned int *numResults, void *results, int compression)
+__global__ void keyFinderKernel(int points, unsigned int *x, unsigned int *y, unsigned int *numResults, void *results, int compression)
 {
-	doIteration(x, y, chain, points, numResults, results, compression);
+	doIteration(x, y, points, numResults, results, compression);
 }
 
-__global__ void keyFinderKernelWithDouble(int points, unsigned int *x, unsigned int *y, unsigned int *chain, unsigned int *numResults, void *results, int compression)
+__global__ void keyFinderKernelWithDouble(int points, unsigned int *x, unsigned int *y, unsigned int *numResults, void *results, int compression)
 {
-	doIterationWithDouble(x, y, chain, points, numResults, results, compression);
+	doIterationWithDouble(x, y, points, numResults, results, compression);
 }
diff --git a/KeyFinderLib/cudabridge.cu b/KeyFinderLib/cudabridge.cu
@@ -1,15 +1,15 @@
 #include "cudabridge.h"
 
 
-__global__ void keyFinderKernel(int points, unsigned int *x, unsigned int *y, unsigned int *chain, unsigned int *numResults, void *results, int compression);
-__global__ void keyFinderKernelWithDouble(int points, unsigned int *x, unsigned int *y, unsigned int *chain, unsigned int *numResults, void *results, int compression);
+__global__ void keyFinderKernel(int points, unsigned int *x, unsigned int *y, unsigned int *numResults, void *results, int compression);
+__global__ void keyFinderKernelWithDouble(int points, unsigned int *x, unsigned int *y, unsigned int *numResults, void *results, int compression);
 
 void callKeyFinderKernel(KernelParams &params, bool useDouble, int compression)
 {
 	if(useDouble) {
-		keyFinderKernelWithDouble <<<params.blocks, params.threads >> >(params.points, params.x, params.y, params.chain, params.numResults, params.results, compression);
+		keyFinderKernelWithDouble <<<params.blocks, params.threads >> >(params.points, params.x, params.y, params.numResults, params.results, compression);
 	} else {
-		keyFinderKernel <<<params.blocks, params.threads >> > (params.points, params.x, params.y, params.chain, params.numResults, params.results, compression);
+		keyFinderKernel <<<params.blocks, params.threads >> > (params.points, params.x, params.y, params.numResults, params.results, compression);
 	}
 	waitForKernel();
 }

diff --git a/KeyFinderLib/cudabridge.h b/KeyFinderLib/cudabridge.h
@@ -39,5 +39,7 @@ cudaError_t setTargetHash(const std::vector<struct hash160> &targets);
 void cleanupTargets();
 
 cudaError_t setIncrementorPoint(const secp256k1::uint256 &x, const secp256k1::uint256 &y);
+cudaError_t allocateChainBuf(unsigned int count);
+void cleanupChainBuf();
 
 #endif
diff --git a/cudaDeviceContext/DeviceContext.cpp b/cudaDeviceContext/DeviceContext.cpp
@@ -48,12 +48,6 @@ void CudaDeviceContext::init(const DeviceParameters &params)
 		goto end;
 	}
 
-	// Allocate chain buf
-	err = cudaMalloc(&_chain, sizeof(unsigned int) * count * 8);
-	if(err) {
-		goto end;
-	}
-
 	// The number of results found in the most recent kernel run
 	_numResultsHost = NULL;
 	err = cudaHostAlloc(&_numResultsHost, sizeof(unsigned int), cudaHostAllocMapped);
@@ -87,7 +81,6 @@ void CudaDeviceContext::init(const DeviceParameters &params)
 	if(err) {
 		cudaFree(_x);
 		cudaFree(_y);
-		cudaFree(_chain);
 		cudaFreeHost(_numResultsHost);
 		cudaFree(_numResultsDev);
 		cudaFreeHost(_resultsHost);
@@ -173,10 +166,6 @@ void CudaDeviceContext::cleanup()
 		cudaFree(_y);
 	}
 
-	if(_chain != NULL) {
-		cudaFree(_chain);
-	}
-
 	if(_numResultsHost != NULL) {
 		cudaFreeHost(_numResultsHost);
 	}
@@ -187,7 +176,6 @@ void CudaDeviceContext::cleanup()
 
 	_x = NULL;
 	_y = NULL;
-	_chain = NULL;
 
 	cudaDeviceReset();
 }
@@ -201,7 +189,6 @@ KernelParams CudaDeviceContext::getKernelParams()
 	params.points = _pointsPerThread;
 	params.x = _x;
 	params.y = _y;
-	params.chain = _chain;
 
 	params.results = _resultsDev;
 	params.numResults = _numResultsDev;

diff --git a/cudaDeviceContext/DeviceContext.h b/cudaDeviceContext/DeviceContext.h
@@ -24,7 +24,6 @@ typedef struct {
 
 	unsigned int *x;
 	unsigned int *y;
-	unsigned int *chain;
 
 	unsigned int *results;
 	unsigned int *numResults;
@@ -80,7 +79,6 @@ class CudaDeviceContext : DeviceContext {
 
 	unsigned int *_x;
 	unsigned int *_y;
-	unsigned int *_chain;
 
 	unsigned int *_numResultsHost;
 	unsigned int *_numResultsDev;
@@ -99,7 +97,6 @@ class CudaDeviceContext : DeviceContext {
 
 		_x = NULL;
 		_y = NULL;
-		_chain = NULL;
 	}
 
 	void init(const DeviceParameters &params);