Skip to content

Commit

Permalink
Code cleanup
Browse files Browse the repository at this point in the history
-GPU memory was being allocated from the host side but only used on the GPU side. Allocate this memory on the GPU side instead of passing the pointer every time.
  • Loading branch information
brichard19 committed Jun 28, 2018
1 parent f3a9a7d commit ae21e15
Show file tree
Hide file tree
Showing 6 changed files with 94 additions and 49 deletions.
3 changes: 3 additions & 0 deletions KeyFinderLib/KeyFinder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ KeyFinder::KeyFinder(int device, const secp256k1::uint256 &start, unsigned long
KeyFinder::~KeyFinder()
{
cleanupTargets();
cleanupChainBuf();

if(_devCtx) {
delete _devCtx;
Expand Down Expand Up @@ -138,6 +139,8 @@ void KeyFinder::init()

setTargetHashes();

allocateChainBuf(_numThreads * _numBlocks * _pointsPerThread);

// Set the incrementor
secp256k1::ecpoint g = secp256k1::G();
secp256k1::ecpoint p = secp256k1::multiplyPoint(secp256k1::uint256(_numThreads * _numBlocks * _pointsPerThread), g);
Expand Down
114 changes: 85 additions & 29 deletions KeyFinderLib/KeyFinder.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,11 @@ __constant__ unsigned int *_BLOOM_FILTER[1];
__constant__ unsigned int _USE_BLOOM_FILTER[1];
__constant__ unsigned int _INC_X[8];
__constant__ unsigned int _INC_Y[8];
__constant__ unsigned int *_CHAIN[1];

static bool _useBloomFilter = false;
static unsigned int *_bloomFilterPtr = NULL;
static unsigned int *_chainBufferPtr = NULL;

static const unsigned int _RIPEMD160_IV_HOST[5] = {
0x67452301,
Expand All @@ -40,28 +42,27 @@ static unsigned int swp(unsigned int x)
return (x << 24) | ((x << 8) & 0x00ff0000) | ((x >> 8) & 0x0000ff00) | (x >> 24);
}

void cleanupTargets()

static void undoRMD160FinalRound(const unsigned int hIn[5], unsigned int hOut[5])
{
if(_useBloomFilter) {
if(_bloomFilterPtr != NULL) {
cudaFree(_bloomFilterPtr);
_bloomFilterPtr = NULL;
}
for(int i = 0; i < 5; i++) {
hOut[i] = swp(hIn[i]) - _RIPEMD160_IV_HOST[(i + 1) % 5];
}
}

cudaError_t setTargetConstantMemory(const std::vector<struct hash160> &targets)

/**
Copies the target hashes to constant memory
*/
static cudaError_t setTargetConstantMemory(const std::vector<struct hash160> &targets)
{
unsigned int count = targets.size();


for(int i = 0; i < count; i++) {
for(unsigned int i = 0; i < count; i++) {
unsigned int h[5];

// Undo the final round of RIPEMD160 and endian swap to save some computation
for(int j = 0; j < 5; j++) {
h[j] = swp(targets[i].h[j]) - _RIPEMD160_IV_HOST[(j + 1) % 5];
}
undoRMD160FinalRound(targets[i].h, h);

cudaError_t err = cudaMemcpyToSymbol(_TARGET_HASH, h, sizeof(unsigned int) * 5, i * sizeof(unsigned int) * 5);

Expand All @@ -85,7 +86,10 @@ cudaError_t setTargetConstantMemory(const std::vector<struct hash160> &targets)
return cudaSuccess;
}

cudaError_t setTargetBloomFilter(const std::vector<struct hash160> &targets)
/**
Populates the bloom filter with the target hashes
*/
static cudaError_t setTargetBloomFilter(const std::vector<struct hash160> &targets)
{
unsigned int filter[BLOOM_FILTER_SIZE_WORDS];

Expand All @@ -96,20 +100,22 @@ cudaError_t setTargetBloomFilter(const std::vector<struct hash160> &targets)
}

memset(filter, 0, sizeof(unsigned int) * BLOOM_FILTER_SIZE_WORDS);

// Use the low 16 bits of each word in the hash as the index into the bloom filter
for(int i = 0; i < targets.size(); i++) {

for(unsigned int i = 0; i < targets.size(); i++) {

unsigned int h[5];

undoRMD160FinalRound(targets[i].h, h);

for(int j = 0; j < 5; j++) {
// Undo the final round of RIPEMD160 and endian swap to save some computation
unsigned int h = swp(targets[i].h[j]) - _RIPEMD160_IV_HOST[(j + 1) % 5];

unsigned int idx = h & 0xffff;

unsigned int idx = h[i] & 0xffff;

filter[idx / 32] |= (0x01 << (idx % 32));
}

}

// Copy to device
err = cudaMemcpy(_bloomFilterPtr, filter, sizeof(unsigned int) * BLOOM_FILTER_SIZE_WORDS, cudaMemcpyHostToDevice);
if(err) {
Expand All @@ -132,6 +138,19 @@ cudaError_t setTargetBloomFilter(const std::vector<struct hash160> &targets)
return err;
}


void cleanupTargets()
{
if(_useBloomFilter && _bloomFilterPtr != NULL) {
cudaFree(_bloomFilterPtr);
_bloomFilterPtr = NULL;
}
}

/**
*Copies the target hashes to either constant memory, or the bloom filter depending
on how many targets there are
*/
cudaError_t setTargetHash(const std::vector<struct hash160> &targets)
{
cleanupTargets();
Expand All @@ -144,7 +163,39 @@ cudaError_t setTargetHash(const std::vector<struct hash160> &targets)
}


/**
* Allocates device memory for storing the multiplication chain used in
the batch inversion operation
*/
cudaError_t allocateChainBuf(unsigned int count)
{
cudaError_t err = cudaMalloc(&_chainBufferPtr, count * sizeof(unsigned int) * 8);

if(err) {
return err;
}

err = cudaMemcpyToSymbol(_CHAIN, &_chainBufferPtr, sizeof(unsigned int *));
if(err) {
cudaFree(_chainBufferPtr);
}

return err;
}

void cleanupChainBuf()
{
if(_chainBufferPtr != NULL) {
cudaFree(_chainBufferPtr);
_chainBufferPtr = NULL;
}
}



/**
*Sets the EC point which all points will be incremented by
*/
cudaError_t setIncrementorPoint(const secp256k1::uint256 &x, const secp256k1::uint256 &y)
{
unsigned int xWords[8];
Expand All @@ -161,6 +212,8 @@ cudaError_t setIncrementorPoint(const secp256k1::uint256 &x, const secp256k1::ui
return cudaMemcpyToSymbol(_INC_Y, yWords, sizeof(unsigned int) * 8);
}



__device__ void hashPublicKey(const unsigned int *x, const unsigned int *y, unsigned int *digestOut)
{
unsigned int hash[8];
Expand Down Expand Up @@ -243,8 +296,10 @@ __device__ bool checkHash(unsigned int hash[5])
return foundMatch;
}

__device__ void doIteration(unsigned int *xPtr, unsigned int *yPtr, unsigned int *chain, int pointsPerThread, unsigned int *numResults, void *results, int compression)
__device__ void doIteration(unsigned int *xPtr, unsigned int *yPtr, int pointsPerThread, unsigned int *numResults, void *results, int compression)
{
unsigned int *chain = _CHAIN[0];

// Multiply together all (_Gx - x) and then invert
unsigned int inverse[8] = { 0,0,0,0,0,0,0,1 };
for(int i = 0; i < pointsPerThread; i++) {
Expand All @@ -254,7 +309,6 @@ __device__ void doIteration(unsigned int *xPtr, unsigned int *yPtr, unsigned int

readInt(xPtr, i, x);


if(compression == PointCompressionType::UNCOMPRESSED || compression == PointCompressionType::BOTH) {
unsigned int y[8];
readInt(yPtr, i, y);
Expand Down Expand Up @@ -293,8 +347,10 @@ __device__ void doIteration(unsigned int *xPtr, unsigned int *yPtr, unsigned int
}
}

__device__ void doIterationWithDouble(unsigned int *xPtr, unsigned int *yPtr, unsigned int *chain, int pointsPerThread, unsigned int *numResults, void *results, int compression)
__device__ void doIterationWithDouble(unsigned int *xPtr, unsigned int *yPtr, int pointsPerThread, unsigned int *numResults, void *results, int compression)
{
unsigned int *chain = _CHAIN[0];

// Multiply together all (_Gx - x) and then invert
unsigned int inverse[8] = { 0,0,0,0,0,0,0,1 };
for(int i = 0; i < pointsPerThread; i++) {
Expand Down Expand Up @@ -348,12 +404,12 @@ __device__ void doIterationWithDouble(unsigned int *xPtr, unsigned int *yPtr, un
/**
* Performs a single iteration
*/
__global__ void keyFinderKernel(int points, unsigned int *x, unsigned int *y, unsigned int *chain, unsigned int *numResults, void *results, int compression)
__global__ void keyFinderKernel(int points, unsigned int *x, unsigned int *y, unsigned int *numResults, void *results, int compression)
{
doIteration(x, y, chain, points, numResults, results, compression);
doIteration(x, y, points, numResults, results, compression);
}

__global__ void keyFinderKernelWithDouble(int points, unsigned int *x, unsigned int *y, unsigned int *chain, unsigned int *numResults, void *results, int compression)
__global__ void keyFinderKernelWithDouble(int points, unsigned int *x, unsigned int *y, unsigned int *numResults, void *results, int compression)
{
doIterationWithDouble(x, y, chain, points, numResults, results, compression);
doIterationWithDouble(x, y, points, numResults, results, compression);
}
8 changes: 4 additions & 4 deletions KeyFinderLib/cudabridge.cu
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
#include "cudabridge.h"


__global__ void keyFinderKernel(int points, unsigned int *x, unsigned int *y, unsigned int *chain, unsigned int *numResults, void *results, int compression);
__global__ void keyFinderKernelWithDouble(int points, unsigned int *x, unsigned int *y, unsigned int *chain, unsigned int *numResults, void *results, int compression);
__global__ void keyFinderKernel(int points, unsigned int *x, unsigned int *y, unsigned int *numResults, void *results, int compression);
__global__ void keyFinderKernelWithDouble(int points, unsigned int *x, unsigned int *y, unsigned int *numResults, void *results, int compression);

void callKeyFinderKernel(KernelParams &params, bool useDouble, int compression)
{
if(useDouble) {
keyFinderKernelWithDouble <<<params.blocks, params.threads >> >(params.points, params.x, params.y, params.chain, params.numResults, params.results, compression);
keyFinderKernelWithDouble <<<params.blocks, params.threads >> >(params.points, params.x, params.y, params.numResults, params.results, compression);
} else {
keyFinderKernel <<<params.blocks, params.threads >> > (params.points, params.x, params.y, params.chain, params.numResults, params.results, compression);
keyFinderKernel <<<params.blocks, params.threads >> > (params.points, params.x, params.y, params.numResults, params.results, compression);
}
waitForKernel();
}
Expand Down
2 changes: 2 additions & 0 deletions KeyFinderLib/cudabridge.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,7 @@ cudaError_t setTargetHash(const std::vector<struct hash160> &targets);
void cleanupTargets();

cudaError_t setIncrementorPoint(const secp256k1::uint256 &x, const secp256k1::uint256 &y);
cudaError_t allocateChainBuf(unsigned int count);
void cleanupChainBuf();

#endif
13 changes: 0 additions & 13 deletions cudaDeviceContext/DeviceContext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,6 @@ void CudaDeviceContext::init(const DeviceParameters &params)
goto end;
}

// Allocate chain buf
err = cudaMalloc(&_chain, sizeof(unsigned int) * count * 8);
if(err) {
goto end;
}

// The number of results found in the most recent kernel run
_numResultsHost = NULL;
err = cudaHostAlloc(&_numResultsHost, sizeof(unsigned int), cudaHostAllocMapped);
Expand Down Expand Up @@ -87,7 +81,6 @@ void CudaDeviceContext::init(const DeviceParameters &params)
if(err) {
cudaFree(_x);
cudaFree(_y);
cudaFree(_chain);
cudaFreeHost(_numResultsHost);
cudaFree(_numResultsDev);
cudaFreeHost(_resultsHost);
Expand Down Expand Up @@ -173,10 +166,6 @@ void CudaDeviceContext::cleanup()
cudaFree(_y);
}

if(_chain != NULL) {
cudaFree(_chain);
}

if(_numResultsHost != NULL) {
cudaFreeHost(_numResultsHost);
}
Expand All @@ -187,7 +176,6 @@ void CudaDeviceContext::cleanup()

_x = NULL;
_y = NULL;
_chain = NULL;

cudaDeviceReset();
}
Expand All @@ -201,7 +189,6 @@ KernelParams CudaDeviceContext::getKernelParams()
params.points = _pointsPerThread;
params.x = _x;
params.y = _y;
params.chain = _chain;

params.results = _resultsDev;
params.numResults = _numResultsDev;
Expand Down
3 changes: 0 additions & 3 deletions cudaDeviceContext/DeviceContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ typedef struct {

unsigned int *x;
unsigned int *y;
unsigned int *chain;

unsigned int *results;
unsigned int *numResults;
Expand Down Expand Up @@ -80,7 +79,6 @@ class CudaDeviceContext : DeviceContext {

unsigned int *_x;
unsigned int *_y;
unsigned int *_chain;

unsigned int *_numResultsHost;
unsigned int *_numResultsDev;
Expand All @@ -99,7 +97,6 @@ class CudaDeviceContext : DeviceContext {

_x = NULL;
_y = NULL;
_chain = NULL;
}

void init(const DeviceParameters &params);
Expand Down

0 comments on commit ae21e15

Please sign in to comment.