Skip to content

Commit

Permalink
Add GPU-memory arena for efficient, reusable temporary allocations
Browse files Browse the repository at this point in the history
  • Loading branch information
Tom94 committed Feb 10, 2022
1 parent 57fcd33 commit d3729d2
Show file tree
Hide file tree
Showing 15 changed files with 562 additions and 397 deletions.
22 changes: 19 additions & 3 deletions include/tiny-cuda-nn/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ enum class Activation {
//////////////////

uint32_t cuda_compute_capability(int device = 0);
size_t cuda_memory_granularity(int device = 0);

std::string to_lower(std::string str);
std::string to_upper(std::string str);
Expand Down Expand Up @@ -130,14 +131,29 @@ inline uint32_t powi(uint32_t base, uint32_t exponent) {
// CUDA ERROR HANDLING (EXCEPTIONS) //
//////////////////////////////////////

// Stringification helpers. STR() forces macro expansion of its argument
// before STRINGIFY() turns it into a string literal, so FILE_LINE yields
// e.g. "common.h:130" (the expanded __LINE__, not the literal text) for
// use in error messages.
#define STRINGIFY(x) #x
#define STR(x) STRINGIFY(x)
#define FILE_LINE __FILE__ ":" STR(__LINE__)

/// Checks the result of a cuXXXXXX call and throws an error on failure
#define CU_CHECK_THROW(x) \
do { \
CUresult result = x; \
if (result != CUDA_SUCCESS) { \
const char *msg; \
cuGetErrorName(result, &msg); \
throw std::runtime_error(std::string("CUDA Error: " #x " failed with error ") + msg); \
throw std::runtime_error(std::string(FILE_LINE " " #x " failed with error ") + msg); \
} \
} while(0)

/// Checks the result of a cuXXXXXX call and prints an error on failure.
/// Non-throwing counterpart of CU_CHECK_THROW: the failure (file:line, the
/// stringified call, and the driver's error name) is written to std::cout
/// and execution continues.
#define CU_CHECK_PRINT(x) \
do { \
CUresult result = x; \
if (result != CUDA_SUCCESS) { \
const char *msg; \
cuGetErrorName(result, &msg); \
std::cout << FILE_LINE " " #x " failed with error " << msg << std::endl; \
} \
} while(0)

Expand All @@ -146,15 +162,15 @@ inline uint32_t powi(uint32_t base, uint32_t exponent) {
do { \
cudaError_t result = x; \
if (result != cudaSuccess) \
throw std::runtime_error(std::string("CUDA Error: " #x " failed with error ") + cudaGetErrorString(result)); \
throw std::runtime_error(std::string(FILE_LINE " " #x " failed with error ") + cudaGetErrorString(result)); \
} while(0)

/// Checks the result of a cudaXXXXXX call and prints an error on failure
#define CUDA_CHECK_PRINT(x) \
do { \
cudaError_t result = x; \
if (result != cudaSuccess) \
std::cout << "CUDA Error: " #x " failed with error " << cudaGetErrorString(result) << std::endl; \
std::cout << FILE_LINE " " #x " failed with error " << cudaGetErrorString(result) << std::endl; \
} while(0)

#if defined(__CUDA_ARCH__)
Expand Down
4 changes: 2 additions & 2 deletions include/tiny-cuda-nn/cutlass_matmul.h
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,7 @@ void fc_multiply_impl(cudaStream_t stream, const typename Gemm::Arguments& args)
Gemm gemm_op;

// Initialize CUTLASS kernel with arguments and workspace pointer
auto workspace = borrow_workspace(stream, workspace_size);
auto workspace = allocate_workspace(stream, workspace_size);
cutlass::Status status = gemm_op.initialize(args, workspace.data(), stream);
CUTLASS_CHECK(status);

Expand All @@ -373,7 +373,7 @@ void fc_multiply_split_k_impl(cudaStream_t stream, const typename Gemm::Argument
Gemm gemm_op;

// Initialize CUTLASS kernel with arguments and workspace pointer
auto workspace = borrow_workspace(stream, workspace_size);
auto workspace = allocate_workspace(stream, workspace_size);
cutlass::Status status = gemm_op.initialize(args, workspace.data());
CUTLASS_CHECK(status);

Expand Down
8 changes: 4 additions & 4 deletions include/tiny-cuda-nn/encodings/grid.h
Original file line number Diff line number Diff line change
Expand Up @@ -616,9 +616,9 @@ class GridEncodingTemplated : public GridEncoding<T> {
const dim3 blocks_hashgrid = { div_round_up(num_elements, N_THREADS_HASHGRID), m_n_levels, 1 };

T* rm_encoded_positions = outputs.ptr;
BorrowedWorkspace workspace;
GPUMemoryArena::Allocation workspace;
if (m_output_layout == CM) {
workspace = borrow_workspace(stream, num_elements * m_n_features * sizeof(T));
workspace = allocate_workspace(stream, num_elements * m_n_features * sizeof(T));
rm_encoded_positions = (T*)workspace.data();
}

Expand Down Expand Up @@ -672,9 +672,9 @@ class GridEncodingTemplated : public GridEncoding<T> {

const T* dL_dy_rm = dL_dy.ptr;

BorrowedWorkspace workspace;
GPUMemoryArena::Allocation workspace;
if (m_output_layout == CM) {
workspace = borrow_workspace(stream, num_elements * m_n_features * sizeof(T));
workspace = allocate_workspace(stream, num_elements * m_n_features * sizeof(T));

// Transpose dL_dy. Use the buffer previously occupied by the encoded positions
const dim3 threads_transpose = { m_n_levels * N_FEATURES_PER_LEVEL, 8, 1 };
Expand Down
113 changes: 96 additions & 17 deletions include/tiny-cuda-nn/gpu_matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,32 +82,78 @@ class GPUMatrixBase {

template <typename T, MatrixLayout layout>
static void allocate_shared_memory(GPUMemory<char>& memory, std::vector<GPUMatrix<T, layout>>& matrices);

// Packs all given matrices into a single workspace allocation on `stream`:
// sums their byte sizes, performs one allocation, then points each matrix
// at its consecutive slice of the buffer. The returned Allocation owns the
// memory; callers should keep it alive while the matrices' data is in use.
static GPUMemoryArena::Allocation allocate_shared_memory(cudaStream_t stream, const std::vector<GPUMatrixBase*>& matrices) {
	size_t n_bytes_total = 0;
	for (auto* m : matrices) {
		n_bytes_total += m->n_bytes();
	}

	auto allocation = allocate_workspace(stream, n_bytes_total);

	size_t cursor = 0;
	for (auto* m : matrices) {
		m->set_data(allocation.data() + cursor);
		cursor += m->n_bytes();
	}

	return allocation;
}

template <typename T>
static GPUMemoryArena::Allocation allocate_shared_memory(cudaStream_t stream, std::vector<GPUMatrixDynamic<T>>& matrices);

template <typename T, MatrixLayout layout>
static GPUMemoryArena::Allocation allocate_shared_memory(cudaStream_t stream, std::vector<GPUMatrix<T, layout>>& matrices);
};

template <typename T>
class GPUMatrixDynamic : public GPUMatrixBase {
public:
using Type = T;

// Owning its memory
// Owning its memory as a GPUMemory<T>
// Constructs an m x n matrix backed by its own GPUMemory<T> buffer.
// The element count is widened to size_t before multiplying; computing
// m * n in 32-bit arithmetic would silently overflow for matrices with
// 2^32 or more elements.
GPUMatrixDynamic(uint32_t m, uint32_t n, MatrixLayout layout = CM)
: m_owned_data{(size_t)m * n}, m_rows{m}, m_cols{n}, m_layout{layout} {
	m_data = m_owned_data.data();
}

// Owning its memory as an allocation from a stream's memory arena
// Constructs an m x n matrix whose storage comes from the given stream's
// memory arena (held via m_arena_data). The byte count is computed in
// size_t so that m * n cannot overflow 32-bit arithmetic before the
// sizeof(T) multiplication widens the expression.
GPUMatrixDynamic(uint32_t m, uint32_t n, cudaStream_t stream, MatrixLayout layout = CM)
: m_arena_data{allocate_workspace(stream, (size_t)m * n * sizeof(T))}, m_rows{m}, m_cols{n}, m_layout{layout} {
	m_data = (T*)m_arena_data.data();
}

// Pointing to external memory
explicit GPUMatrixDynamic(T* data, uint32_t m, uint32_t n, MatrixLayout layout = CM)
: m_data{data}, m_rows{m}, m_cols{n}, m_layout{layout} {
}
: m_data{data}, m_rows{m}, m_cols{n}, m_layout{layout} {}

GPUMatrixDynamic() : GPUMatrixDynamic(nullptr, 0, 0) {}

GPUMatrixDynamic(GPUMatrixDynamic<T>&& other) : m_data{other.m_data}, m_rows{other.m_rows}, m_cols{other.m_cols}, m_layout{other.m_layout}, m_owned_data{std::move(other.m_owned_data)} { }
// Swap-based move assignment: exchanges all state (data pointer, shape,
// layout, and both ownership handles) with `other`. The moved-from object
// receives this object's previous state and remains safely destructible.
GPUMatrixDynamic<T>& operator=(GPUMatrixDynamic<T>&& other) {
std::swap(m_data, other.m_data);
std::swap(m_rows, other.m_rows);
std::swap(m_cols, other.m_cols);
std::swap(m_layout, other.m_layout);
std::swap(m_owned_data, other.m_owned_data);
// Also swap the arena allocation so arena-backed matrices move correctly.
std::swap(m_arena_data, other.m_arena_data);
return *this;
}

// Move construction delegates to the swap-based move assignment.
// NOTE(review): the scalar members (m_data, m_rows, m_cols, m_layout) are
// not value-initialized before the swap, so the moved-from `other` may end
// up holding indeterminate values — confirm moved-from matrices are only
// ever destroyed or reassigned.
GPUMatrixDynamic(GPUMatrixDynamic<T>&& other) {
*this = std::move(other);
}

// Copy constructor. Shape and layout are always copied; the storage policy
// mirrors the source:
// - owned GPUMemory<T> is deep-copied via GPUMemory::copy(),
// - an arena-backed source gets a fresh workspace on the source
//   allocation's stream plus an async device-to-device copy,
// - a matrix merely pointing at external memory aliases the same pointer.
// NOTE(review): the cudaMemcpyAsync is ordered on the source's stream;
// readers on other streams must synchronize — confirm callers do.
explicit GPUMatrixDynamic(const GPUMatrixDynamic<T>& other) : m_data{other.m_data}, m_rows{other.m_rows}, m_cols{other.m_cols}, m_layout{other.m_layout}, m_owned_data{other.m_owned_data.copy()} {
// If we just copied over some owned data, then we want to point to our copy
if (m_owned_data.data()) {
m_data = m_owned_data.data();
}

if (other.m_arena_data.data()) {
m_arena_data = allocate_workspace(other.m_arena_data.stream(), n_bytes());
m_data = (T*)m_arena_data.data();
CUDA_CHECK_THROW(cudaMemcpyAsync(data(), other.data(), n_bytes(), cudaMemcpyDeviceToDevice, m_arena_data.stream()));
}
}

virtual ~GPUMatrixDynamic() {}
Expand All @@ -118,6 +164,11 @@ class GPUMatrixDynamic : public GPUMatrixBase {
m_cols = cols;
}

// Convenience: re-point this matrix at external memory `data` and resize it
// to rows x cols in one call (combines set_data() and set_size()). Does not
// release any previously owned storage by itself — presumably set_data()
// only reassigns the pointer; verify against its definition.
void set(T* data, uint32_t rows, uint32_t cols) {
set_data(data);
set_size(rows, cols);
}

uint32_t rows() const { return m_rows; }
uint32_t fan_out() const { return m_rows; }
uint32_t m() const { return m_rows; }
Expand All @@ -136,11 +187,11 @@ class GPUMatrixDynamic : public GPUMatrixBase {
const T* data() const { return m_data; }

void memset(int value) {
CUDA_CHECK_THROW(cudaMemset(m_data, value, n_elements() * sizeof(T)));
CUDA_CHECK_THROW(cudaMemset(data(), value, n_bytes()));
}

void memset_async(cudaStream_t stream, int value) {
CUDA_CHECK_THROW(cudaMemsetAsync(m_data, value, n_elements() * sizeof(T), stream));
CUDA_CHECK_THROW(cudaMemsetAsync(data(), value, n_bytes(), stream));
}

// Various initializations
Expand All @@ -159,7 +210,7 @@ class GPUMatrixDynamic : public GPUMatrixBase {
new_data[i] = (T)(rnd.next_float() * 2.0f * scale - scale);
}

CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_elements() * sizeof(T), cudaMemcpyHostToDevice));
CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice));
}

void initialize_fa_uniform_forward(pcg32& rnd, float scale = 1) {
Expand All @@ -177,7 +228,7 @@ class GPUMatrixDynamic : public GPUMatrixBase {
new_data[i] = (T)(rnd.next_float() * 2.0f * scale - scale);
}

CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_elements() * sizeof(T), cudaMemcpyHostToDevice));
CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice));
}

void initialize_fa_uniform_backward(pcg32& rnd, float scale = 1) {
Expand All @@ -195,7 +246,7 @@ class GPUMatrixDynamic : public GPUMatrixBase {
new_data[i] = (T)(rnd.next_float() * 2.0f * scale - scale);
}

CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_elements() * sizeof(T), cudaMemcpyHostToDevice));
CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice));
}

void initialize_siren_uniform(pcg32& rnd, float scale = 1) {
Expand All @@ -213,7 +264,7 @@ class GPUMatrixDynamic : public GPUMatrixBase {
new_data[i] = (T)(rnd.next_float() * 2.0f * scale - scale);
}

CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_elements() * sizeof(T), cudaMemcpyHostToDevice));
CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice));
}

void initialize_siren_uniform_first(pcg32& rnd, float scale = 1) {
Expand All @@ -233,7 +284,7 @@ class GPUMatrixDynamic : public GPUMatrixBase {
new_data[i] = (T)(rnd.next_float() * 2.0f * scale - scale);
}

CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_elements() * sizeof(T), cudaMemcpyHostToDevice));
CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice));
}

void initialize_constant(float val) {
Expand All @@ -242,7 +293,7 @@ class GPUMatrixDynamic : public GPUMatrixBase {
}

std::vector<T> new_data(n_elements(), (T)val);
CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_elements() * sizeof(T), cudaMemcpyHostToDevice));
CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice));
}

void initialize_diagonal(float val = 1) {
Expand All @@ -259,7 +310,7 @@ class GPUMatrixDynamic : public GPUMatrixBase {
new_data[i + i*n()] = (T)val;
}

CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_elements() * sizeof(T), cudaMemcpyHostToDevice));
CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice));
}

const GPUMatrixDynamic<T> transposed() const {
Expand Down Expand Up @@ -287,6 +338,7 @@ class GPUMatrixDynamic : public GPUMatrixBase {
uint32_t m_rows, m_cols;
MatrixLayout m_layout;
GPUMemory<T> m_owned_data;
GPUMemoryArena::Allocation m_arena_data;
};

template <typename T, MatrixLayout _layout = MatrixLayout::ColumnMajor>
Expand All @@ -295,21 +347,30 @@ class GPUMatrix : public GPUMatrixDynamic<T> {
static const MatrixLayout static_layout = _layout;
static const MatrixLayout static_transposed_layout = _layout == RM ? CM : RM;

// Owning its memory
// Owning its memory as a GPUMemory<T>
GPUMatrix(uint32_t m, uint32_t n)
: GPUMatrixDynamic<T>{m, n, static_layout} { }

// Owning its memory as an allocation from a stream's memory arena
GPUMatrix(uint32_t m, uint32_t n, cudaStream_t stream)
: GPUMatrixDynamic<T>{m, n, stream, static_layout} { }

// Pointing to external memory
explicit GPUMatrix(T* data, uint32_t m, uint32_t n)
: GPUMatrixDynamic<T>{data, m, n, static_layout} { }

GPUMatrix() : GPUMatrix(nullptr, 0, 0) {}

GPUMatrix(GPUMatrixDynamic<T>&& other)
: GPUMatrixDynamic<T>{other} {
// Move-assigns from a dynamic-layout matrix, enforcing that its runtime
// layout matches this class's static layout.
// The layout is validated *before* moving: the base-class move assignment
// swaps state, so the previous order (move, then check this->layout())
// threw only after both operands had already been modified. Checking
// other.layout() up front tests the identical condition (the swap would
// have made this->layout() equal other's pre-move layout) while providing
// the strong exception guarantee.
GPUMatrix<T>& operator=(GPUMatrixDynamic<T>&& other) {
	if (static_layout != other.layout()) {
		throw std::runtime_error{"GPUMatrix must be constructed from a GPUMatrixDynamic with matching layout."};
	}
	*((GPUMatrixDynamic<T>*)this) = std::move(other);
	return *this;
}

// Move construction delegates to the checked move assignment.
GPUMatrix(GPUMatrixDynamic<T>&& other) {
	*this = std::move(other);
}

// Only copy by reference. This is to prevent accidental deep copies of owned data.
Expand Down Expand Up @@ -361,4 +422,22 @@ void GPUMatrixBase::allocate_shared_memory(GPUMemory<char>& memory, std::vector<
allocate_shared_memory(memory, matrix_pointers);
}

// Typed convenience overload: gathers base-class pointers to each matrix
// and forwards to the type-erased pointer-vector overload.
template <typename T>
GPUMemoryArena::Allocation GPUMatrixBase::allocate_shared_memory(cudaStream_t stream, std::vector<GPUMatrixDynamic<T>>& matrices) {
	std::vector<GPUMatrixBase*> ptrs;
	ptrs.reserve(matrices.size());
	for (size_t i = 0; i < matrices.size(); ++i) {
		ptrs.push_back(&matrices[i]);
	}
	return allocate_shared_memory(stream, ptrs);
}

// Typed convenience overload for statically-laid-out matrices: gathers
// base-class pointers and forwards to the type-erased pointer-vector
// overload.
template <typename T, MatrixLayout layout>
GPUMemoryArena::Allocation GPUMatrixBase::allocate_shared_memory(cudaStream_t stream, std::vector<GPUMatrix<T, layout>>& matrices) {
	std::vector<GPUMatrixBase*> ptrs;
	ptrs.reserve(matrices.size());
	for (size_t i = 0; i < matrices.size(); ++i) {
		ptrs.push_back(&matrices[i]);
	}
	return allocate_shared_memory(stream, ptrs);
}

TCNN_NAMESPACE_END
Loading

0 comments on commit d3729d2

Please sign in to comment.