Skip to content

Commit

Permalink
Add GPU-memory arena for efficient, reusable temporary allocations
Browse files Browse the repository at this point in the history
  • Loading branch information
Tom94 committed Feb 10, 2022
1 parent 57fcd33 commit d3729d2
Show file tree
Hide file tree
Showing 15 changed files with 562 additions and 397 deletions.
22 changes: 19 additions & 3 deletions include/tiny-cuda-nn/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ enum class Activation {
//////////////////

uint32_t cuda_compute_capability(int device = 0);
size_t cuda_memory_granularity(int device = 0);

std::string to_lower(std::string str);
std::string to_upper(std::string str);
Expand Down Expand Up @@ -130,14 +131,29 @@ inline uint32_t powi(uint32_t base, uint32_t exponent) {
// CUDA ERROR HANDLING (EXCEPTIONS) //
//////////////////////////////////////

// Stringification helpers. STR() forces macro expansion of its argument
// before STRINGIFY() turns it into a string literal, so FILE_LINE yields
// e.g. "common.h:130" (the expanded __LINE__, not the literal text) for
// use in error messages.
#define STRINGIFY(x) #x
#define STR(x) STRINGIFY(x)
#define FILE_LINE __FILE__ ":" STR(__LINE__)

/// Checks the result of a cuXXXXXX call and throws an error on failure
#define CU_CHECK_THROW(x) \
do { \
CUresult result = x; \
if (result != CUDA_SUCCESS) { \
const char *msg; \
cuGetErrorName(result, &msg); \
throw std::runtime_error(std::string("CUDA Error: " #x " failed with error ") + msg); \
throw std::runtime_error(std::string(FILE_LINE " " #x " failed with error ") + msg); \
} \
} while(0)

/// Checks the result of a cuXXXXXX call and prints an error on failure.
/// Non-throwing counterpart of CU_CHECK_THROW: the failure (file:line, the
/// stringified call, and the driver's error name) is written to std::cout
/// and execution continues.
#define CU_CHECK_PRINT(x) \
do { \
CUresult result = x; \
if (result != CUDA_SUCCESS) { \
const char *msg; \
cuGetErrorName(result, &msg); \
std::cout << FILE_LINE " " #x " failed with error " << msg << std::endl; \
} \
} while(0)

Expand All @@ -146,15 +162,15 @@ inline uint32_t powi(uint32_t base, uint32_t exponent) {
do { \
cudaError_t result = x; \
if (result != cudaSuccess) \
throw std::runtime_error(std::string("CUDA Error: " #x " failed with error ") + cudaGetErrorString(result)); \
throw std::runtime_error(std::string(FILE_LINE " " #x " failed with error ") + cudaGetErrorString(result)); \
} while(0)

/// Checks the result of a cudaXXXXXX call and prints an error on failure
#define CUDA_CHECK_PRINT(x) \
do { \
cudaError_t result = x; \
if (result != cudaSuccess) \
std::cout << "CUDA Error: " #x " failed with error " << cudaGetErrorString(result) << std::endl; \
std::cout << FILE_LINE " " #x " failed with error " << cudaGetErrorString(result) << std::endl; \
} while(0)

#if defined(__CUDA_ARCH__)
Expand Down
4 changes: 2 additions & 2 deletions include/tiny-cuda-nn/cutlass_matmul.h
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,7 @@ void fc_multiply_impl(cudaStream_t stream, const typename Gemm::Arguments& args)
Gemm gemm_op;

// Initialize CUTLASS kernel with arguments and workspace pointer
auto workspace = borrow_workspace(stream, workspace_size);
auto workspace = allocate_workspace(stream, workspace_size);
cutlass::Status status = gemm_op.initialize(args, workspace.data(), stream);
CUTLASS_CHECK(status);

Expand All @@ -373,7 +373,7 @@ void fc_multiply_split_k_impl(cudaStream_t stream, const typename Gemm::Argument
Gemm gemm_op;

// Initialize CUTLASS kernel with arguments and workspace pointer
auto workspace = borrow_workspace(stream, workspace_size);
auto workspace = allocate_workspace(stream, workspace_size);
cutlass::Status status = gemm_op.initialize(args, workspace.data());
CUTLASS_CHECK(status);

Expand Down
8 changes: 4 additions & 4 deletions include/tiny-cuda-nn/encodings/grid.h
Original file line number Diff line number Diff line change
Expand Up @@ -616,9 +616,9 @@ class GridEncodingTemplated : public GridEncoding<T> {
const dim3 blocks_hashgrid = { div_round_up(num_elements, N_THREADS_HASHGRID), m_n_levels, 1 };

T* rm_encoded_positions = outputs.ptr;
BorrowedWorkspace workspace;
GPUMemoryArena::Allocation workspace;
if (m_output_layout == CM) {
workspace = borrow_workspace(stream, num_elements * m_n_features * sizeof(T));
workspace = allocate_workspace(stream, num_elements * m_n_features * sizeof(T));
rm_encoded_positions = (T*)workspace.data();
}

Expand Down Expand Up @@ -672,9 +672,9 @@ class GridEncodingTemplated : public GridEncoding<T> {

const T* dL_dy_rm = dL_dy.ptr;

BorrowedWorkspace workspace;
GPUMemoryArena::Allocation workspace;
if (m_output_layout == CM) {
workspace = borrow_workspace(stream, num_elements * m_n_features * sizeof(T));
workspace = allocate_workspace(stream, num_elements * m_n_features * sizeof(T));

// Transpose dL_dy. Use the buffer previously occupied by the encoded positions
const dim3 threads_transpose = { m_n_levels * N_FEATURES_PER_LEVEL, 8, 1 };
Expand Down
113 changes: 96 additions & 17 deletions include/tiny-cuda-nn/gpu_matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,32 +82,78 @@ class GPUMatrixBase {

template <typename T, MatrixLayout layout>
static void allocate_shared_memory(GPUMemory<char>& memory, std::vector<GPUMatrix<T, layout>>& matrices);

// Packs all given matrices into a single workspace allocation on `stream`:
// sums their byte sizes, performs one allocation, then points each matrix
// at its consecutive slice of the buffer. The returned Allocation owns the
// memory; callers should keep it alive while the matrices' data is in use.
static GPUMemoryArena::Allocation allocate_shared_memory(cudaStream_t stream, const std::vector<GPUMatrixBase*>& matrices) {
	size_t n_bytes_total = 0;
	for (auto* m : matrices) {
		n_bytes_total += m->n_bytes();
	}

	auto allocation = allocate_workspace(stream, n_bytes_total);

	size_t cursor = 0;
	for (auto* m : matrices) {
		m->set_data(allocation.data() + cursor);
		cursor += m->n_bytes();
	}

	return allocation;
}

template <typename T>
static GPUMemoryArena::Allocation allocate_shared_memory(cudaStream_t stream, std::vector<GPUMatrixDynamic<T>>& matrices);

template <typename T, MatrixLayout layout>
static GPUMemoryArena::Allocation allocate_shared_memory(cudaStream_t stream, std::vector<GPUMatrix<T, layout>>& matrices);
};

template <typename T>
class GPUMatrixDynamic : public GPUMatrixBase {
public:
using Type = T;

// Owning its memory
// Owning its memory as a GPUMemory<T>
// Constructs an m x n matrix backed by its own GPUMemory<T> buffer.
// The element count is widened to size_t before multiplying; computing
// m * n in 32-bit arithmetic would silently overflow for matrices with
// 2^32 or more elements.
GPUMatrixDynamic(uint32_t m, uint32_t n, MatrixLayout layout = CM)
: m_owned_data{(size_t)m * n}, m_rows{m}, m_cols{n}, m_layout{layout} {
	m_data = m_owned_data.data();
}

// Owning its memory as an allocation from a stream's memory arena
// Constructs an m x n matrix whose storage comes from the given stream's
// memory arena (held via m_arena_data). The byte count is computed in
// size_t so that m * n cannot overflow 32-bit arithmetic before the
// sizeof(T) multiplication widens the expression.
GPUMatrixDynamic(uint32_t m, uint32_t n, cudaStream_t stream, MatrixLayout layout = CM)
: m_arena_data{allocate_workspace(stream, (size_t)m * n * sizeof(T))}, m_rows{m}, m_cols{n}, m_layout{layout} {
	m_data = (T*)m_arena_data.data();
}

// Pointing to external memory
explicit GPUMatrixDynamic(T* data, uint32_t m, uint32_t n, MatrixLayout layout = CM)
: m_data{data}, m_rows{m}, m_cols{n}, m_layout{layout} {
}
: m_data{data}, m_rows{m}, m_cols{n}, m_layout{layout} {}

GPUMatrixDynamic() : GPUMatrixDynamic(nullptr, 0, 0) {}

GPUMatrixDynamic(GPUMatrixDynamic<T>&& other) : m_data{other.m_data}, m_rows{other.m_rows}, m_cols{other.m_cols}, m_layout{other.m_layout}, m_owned_data{std::move(other.m_owned_data)} { }
// Swap-based move assignment: exchanges all state (data pointer, shape,
// layout, and both ownership handles) with `other`. The moved-from object
// receives this object's previous state and remains safely destructible.
GPUMatrixDynamic<T>& operator=(GPUMatrixDynamic<T>&& other) {
std::swap(m_data, other.m_data);
std::swap(m_rows, other.m_rows);
std::swap(m_cols, other.m_cols);
std::swap(m_layout, other.m_layout);
std::swap(m_owned_data, other.m_owned_data);
// Also swap the arena allocation so arena-backed matrices move correctly.
std::swap(m_arena_data, other.m_arena_data);
return *this;
}

// Move construction delegates to the swap-based move assignment.
// NOTE(review): the scalar members (m_data, m_rows, m_cols, m_layout) are
// not value-initialized before the swap, so the moved-from `other` may end
// up holding indeterminate values — confirm moved-from matrices are only
// ever destroyed or reassigned.
GPUMatrixDynamic(GPUMatrixDynamic<T>&& other) {
*this = std::move(other);
}

// Copy constructor. Shape and layout are always copied; the storage policy
// mirrors the source:
// - owned GPUMemory<T> is deep-copied via GPUMemory::copy(),
// - an arena-backed source gets a fresh workspace on the source
//   allocation's stream plus an async device-to-device copy,
// - a matrix merely pointing at external memory aliases the same pointer.
// NOTE(review): the cudaMemcpyAsync is ordered on the source's stream;
// readers on other streams must synchronize — confirm callers do.
explicit GPUMatrixDynamic(const GPUMatrixDynamic<T>& other) : m_data{other.m_data}, m_rows{other.m_rows}, m_cols{other.m_cols}, m_layout{other.m_layout}, m_owned_data{other.m_owned_data.copy()} {
// If we just copied over some owned data, then we want to point to our copy
if (m_owned_data.data()) {
m_data = m_owned_data.data();
}

if (other.m_arena_data.data()) {
m_arena_data = allocate_workspace(other.m_arena_data.stream(), n_bytes());
m_data = (T*)m_arena_data.data();
CUDA_CHECK_THROW(cudaMemcpyAsync(data(), other.data(), n_bytes(), cudaMemcpyDeviceToDevice, m_arena_data.stream()));
}
}

virtual ~GPUMatrixDynamic() {}
Expand All @@ -118,6 +164,11 @@ class GPUMatrixDynamic : public GPUMatrixBase {
m_cols = cols;
}

// Convenience: re-point this matrix at external memory `data` and resize it
// to rows x cols in one call (combines set_data() and set_size()). Does not
// release any previously owned storage by itself — presumably set_data()
// only reassigns the pointer; verify against its definition.
void set(T* data, uint32_t rows, uint32_t cols) {
set_data(data);
set_size(rows, cols);
}

uint32_t rows() const { return m_rows; }
uint32_t fan_out() const { return m_rows; }
uint32_t m() const { return m_rows; }
Expand All @@ -136,11 +187,11 @@ class GPUMatrixDynamic : public GPUMatrixBase {
const T* data() const { return m_data; }

void memset(int value) {
CUDA_CHECK_THROW(cudaMemset(m_data, value, n_elements() * sizeof(T)));
CUDA_CHECK_THROW(cudaMemset(data(), value, n_bytes()));
}

void memset_async(cudaStream_t stream, int value) {
CUDA_CHECK_THROW(cudaMemsetAsync(m_data, value, n_elements() * sizeof(T), stream));
CUDA_CHECK_THROW(cudaMemsetAsync(data(), value, n_bytes(), stream));
}

// Various initializations
Expand All @@ -159,7 +210,7 @@ class GPUMatrixDynamic : public GPUMatrixBase {
new_data[i] = (T)(rnd.next_float() * 2.0f * scale - scale);
}

CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_elements() * sizeof(T), cudaMemcpyHostToDevice));
CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice));
}

void initialize_fa_uniform_forward(pcg32& rnd, float scale = 1) {
Expand All @@ -177,7 +228,7 @@ class GPUMatrixDynamic : public GPUMatrixBase {
new_data[i] = (T)(rnd.next_float() * 2.0f * scale - scale);
}

CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_elements() * sizeof(T), cudaMemcpyHostToDevice));
CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice));
}

void initialize_fa_uniform_backward(pcg32& rnd, float scale = 1) {
Expand All @@ -195,7 +246,7 @@ class GPUMatrixDynamic : public GPUMatrixBase {
new_data[i] = (T)(rnd.next_float() * 2.0f * scale - scale);
}

CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_elements() * sizeof(T), cudaMemcpyHostToDevice));
CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice));
}

void initialize_siren_uniform(pcg32& rnd, float scale = 1) {
Expand All @@ -213,7 +264,7 @@ class GPUMatrixDynamic : public GPUMatrixBase {
new_data[i] = (T)(rnd.next_float() * 2.0f * scale - scale);
}

CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_elements() * sizeof(T), cudaMemcpyHostToDevice));
CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice));
}

void initialize_siren_uniform_first(pcg32& rnd, float scale = 1) {
Expand All @@ -233,7 +284,7 @@ class GPUMatrixDynamic : public GPUMatrixBase {
new_data[i] = (T)(rnd.next_float() * 2.0f * scale - scale);
}

CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_elements() * sizeof(T), cudaMemcpyHostToDevice));
CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice));
}

void initialize_constant(float val) {
Expand All @@ -242,7 +293,7 @@ class GPUMatrixDynamic : public GPUMatrixBase {
}

std::vector<T> new_data(n_elements(), (T)val);
CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_elements() * sizeof(T), cudaMemcpyHostToDevice));
CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice));
}

void initialize_diagonal(float val = 1) {
Expand All @@ -259,7 +310,7 @@ class GPUMatrixDynamic : public GPUMatrixBase {
new_data[i + i*n()] = (T)val;
}

CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_elements() * sizeof(T), cudaMemcpyHostToDevice));
CUDA_CHECK_THROW(cudaMemcpy(data(), new_data.data(), n_bytes(), cudaMemcpyHostToDevice));
}

const GPUMatrixDynamic<T> transposed() const {
Expand Down Expand Up @@ -287,6 +338,7 @@ class GPUMatrixDynamic : public GPUMatrixBase {
uint32_t m_rows, m_cols;
MatrixLayout m_layout;
GPUMemory<T> m_owned_data;
GPUMemoryArena::Allocation m_arena_data;
};

template <typename T, MatrixLayout _layout = MatrixLayout::ColumnMajor>
Expand All @@ -295,21 +347,30 @@ class GPUMatrix : public GPUMatrixDynamic<T> {
static const MatrixLayout static_layout = _layout;
static const MatrixLayout static_transposed_layout = _layout == RM ? CM : RM;

// Owning its memory
// Owning its memory as a GPUMemory<T>
GPUMatrix(uint32_t m, uint32_t n)
: GPUMatrixDynamic<T>{m, n, static_layout} { }

// Owning its memory as an allocation from a stream's memory arena
GPUMatrix(uint32_t m, uint32_t n, cudaStream_t stream)
: GPUMatrixDynamic<T>{m, n, stream, static_layout} { }

// Pointing to external memory
explicit GPUMatrix(T* data, uint32_t m, uint32_t n)
: GPUMatrixDynamic<T>{data, m, n, static_layout} { }

GPUMatrix() : GPUMatrix(nullptr, 0, 0) {}

GPUMatrix(GPUMatrixDynamic<T>&& other)
: GPUMatrixDynamic<T>{other} {
// Move-assigns from a dynamic-layout matrix, enforcing that its runtime
// layout matches this class's static layout.
// The layout is validated *before* moving: the base-class move assignment
// swaps state, so the previous order (move, then check this->layout())
// threw only after both operands had already been modified. Checking
// other.layout() up front tests the identical condition (the swap would
// have made this->layout() equal other's pre-move layout) while providing
// the strong exception guarantee.
GPUMatrix<T>& operator=(GPUMatrixDynamic<T>&& other) {
	if (static_layout != other.layout()) {
		throw std::runtime_error{"GPUMatrix must be constructed from a GPUMatrixDynamic with matching layout."};
	}
	*((GPUMatrixDynamic<T>*)this) = std::move(other);
	return *this;
}

// Move construction delegates to the checked move assignment.
GPUMatrix(GPUMatrixDynamic<T>&& other) {
	*this = std::move(other);
}

// Only copy by reference. This is to prevent accidental deep copies of owned data.
Expand Down Expand Up @@ -361,4 +422,22 @@ void GPUMatrixBase::allocate_shared_memory(GPUMemory<char>& memory, std::vector<
allocate_shared_memory(memory, matrix_pointers);
}

// Typed convenience overload: gathers base-class pointers to each matrix
// and forwards to the type-erased pointer-vector overload.
template <typename T>
GPUMemoryArena::Allocation GPUMatrixBase::allocate_shared_memory(cudaStream_t stream, std::vector<GPUMatrixDynamic<T>>& matrices) {
	std::vector<GPUMatrixBase*> ptrs;
	ptrs.reserve(matrices.size());
	for (size_t i = 0; i < matrices.size(); ++i) {
		ptrs.push_back(&matrices[i]);
	}
	return allocate_shared_memory(stream, ptrs);
}

// Typed convenience overload for statically-laid-out matrices: gathers
// base-class pointers and forwards to the type-erased pointer-vector
// overload.
template <typename T, MatrixLayout layout>
GPUMemoryArena::Allocation GPUMatrixBase::allocate_shared_memory(cudaStream_t stream, std::vector<GPUMatrix<T, layout>>& matrices) {
	std::vector<GPUMatrixBase*> ptrs;
	ptrs.reserve(matrices.size());
	for (size_t i = 0; i < matrices.size(); ++i) {
		ptrs.push_back(&matrices[i]);
	}
	return allocate_shared_memory(stream, ptrs);
}

TCNN_NAMESPACE_END
Loading

0 comments on commit d3729d2

Please sign in to comment.