support for CUDA streams

dusty-nv · Jun 16, 2024 · 4152d8a · 4152d8a
1 parent 72c4e09
commit 4152d8a
Show file tree

Hide file tree

Showing 60 changed files with 1,709 additions and 1,100 deletions.
diff --git a/camera/gstCamera.cpp b/camera/gstCamera.cpp
@@ -712,7 +712,7 @@ void gstCamera::checkBuffer()
 
 
 // Capture
-bool gstCamera::Capture( void** output, imageFormat format, uint64_t timeout, int* status )
+bool gstCamera::Capture( void** output, imageFormat format, uint64_t timeout, int* status, cudaStream_t stream )
 {
 	// verify the output pointer exists
 	if( !output )
@@ -726,7 +726,7 @@ bool gstCamera::Capture( void** output, imageFormat format, uint64_t timeout, in
 	}
 
 	// wait until a new frame is recieved
-	const int result = mBufferManager->Dequeue(output, format, timeout);
+	const int result = mBufferManager->Dequeue(output, format, timeout, stream);
 
 	if( result < 0 )
 	{
@@ -746,10 +746,10 @@ bool gstCamera::Capture( void** output, imageFormat format, uint64_t timeout, in
 }
 
 // CaptureRGBA
-bool gstCamera::CaptureRGBA( float** output, unsigned long timeout, bool zeroCopy )
+bool gstCamera::CaptureRGBA( float** output, unsigned long timeout, bool zeroCopy, cudaStream_t stream )
 {
 	mOptions.zeroCopy = zeroCopy;
-	return Capture((void**)output, IMAGE_RGBA32F, timeout);
+	return Capture((void**)output, IMAGE_RGBA32F, timeout, NULL, stream);
 }
 
 

diff --git a/camera/gstCamera.h b/camera/gstCamera.h
@@ -135,7 +135,7 @@ class gstCamera : public videoSource
 	 * Capture the next image frame from the camera.
 	 * @see videoSource::Capture
 	 */
-	virtual bool Capture( void** image, imageFormat format, uint64_t timeout=DEFAULT_TIMEOUT, int* status=NULL );
+	virtual bool Capture( void** image, imageFormat format, uint64_t timeout=DEFAULT_TIMEOUT, int* status=NULL, cudaStream_t stream=0 );
 
 	/**
 	 * Capture the next image frame from the camera and convert it to float4 RGBA format,
@@ -170,7 +170,7 @@ class gstCamera : public videoSource
 	 * @returns `true` if a frame was successfully captured, otherwise `false` if a timeout
 	 *               or error occurred, or if timeout was 0 and a frame wasn't ready.
 	 */
-	bool CaptureRGBA( float** image, uint64_t timeout=DEFAULT_TIMEOUT, bool zeroCopy=false );
+	bool CaptureRGBA( float** image, uint64_t timeout=DEFAULT_TIMEOUT, bool zeroCopy=false, cudaStream_t stream=0 );
 
 	/**
 	 * Set whether converted RGB(A) images should use ZeroCopy buffer allocation.

diff --git a/codec/gstBufferManager.cpp b/codec/gstBufferManager.cpp
@@ -257,32 +257,32 @@ bool gstBufferManager::Enqueue( GstBuffer* gstBuffer, GstCaps* gstCaps )
 		mBufferYUV.Next(RingBuffer::Write);
 	}
 
-		// handle timestamps in either case (CPU or NVMM path)
-		size_t timestamp_size = sizeof(uint64_t);
+	// handle timestamps in either case (CPU or NVMM path)
+	size_t timestamp_size = sizeof(uint64_t);
 
-		// allocate timestamp ringbuffer (GPU only if not ZeroCopy)
-		if( !mTimestamps.Alloc(mOptions->numBuffers, timestamp_size, RingBuffer::ZeroCopy) )
-		{
-			LogError(LOG_GSTREAMER "gstBufferManager -- failed to allocate %u timestamp buffers (%zu bytes each)\n", mOptions->numBuffers, timestamp_size);
-			return false;
-		}
+	// allocate timestamp ringbuffer (GPU only if not ZeroCopy)
+	if( !mTimestamps.Alloc(mOptions->numBuffers, timestamp_size, RingBuffer::ZeroCopy) )
+	{
+		LogError(LOG_GSTREAMER "gstBufferManager -- failed to allocate %u timestamp buffers (%zu bytes each)\n", mOptions->numBuffers, timestamp_size);
+		return false;
+	}
 
-		// copy to next timestamp ringbuffer
-		void* nextTimestamp = mTimestamps.Peek(RingBuffer::Write);
+	// copy to next timestamp ringbuffer
+	void* nextTimestamp = mTimestamps.Peek(RingBuffer::Write);
 
-		if( !nextTimestamp )
-		{
-			LogError(LOG_GSTREAMER "gstBufferManager -- failed to retrieve next timestamp ringbuffer for writing\n");
-			return false;
-		}
+	if( !nextTimestamp )
+	{
+		LogError(LOG_GSTREAMER "gstBufferManager -- failed to retrieve next timestamp ringbuffer for writing\n");
+		return false;
+	}
 
-		if (GST_BUFFER_DTS_IS_VALID(gstBuffer) || GST_BUFFER_PTS_IS_VALID(gstBuffer))
-		{
-			timestamp = GST_BUFFER_DTS_OR_PTS(gstBuffer);
-		}
+	if( GST_BUFFER_DTS_IS_VALID(gstBuffer) || GST_BUFFER_PTS_IS_VALID(gstBuffer) )
+	{
+		timestamp = GST_BUFFER_DTS_OR_PTS(gstBuffer);
+	}
 
-		memcpy(nextTimestamp, (void*)&timestamp, timestamp_size);
-		mTimestamps.Next(RingBuffer::Write);
+	memcpy(nextTimestamp, (void*)&timestamp, timestamp_size);
+	mTimestamps.Next(RingBuffer::Write);
 
 	mWaitEvent.Wake();
 	mFrameCount++;
@@ -296,7 +296,7 @@ bool gstBufferManager::Enqueue( GstBuffer* gstBuffer, GstCaps* gstCaps )
 
 
 // Dequeue
-int gstBufferManager::Dequeue( void** output, imageFormat format, uint64_t timeout )
+int gstBufferManager::Dequeue( void** output, imageFormat format, uint64_t timeout, cudaStream_t stream )
 {
 	// wait until a new frame is recieved
 	if( !mWaitEvent.Wait(timeout) )
@@ -442,7 +442,7 @@ int gstBufferManager::Dequeue( void** output, imageFormat format, uint64_t timeo
 	// perform colorspace conversion
 	void* nextRGB = mBufferRGB.Next(RingBuffer::Write);
 
-	if( CUDA_FAILED(cudaConvertColor(latestYUV, mFormatYUV, nextRGB, format, mOptions->width, mOptions->height)) )
+	if( CUDA_FAILED(cudaConvertColor(latestYUV, mFormatYUV, nextRGB, format, mOptions->width, mOptions->height, stream)) )
 	{
 		LogError(LOG_GSTREAMER "gstBufferManager -- unsupported image format (%s)\n", imageFormatToStr(format));
 		LogError(LOG_GSTREAMER "                    supported formats are:\n");

diff --git a/codec/gstBufferManager.h b/codec/gstBufferManager.h
@@ -79,7 +79,7 @@ class gstBufferManager
 	/**
 	 * Dequeue the next frame.  Returns 1 on success, 0 on timeout, -1 on error.
 	 */
-	int Dequeue( void** output, imageFormat format, uint64_t timeout=UINT64_MAX );
+	int Dequeue( void** output, imageFormat format, uint64_t timeout=UINT64_MAX, cudaStream_t stream=0 );
 
 	/**
 	 * Get timestamp of the latest dequeued frame.

diff --git a/codec/gstDecoder.cpp b/codec/gstDecoder.cpp
@@ -889,7 +889,7 @@ void gstDecoder::checkBuffer()
 
 
 // Capture
-bool gstDecoder::Capture( void** output, imageFormat format, uint64_t timeout, int* status )
+bool gstDecoder::Capture( void** output, imageFormat format, uint64_t timeout, int* status, cudaStream_t stream )
 {
 	// update the webrtc server if needed
 	if( mWebRTCServer != NULL && !mWebRTCServer->IsThreaded() )
@@ -907,7 +907,7 @@ bool gstDecoder::Capture( void** output, imageFormat format, uint64_t timeout, i
 	}
 
 	// wait until a new frame is recieved
-	const int result = mBufferManager->Dequeue(output, format, timeout);
+	const int result = mBufferManager->Dequeue(output, format, timeout, stream);
 
 	if( result < 0 )
 	{

diff --git a/codec/gstDecoder.h b/codec/gstDecoder.h
@@ -71,7 +71,7 @@ class gstDecoder : public videoSource
 	 * Capture the next decoded frame.
 	 * @see videoSource::Capture()
 	 */
-	virtual bool Capture( void** image, imageFormat format, uint64_t timeout=DEFAULT_TIMEOUT, int* status=NULL );
+	virtual bool Capture( void** image, imageFormat format, uint64_t timeout=DEFAULT_TIMEOUT, int* status=NULL, cudaStream_t stream=0 );
 
 	/**
 	 * Open the stream.

diff --git a/codec/gstEncoder.cpp b/codec/gstEncoder.cpp
@@ -625,7 +625,7 @@ bool gstEncoder::encodeYUV( void* buffer, size_t size )
 
 
 // Render
-bool gstEncoder::Render( void* image, uint32_t width, uint32_t height, imageFormat format )
+bool gstEncoder::Render( void* image, uint32_t width, uint32_t height, imageFormat format, cudaStream_t stream )
 {	
 	// update the webrtc server if needed
 	if( mWebRTCServer != NULL && !mWebRTCServer->IsThreaded() )
@@ -701,7 +701,7 @@ bool gstEncoder::Render( void* image, uint32_t width, uint32_t height, imageForm
 	// perform colorspace conversion
 	void* nextYUV = mBufferYUV.Next(RingBuffer::Write);
 
-	if( CUDA_FAILED(cudaConvertColor(image, format, nextYUV, IMAGE_I420, width, height)) )
+	if( CUDA_FAILED(cudaConvertColor(image, format, nextYUV, IMAGE_I420, width, height, stream)) )
 	{
 		LogError(LOG_GSTREAMER "gstEncoder::Render() -- unsupported image format (%s)\n", imageFormatToStr(format));
 		LogError(LOG_GSTREAMER "                        supported formats are:\n");
@@ -714,7 +714,10 @@ bool gstEncoder::Render( void* image, uint32_t width, uint32_t height, imageForm
 		render_end();
 	}
 
-	CUDA(cudaDeviceSynchronize());	// TODO replace with cudaStream?
+    if( stream != 0 )
+        CUDA(cudaStreamSynchronize(stream));
+    else
+	    CUDA(cudaDeviceSynchronize());
 
 	// encode YUV buffer
 	enc_success = encodeYUV(nextYUV, i420Size);

diff --git a/codec/gstEncoder.h b/codec/gstEncoder.h
@@ -70,13 +70,13 @@ class gstEncoder : public videoOutput
 	 * Encode the next frame.
 	 * @see videoOutput::Render()
 	 */
-	template<typename T> bool Render( T* image, uint32_t width, uint32_t height )		{ return Render((void**)image, width, height, imageFormatFromType<T>()); }
+	template<typename T> bool Render( T* image, uint32_t width, uint32_t height, cudaStream_t stream=0 )		{ return Render((void**)image, width, height, imageFormatFromType<T>(), stream); }
 
 	/**
 	 * Encode the next frame.
 	 * @see videoOutput::Render()
 	 */
-	virtual bool Render( void* image, uint32_t width, uint32_t height, imageFormat format );
+	virtual bool Render( void* image, uint32_t width, uint32_t height, imageFormat format, cudaStream_t stream=0 );
 
 	/**
 	 * Open the stream.

diff --git a/cuda/cudaBayer.cpp b/cuda/cudaBayer.cpp
@@ -23,11 +23,12 @@
 #include "cudaBayer.h"
 #include "logging.h"
 
+#include <npp.h>
 #include <nppi.h>
 
 
 // cudaBayerToRGB
-cudaError_t cudaBayerToRGB( uint8_t* input, uchar3* output, size_t width, size_t height, imageFormat format )
+cudaError_t cudaBayerToRGB( uint8_t* input, uchar3* output, size_t width, size_t height, imageFormat format, cudaStream_t stream )
 {
 	NppiSize size;
 	size.width = width;
@@ -52,20 +53,25 @@ cudaError_t cudaBayerToRGB( uint8_t* input, uchar3* output, size_t width, size_t
 	else
 		return cudaErrorInvalidValue;
 
-	const NppStatus result = nppiCFAToRGB_8u_C1C3R(input, width * sizeof(uint8_t), size, roi, 
-												   (uint8_t*)output, width * sizeof(uchar3),
-												   grid, NPPI_INTER_UNDEFINED);
+	NppStreamContext nppStreamContext;
+	nppGetStreamContext(&nppStreamContext);
+	nppStreamContext.hStream = stream;
+
+	const NppStatus result = nppiCFAToRGB_8u_C1C3R_Ctx(input, width * sizeof(uint8_t), size, roi, 
+												       (uint8_t*)output, width * sizeof(uchar3),
+												       grid, NPPI_INTER_UNDEFINED, nppStreamContext);
 
 	if( result != 0 )
 	{
 		LogError(LOG_CUDA "cudaBayerToRGB() NPP error %i\n", result);
 		return cudaErrorUnknown;
 	}
+
 	return cudaSuccess;
 }
 
 
-cudaError_t cudaBayerToRGBA( uint8_t* input, uchar3* output, size_t width, size_t height, imageFormat format )
+cudaError_t cudaBayerToRGBA( uint8_t* input, uchar3* output, size_t width, size_t height, imageFormat format, cudaStream_t stream )
 {
 	return cudaErrorInvalidValue;
 

diff --git a/cuda/cudaBayer.h b/cuda/cudaBayer.h
@@ -41,14 +41,14 @@
  * @params format the Bayer pattern of the input image, should be one of: 	
  *                IMAGE_BAYER_BGGR, IMAGE_BAYER_GBRG, IMAGE_BAYER_GRBG, IMAGE_BAYER_RGGB
  */
-cudaError_t cudaBayerToRGB( uint8_t* input, uchar3* output, size_t width, size_t height, imageFormat format );
+cudaError_t cudaBayerToRGB( uint8_t* input, uchar3* output, size_t width, size_t height, imageFormat format, cudaStream_t stream=0 );
 
 /**
  * Demosaick an 8-bit Bayer image to uchar4 RGBA.
  * @params format the Bayer pattern of the input image, should be one of: 	
  *                IMAGE_BAYER_BGGR, IMAGE_BAYER_GBRG, IMAGE_BAYER_GRBG, IMAGE_BAYER_RGGB
  */
-cudaError_t cudaBayerToRGBA( uint8_t* input, uchar3* output, size_t width, size_t height, imageFormat format );
+cudaError_t cudaBayerToRGBA( uint8_t* input, uchar3* output, size_t width, size_t height, imageFormat format, cudaStream_t stream=0 );
 
 ///@}
 

diff --git a/cuda/cudaColormap.h b/cuda/cudaColormap.h
@@ -88,7 +88,7 @@ cudaError_t cudaColormap( float* input, T* output,
 					 const float2& input_range=make_float2(0,255),
 					 cudaDataFormat input_format=FORMAT_DEFAULT,
 					 cudaColormapType colormap=COLORMAP_DEFAULT,
-					 cudaStream_t stream=NULL)					{ return cudaColormap(input, (void*)output, width, height, input_range, input_format, imageFormatFromType<T>(), colormap, stream); }
+					 cudaStream_t stream=0 )					{ return cudaColormap(input, (void*)output, width, height, input_range, input_format, imageFormatFromType<T>(), colormap, stream); }
 
 /**
  * Apply a colormap from an input image or vector field to RGB/RGBA.
@@ -104,8 +104,8 @@ cudaError_t cudaColormap( float* input, void* output,
 					 const float2& input_range=make_float2(0,255),
 					 cudaDataFormat input_format=FORMAT_DEFAULT,
 					 imageFormat output_format=IMAGE_UNKNOWN,
-                          cudaColormapType colormap=COLORMAP_DEFAULT,
-					 cudaStream_t stream=NULL);
+                     cudaColormapType colormap=COLORMAP_DEFAULT,
+					 cudaStream_t stream=0 );
 
 /**
  * Apply a colormap from an input image or vector field to RGB/RGBA.
@@ -124,7 +124,7 @@ cudaError_t cudaColormap( float* input, size_t input_width, size_t input_height,
 					 cudaDataFormat input_format=FORMAT_DEFAULT,
                           cudaColormapType colormap=COLORMAP_DEFAULT,
 					 cudaFilterMode filter=FILTER_LINEAR,
-					 cudaStream_t stream=NULL )					{ return cudaColormap(input, input_width, input_height, output, output_width, output_height, input_range, input_format, imageFormatFromType<T>(), colormap, filter, stream); }
+					 cudaStream_t stream=0 )					{ return cudaColormap(input, input_width, input_height, output, output_width, output_height, input_range, input_format, imageFormatFromType<T>(), colormap, filter, stream); }
 
 /**
  * Apply a colormap from an input image or vector field to RGB/RGBA.
@@ -143,7 +143,7 @@ cudaError_t cudaColormap( float* input, size_t input_width, size_t input_height,
 					 imageFormat output_format=IMAGE_UNKNOWN,
                           cudaColormapType colormap=COLORMAP_DEFAULT,
 					 cudaFilterMode filter=FILTER_LINEAR,
-					 cudaStream_t stream=NULL );
+					 cudaStream_t stream=0 );
 
 /**
  * Initialize the colormap palettes by allocating them in CUDA memory.