Skip to content

Commit

Permalink
support for CUDA streams
Browse files Browse the repository at this point in the history
  • Loading branch information
dusty-nv committed Jun 16, 2024
1 parent 72c4e09 commit 4152d8a
Show file tree
Hide file tree
Showing 60 changed files with 1,709 additions and 1,100 deletions.
8 changes: 4 additions & 4 deletions camera/gstCamera.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -712,7 +712,7 @@ void gstCamera::checkBuffer()


// Capture
bool gstCamera::Capture( void** output, imageFormat format, uint64_t timeout, int* status )
bool gstCamera::Capture( void** output, imageFormat format, uint64_t timeout, int* status, cudaStream_t stream )
{
// verify the output pointer exists
if( !output )
Expand All @@ -726,7 +726,7 @@ bool gstCamera::Capture( void** output, imageFormat format, uint64_t timeout, in
}

// wait until a new frame is recieved
const int result = mBufferManager->Dequeue(output, format, timeout);
const int result = mBufferManager->Dequeue(output, format, timeout, stream);

if( result < 0 )
{
Expand All @@ -746,10 +746,10 @@ bool gstCamera::Capture( void** output, imageFormat format, uint64_t timeout, in
}

// CaptureRGBA
bool gstCamera::CaptureRGBA( float** output, unsigned long timeout, bool zeroCopy )
bool gstCamera::CaptureRGBA( float** output, unsigned long timeout, bool zeroCopy, cudaStream_t stream )
{
mOptions.zeroCopy = zeroCopy;
return Capture((void**)output, IMAGE_RGBA32F, timeout);
return Capture((void**)output, IMAGE_RGBA32F, timeout, NULL, stream);
}


Expand Down
4 changes: 2 additions & 2 deletions camera/gstCamera.h
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ class gstCamera : public videoSource
* Capture the next image frame from the camera.
* @see videoSource::Capture
*/
virtual bool Capture( void** image, imageFormat format, uint64_t timeout=DEFAULT_TIMEOUT, int* status=NULL );
virtual bool Capture( void** image, imageFormat format, uint64_t timeout=DEFAULT_TIMEOUT, int* status=NULL, cudaStream_t stream=0 );

/**
* Capture the next image frame from the camera and convert it to float4 RGBA format,
Expand Down Expand Up @@ -170,7 +170,7 @@ class gstCamera : public videoSource
* @returns `true` if a frame was successfully captured, otherwise `false` if a timeout
* or error occurred, or if timeout was 0 and a frame wasn't ready.
*/
bool CaptureRGBA( float** image, uint64_t timeout=DEFAULT_TIMEOUT, bool zeroCopy=false );
bool CaptureRGBA( float** image, uint64_t timeout=DEFAULT_TIMEOUT, bool zeroCopy=false, cudaStream_t stream=0 );

/**
* Set whether converted RGB(A) images should use ZeroCopy buffer allocation.
Expand Down
46 changes: 23 additions & 23 deletions codec/gstBufferManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -257,32 +257,32 @@ bool gstBufferManager::Enqueue( GstBuffer* gstBuffer, GstCaps* gstCaps )
mBufferYUV.Next(RingBuffer::Write);
}

// handle timestamps in either case (CPU or NVMM path)
size_t timestamp_size = sizeof(uint64_t);
// handle timestamps in either case (CPU or NVMM path)
size_t timestamp_size = sizeof(uint64_t);

// allocate timestamp ringbuffer (GPU only if not ZeroCopy)
if( !mTimestamps.Alloc(mOptions->numBuffers, timestamp_size, RingBuffer::ZeroCopy) )
{
LogError(LOG_GSTREAMER "gstBufferManager -- failed to allocate %u timestamp buffers (%zu bytes each)\n", mOptions->numBuffers, timestamp_size);
return false;
}
// allocate timestamp ringbuffer (GPU only if not ZeroCopy)
if( !mTimestamps.Alloc(mOptions->numBuffers, timestamp_size, RingBuffer::ZeroCopy) )
{
LogError(LOG_GSTREAMER "gstBufferManager -- failed to allocate %u timestamp buffers (%zu bytes each)\n", mOptions->numBuffers, timestamp_size);
return false;
}

// copy to next timestamp ringbuffer
void* nextTimestamp = mTimestamps.Peek(RingBuffer::Write);
// copy to next timestamp ringbuffer
void* nextTimestamp = mTimestamps.Peek(RingBuffer::Write);

if( !nextTimestamp )
{
LogError(LOG_GSTREAMER "gstBufferManager -- failed to retrieve next timestamp ringbuffer for writing\n");
return false;
}
if( !nextTimestamp )
{
LogError(LOG_GSTREAMER "gstBufferManager -- failed to retrieve next timestamp ringbuffer for writing\n");
return false;
}

if (GST_BUFFER_DTS_IS_VALID(gstBuffer) || GST_BUFFER_PTS_IS_VALID(gstBuffer))
{
timestamp = GST_BUFFER_DTS_OR_PTS(gstBuffer);
}
if( GST_BUFFER_DTS_IS_VALID(gstBuffer) || GST_BUFFER_PTS_IS_VALID(gstBuffer) )
{
timestamp = GST_BUFFER_DTS_OR_PTS(gstBuffer);
}

memcpy(nextTimestamp, (void*)&timestamp, timestamp_size);
mTimestamps.Next(RingBuffer::Write);
memcpy(nextTimestamp, (void*)&timestamp, timestamp_size);
mTimestamps.Next(RingBuffer::Write);

mWaitEvent.Wake();
mFrameCount++;
Expand All @@ -296,7 +296,7 @@ bool gstBufferManager::Enqueue( GstBuffer* gstBuffer, GstCaps* gstCaps )


// Dequeue
int gstBufferManager::Dequeue( void** output, imageFormat format, uint64_t timeout )
int gstBufferManager::Dequeue( void** output, imageFormat format, uint64_t timeout, cudaStream_t stream )
{
// wait until a new frame is recieved
if( !mWaitEvent.Wait(timeout) )
Expand Down Expand Up @@ -442,7 +442,7 @@ int gstBufferManager::Dequeue( void** output, imageFormat format, uint64_t timeo
// perform colorspace conversion
void* nextRGB = mBufferRGB.Next(RingBuffer::Write);

if( CUDA_FAILED(cudaConvertColor(latestYUV, mFormatYUV, nextRGB, format, mOptions->width, mOptions->height)) )
if( CUDA_FAILED(cudaConvertColor(latestYUV, mFormatYUV, nextRGB, format, mOptions->width, mOptions->height, stream)) )
{
LogError(LOG_GSTREAMER "gstBufferManager -- unsupported image format (%s)\n", imageFormatToStr(format));
LogError(LOG_GSTREAMER " supported formats are:\n");
Expand Down
2 changes: 1 addition & 1 deletion codec/gstBufferManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ class gstBufferManager
/**
* Dequeue the next frame. Returns 1 on success, 0 on timeout, -1 on error.
*/
int Dequeue( void** output, imageFormat format, uint64_t timeout=UINT64_MAX );
int Dequeue( void** output, imageFormat format, uint64_t timeout=UINT64_MAX, cudaStream_t stream=0 );

/**
* Get timestamp of the latest dequeued frame.
Expand Down
4 changes: 2 additions & 2 deletions codec/gstDecoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -889,7 +889,7 @@ void gstDecoder::checkBuffer()


// Capture
bool gstDecoder::Capture( void** output, imageFormat format, uint64_t timeout, int* status )
bool gstDecoder::Capture( void** output, imageFormat format, uint64_t timeout, int* status, cudaStream_t stream )
{
// update the webrtc server if needed
if( mWebRTCServer != NULL && !mWebRTCServer->IsThreaded() )
Expand All @@ -907,7 +907,7 @@ bool gstDecoder::Capture( void** output, imageFormat format, uint64_t timeout, i
}

// wait until a new frame is recieved
const int result = mBufferManager->Dequeue(output, format, timeout);
const int result = mBufferManager->Dequeue(output, format, timeout, stream);

if( result < 0 )
{
Expand Down
2 changes: 1 addition & 1 deletion codec/gstDecoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ class gstDecoder : public videoSource
* Capture the next decoded frame.
* @see videoSource::Capture()
*/
virtual bool Capture( void** image, imageFormat format, uint64_t timeout=DEFAULT_TIMEOUT, int* status=NULL );
virtual bool Capture( void** image, imageFormat format, uint64_t timeout=DEFAULT_TIMEOUT, int* status=NULL, cudaStream_t stream=0 );

/**
* Open the stream.
Expand Down
9 changes: 6 additions & 3 deletions codec/gstEncoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -625,7 +625,7 @@ bool gstEncoder::encodeYUV( void* buffer, size_t size )


// Render
bool gstEncoder::Render( void* image, uint32_t width, uint32_t height, imageFormat format )
bool gstEncoder::Render( void* image, uint32_t width, uint32_t height, imageFormat format, cudaStream_t stream )
{
// update the webrtc server if needed
if( mWebRTCServer != NULL && !mWebRTCServer->IsThreaded() )
Expand Down Expand Up @@ -701,7 +701,7 @@ bool gstEncoder::Render( void* image, uint32_t width, uint32_t height, imageForm
// perform colorspace conversion
void* nextYUV = mBufferYUV.Next(RingBuffer::Write);

if( CUDA_FAILED(cudaConvertColor(image, format, nextYUV, IMAGE_I420, width, height)) )
if( CUDA_FAILED(cudaConvertColor(image, format, nextYUV, IMAGE_I420, width, height, stream)) )
{
LogError(LOG_GSTREAMER "gstEncoder::Render() -- unsupported image format (%s)\n", imageFormatToStr(format));
LogError(LOG_GSTREAMER " supported formats are:\n");
Expand All @@ -714,7 +714,10 @@ bool gstEncoder::Render( void* image, uint32_t width, uint32_t height, imageForm
render_end();
}

CUDA(cudaDeviceSynchronize()); // TODO replace with cudaStream?
if( stream != 0 )
CUDA(cudaStreamSynchronize(stream));
else
CUDA(cudaDeviceSynchronize());

// encode YUV buffer
enc_success = encodeYUV(nextYUV, i420Size);
Expand Down
4 changes: 2 additions & 2 deletions codec/gstEncoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,13 +70,13 @@ class gstEncoder : public videoOutput
* Encode the next frame.
* @see videoOutput::Render()
*/
template<typename T> bool Render( T* image, uint32_t width, uint32_t height ) { return Render((void**)image, width, height, imageFormatFromType<T>()); }
template<typename T> bool Render( T* image, uint32_t width, uint32_t height, cudaStream_t stream=0 ) { return Render((void**)image, width, height, imageFormatFromType<T>(), stream); }

/**
* Encode the next frame.
* @see videoOutput::Render()
*/
virtual bool Render( void* image, uint32_t width, uint32_t height, imageFormat format );
virtual bool Render( void* image, uint32_t width, uint32_t height, imageFormat format, cudaStream_t stream=0 );

/**
* Open the stream.
Expand Down
16 changes: 11 additions & 5 deletions cuda/cudaBayer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,12 @@
#include "cudaBayer.h"
#include "logging.h"

#include <npp.h>
#include <nppi.h>


// cudaBayerToRGB
cudaError_t cudaBayerToRGB( uint8_t* input, uchar3* output, size_t width, size_t height, imageFormat format )
cudaError_t cudaBayerToRGB( uint8_t* input, uchar3* output, size_t width, size_t height, imageFormat format, cudaStream_t stream )
{
NppiSize size;
size.width = width;
Expand All @@ -52,20 +53,25 @@ cudaError_t cudaBayerToRGB( uint8_t* input, uchar3* output, size_t width, size_t
else
return cudaErrorInvalidValue;

const NppStatus result = nppiCFAToRGB_8u_C1C3R(input, width * sizeof(uint8_t), size, roi,
(uint8_t*)output, width * sizeof(uchar3),
grid, NPPI_INTER_UNDEFINED);
NppStreamContext nppStreamContext;
nppGetStreamContext(&nppStreamContext);
nppStreamContext.hStream = stream;

const NppStatus result = nppiCFAToRGB_8u_C1C3R_Ctx(input, width * sizeof(uint8_t), size, roi,
(uint8_t*)output, width * sizeof(uchar3),
grid, NPPI_INTER_UNDEFINED, nppStreamContext);

if( result != 0 )
{
LogError(LOG_CUDA "cudaBayerToRGB() NPP error %i\n", result);
return cudaErrorUnknown;
}

return cudaSuccess;
}


cudaError_t cudaBayerToRGBA( uint8_t* input, uchar3* output, size_t width, size_t height, imageFormat format )
cudaError_t cudaBayerToRGBA( uint8_t* input, uchar3* output, size_t width, size_t height, imageFormat format, cudaStream_t stream )
{
return cudaErrorInvalidValue;

Expand Down
4 changes: 2 additions & 2 deletions cuda/cudaBayer.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,14 @@
* @params format the Bayer pattern of the input image, should be one of:
* IMAGE_BAYER_BGGR, IMAGE_BAYER_GBRG, IMAGE_BAYER_GRBG, IMAGE_BAYER_RGGB
*/
cudaError_t cudaBayerToRGB( uint8_t* input, uchar3* output, size_t width, size_t height, imageFormat format );
cudaError_t cudaBayerToRGB( uint8_t* input, uchar3* output, size_t width, size_t height, imageFormat format, cudaStream_t stream=0 );

/**
* Demosaick an 8-bit Bayer image to uchar4 RGBA.
* @params format the Bayer pattern of the input image, should be one of:
* IMAGE_BAYER_BGGR, IMAGE_BAYER_GBRG, IMAGE_BAYER_GRBG, IMAGE_BAYER_RGGB
*/
cudaError_t cudaBayerToRGBA( uint8_t* input, uchar3* output, size_t width, size_t height, imageFormat format );
cudaError_t cudaBayerToRGBA( uint8_t* input, uchar3* output, size_t width, size_t height, imageFormat format, cudaStream_t stream=0 );

///@}

Expand Down
10 changes: 5 additions & 5 deletions cuda/cudaColormap.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ cudaError_t cudaColormap( float* input, T* output,
const float2& input_range=make_float2(0,255),
cudaDataFormat input_format=FORMAT_DEFAULT,
cudaColormapType colormap=COLORMAP_DEFAULT,
cudaStream_t stream=NULL) { return cudaColormap(input, (void*)output, width, height, input_range, input_format, imageFormatFromType<T>(), colormap, stream); }
cudaStream_t stream=0 ) { return cudaColormap(input, (void*)output, width, height, input_range, input_format, imageFormatFromType<T>(), colormap, stream); }

/**
* Apply a colormap from an input image or vector field to RGB/RGBA.
Expand All @@ -104,8 +104,8 @@ cudaError_t cudaColormap( float* input, void* output,
const float2& input_range=make_float2(0,255),
cudaDataFormat input_format=FORMAT_DEFAULT,
imageFormat output_format=IMAGE_UNKNOWN,
cudaColormapType colormap=COLORMAP_DEFAULT,
cudaStream_t stream=NULL);
cudaColormapType colormap=COLORMAP_DEFAULT,
cudaStream_t stream=0 );

/**
* Apply a colormap from an input image or vector field to RGB/RGBA.
Expand All @@ -124,7 +124,7 @@ cudaError_t cudaColormap( float* input, size_t input_width, size_t input_height,
cudaDataFormat input_format=FORMAT_DEFAULT,
cudaColormapType colormap=COLORMAP_DEFAULT,
cudaFilterMode filter=FILTER_LINEAR,
cudaStream_t stream=NULL ) { return cudaColormap(input, input_width, input_height, output, output_width, output_height, input_range, input_format, imageFormatFromType<T>(), colormap, filter, stream); }
cudaStream_t stream=0 ) { return cudaColormap(input, input_width, input_height, output, output_width, output_height, input_range, input_format, imageFormatFromType<T>(), colormap, filter, stream); }

/**
* Apply a colormap from an input image or vector field to RGB/RGBA.
Expand All @@ -143,7 +143,7 @@ cudaError_t cudaColormap( float* input, size_t input_width, size_t input_height,
imageFormat output_format=IMAGE_UNKNOWN,
cudaColormapType colormap=COLORMAP_DEFAULT,
cudaFilterMode filter=FILTER_LINEAR,
cudaStream_t stream=NULL );
cudaStream_t stream=0 );

/**
* Initialize the colormap palettes by allocating them in CUDA memory.
Expand Down
Loading

0 comments on commit 4152d8a

Please sign in to comment.