Skip to content

Commit

Permalink
updated YUV conversion functions
Browse files Browse the repository at this point in the history
  • Loading branch information
dusty-nv committed Jun 15, 2020
1 parent 74da461 commit 259e9fb
Show file tree
Hide file tree
Showing 4 changed files with 269 additions and 101 deletions.
9 changes: 6 additions & 3 deletions cuda/cudaYUV-NV12.cu
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,15 @@ static inline __device__ T YUV2RGB(const uint3& yuvi)
const float v = float(yuvi.z) - 512.0f;
const float s = 1.0f / 1024.0f * 255.0f; // TODO clamp for uchar output?

// R = Y + 1.140V
// G = Y - 0.395U - 0.581V
// B = Y + 2.032U
#if 1
return make_vec<T>(clamp((luma + 1.402f * v) * s),
clamp((luma - 0.344f * u - 0.714f * v) * s),
clamp((luma + 1.772f * u) * s), 255);
#else
return make_vec<T>(clamp((luma + 1.140f * v) * s),
clamp((luma - 0.395f * u - 0.581f * v) * s),
clamp((luma + 2.032f * u) * s), 255);
#endif
}


Expand Down
193 changes: 111 additions & 82 deletions cuda/cudaYUV-YUYV.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,143 +23,172 @@
#include "cudaYUV.h"


inline __device__ __host__ float clamp(float f, float a, float b)
{
return fmaxf(a, fminf(f, b));
}


/* From RGB to YUV

Y = 0.299R + 0.587G + 0.114B
U = 0.492 (B-Y)
V = 0.877 (R-Y)
It can also be represented as:
Y = 0.299R + 0.587G + 0.114B
U = -0.147R - 0.289G + 0.436B
V = 0.615R - 0.515G - 0.100B
//-----------------------------------------------------------------------------------
// YUV to RGB colorspace conversion
//-----------------------------------------------------------------------------------
static inline __device__ float clamp( float x )
{
return fminf(fmaxf(x, 0.0f), 255.0f);
}

From YUV to RGB
static inline __device__ float3 YUV2RGB(float Y, float U, float V)
{
U -= 128.0f;
V -= 128.0f;

#if 1
return make_float3(clamp(Y + 1.4065f * V),
clamp(Y - 0.3455f * U - 0.7169f * V),
clamp(Y + 1.7790f * U));
#else
return make_float3(clamp(Y + 1.402f * V),
clamp(Y - 0.344f * U - 0.714f * V),
clamp(Y + 1.772f * U));
#endif
}

R = Y + 1.140V
G = Y - 0.395U - 0.581V
B = Y + 2.032U
*/
//-----------------------------------------------------------------------------------
// YUYV/UYVY are macropixel formats, and two RGB pixels are output at once.
// Define vectors with 6 and 8 elements so they can be written at one time.
// These are similar to those from cudaVector.h, except for 6/8 elements.
//-----------------------------------------------------------------------------------
struct /*__align__(6)*/ uchar6
{
uint8_t x0, y0, z0, x1, y1, z1;
};

struct __align__(8) uchar8
{
uint8_t a0, a1, a2, a3, a4, a5, a6, a7;
uint8_t x0, y0, z0, w0, x1, y1, z1, w1;
};

struct __align__(32) float8
struct /*__align__(24)*/ float6
{
float a0, a1, a2, a3, a4, a5, a6, a7;
float x0, y0, z0, x1, y1, z1;
};

template<typename T, typename BaseT> __device__ __forceinline__ T make_vec8(BaseT a0, BaseT a1, BaseT a2, BaseT a3, BaseT a4, BaseT a5, BaseT a6, BaseT a7)
struct __align__(32) float8
{
T val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
float x0, y0, z0, w0, x1, y1, z1, w1;
};

/*static __host__ __device__ __forceinline__ uchar8 make_uchar8(uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7)
{
uchar8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
template<class T> struct vecTypeInfo;

template<> struct vecTypeInfo<uchar6> { typedef uint8_t Base; };
template<> struct vecTypeInfo<uchar8> { typedef uint8_t Base; };

static __host__ __device__ __forceinline__ float8 make_float8(float a0, float a1, float a2, float a3, float a4, float a5, float a6, float a7)
{
float8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}*/
template<> struct vecTypeInfo<float6> { typedef float Base; };
template<> struct vecTypeInfo<float8> { typedef float Base; };

template<typename T> struct vec_assert_false : std::false_type { };

#define BaseType typename vecTypeInfo<T>::Base

template<typename T> inline __host__ __device__ T make_vec(BaseType x0, BaseType y0, BaseType z0, BaseType w0, BaseType x1, BaseType y1, BaseType z1, BaseType w1) { static_assert(vec_assert_false<T>::value, "invalid vector type - supported types are uchar6, uchar8, float6, float8"); }

template<> inline __host__ __device__ uchar6 make_vec( uint8_t x0, uint8_t y0, uint8_t z0, uint8_t w0, uint8_t x1, uint8_t y1, uint8_t z1, uint8_t w1 ) { return {x0, y0, z0, x1, y1, z1}; }
template<> inline __host__ __device__ uchar8 make_vec( uint8_t x0, uint8_t y0, uint8_t z0, uint8_t w0, uint8_t x1, uint8_t y1, uint8_t z1, uint8_t w1 ) { return {x0, y0, z0, w1, x1, y1, z1, w1}; }

template<> inline __host__ __device__ float6 make_vec( float x0, float y0, float z0, float w0, float x1, float y1, float z1, float w1 ) { return {x0, y0, z0, x1, y1, z1}; }
template<> inline __host__ __device__ float8 make_vec( float x0, float y0, float z0, float w0, float x1, float y1, float z1, float w1 ) { return {x0, y0, z0, w1, x1, y1, z1, w1}; }


//-----------------------------------------------------------------------------------
// YUYV/UYVY to RGBA
//-----------------------------------------------------------------------------------
template <typename T, typename BaseT, bool formatUYVY>
__global__ void yuyvToRgba( uchar4* src, int srcAlignedWidth, T* dst, int dstAlignedWidth, int width, int height )
template <typename T, bool formatUYVY>
__global__ void YUYVToRGBA( uchar4* src, T* dst, int halfWidth, int height )
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;

if( x >= srcAlignedWidth || y >= height )
if( x >= halfWidth || y >= height )
return;

const uchar4 macroPx = src[y * srcAlignedWidth + x];
const uchar4 macroPx = src[y * halfWidth + x];

// Y0 is the brightness of pixel 0, Y1 the brightness of pixel 1.
// U0 and V0 is the color of both pixels.
// UYVY [ U0 | Y0 | V0 | Y1 ]
// YUYV [ Y0 | U0 | Y1 | V0 ]
const float y0 = formatUYVY ? macroPx.y : macroPx.x;
const float y1 = formatUYVY ? macroPx.w : macroPx.z;
const float u = (formatUYVY ? macroPx.x : macroPx.y) - 128.0f;
const float v = (formatUYVY ? macroPx.z : macroPx.w) - 128.0f;

const float4 px0 = make_float4( y0 + 1.4065f * v,
y0 - 0.3455f * u - 0.7169f * v,
y0 + 1.7790f * u, 255.0f );

const float4 px1 = make_float4( y1 + 1.4065f * v,
y1 - 0.3455f * u - 0.7169f * v,
y1 + 1.7790f * u, 255.0f );

dst[y * dstAlignedWidth + x] = make_vec8<T, BaseT>(clamp(px0.x, 0.0f, 255.0f),
clamp(px0.y, 0.0f, 255.0f),
clamp(px0.z, 0.0f, 255.0f),
clamp(px0.w, 0.0f, 255.0f),
clamp(px1.x, 0.0f, 255.0f),
clamp(px1.y, 0.0f, 255.0f),
clamp(px1.z, 0.0f, 255.0f),
clamp(px1.w, 0.0f, 255.0f));
const float u = formatUYVY ? macroPx.x : macroPx.y;
const float v = formatUYVY ? macroPx.z : macroPx.w;

// this function outputs two pixels from one YUYV macropixel
const float3 px0 = YUV2RGB(y0, u, v);
const float3 px1 = YUV2RGB(y1, u, v);

dst[y * halfWidth + x] = make_vec<T>(px0.x, px0.y, px0.z, 255,
px1.x, px1.y, px1.z, 255);
}

template<typename T, typename BaseT, bool formatUYVY>
cudaError_t launchYUYV( void* input, size_t inputPitch, T* output, size_t outputPitch, size_t width, size_t height)
template<typename T, bool formatUYVY>
cudaError_t launchYUYVToRGB( void* input, T* output, size_t width, size_t height)
{
if( !input || !inputPitch || !output || !outputPitch || !width || !height )
if( !input || !output || !width || !height )
return cudaErrorInvalidValue;

const dim3 block(8,8);
const dim3 grid(iDivUp(width/2, block.x), iDivUp(height, block.y));
const int halfWidth = width / 2; // two pixels are output at once
const dim3 blockDim(8,8);
const dim3 gridDim(iDivUp(halfWidth, blockDim.x), iDivUp(height, blockDim.y));

const int srcAlignedWidth = inputPitch / sizeof(uchar4); // normally would be uchar2, but we're doubling up pixels
const int dstAlignedWidth = outputPitch / sizeof(T); // normally would be uchar4 ^^^

//printf("yuyvToRgba %zu %zu %i %i %i %i %i\n", width, height, (int)formatUYVY, srcAlignedWidth, dstAlignedWidth, grid.x, grid.y);

yuyvToRgba<T, BaseT, formatUYVY><<<grid, block>>>((uchar4*)input, srcAlignedWidth, output, dstAlignedWidth, width, height);
YUYVToRGBA<T, formatUYVY><<<gridDim, blockDim>>>((uchar4*)input, output, halfWidth, height);

return CUDA(cudaGetLastError());
}


// cudaUYVYToRGBA (uchar4)
cudaError_t cudaUYVYToRGBA( void* input, uchar4* output, size_t width, size_t height )
// cudaYUYVToRGB (uchar3)
cudaError_t cudaYUYVToRGB( void* input, uchar3* output, size_t width, size_t height )
{
return launchYUYV<uchar8, uint8_t, true>(input, width * sizeof(uchar2), (uchar8*)output, width * sizeof(uchar4), width, height);
return launchYUYVToRGB<uchar6, false>(input, (uchar6*)output, width, height);
}

// cudaYUYVToRGBA (uchar4)
cudaError_t cudaYUYVToRGBA( void* input, uchar4* output, size_t width, size_t height )
// cudaYUYVToRGB (float3)
cudaError_t cudaYUYVToRGB( void* input, float3* output, size_t width, size_t height )
{
return launchYUYV<uchar8, uint8_t, false>(input, width * sizeof(uchar2), (uchar8*)output, width * sizeof(uchar4), width, height);
return launchYUYVToRGB<float6, false>(input, (float6*)output, width, height);
}

// cudaUYVYToRGBA (float4)
cudaError_t cudaUYVYToRGBA( void* input, float4* output, size_t width, size_t height )
// cudaYUYVToRGBA (uchar4)
cudaError_t cudaYUYVToRGBA( void* input, uchar4* output, size_t width, size_t height )
{
return launchYUYV<float8, float, true>(input, width * sizeof(uchar2), (float8*)output, width * sizeof(float4), width, height);
return launchYUYVToRGB<uchar8, false>(input, (uchar8*)output, width, height);
}

// cudaYUYVToRGBA (float4)
cudaError_t cudaYUYVToRGBA( void* input, float4* output, size_t width, size_t height )
{
return launchYUYV<float8, float, false>(input, width * sizeof(uchar2), (float8*)output, width * sizeof(float4), width, height);
return launchYUYVToRGB<float8, false>(input, (float8*)output, width, height);
}

//-----------------------------------------------------------------------------------

// cudaUYVYToRGB (uchar3)
cudaError_t cudaUYVYToRGB( void* input, uchar3* output, size_t width, size_t height )
{
return launchYUYVToRGB<uchar6, true>(input, (uchar6*)output, width, height);
}

// cudaUYVYToRGB (float3)
cudaError_t cudaUYVYToRGB( void* input, float3* output, size_t width, size_t height )
{
return launchYUYVToRGB<float6, true>(input, (float6*)output, width, height);
}

// cudaUYVYToRGBA (uchar4)
cudaError_t cudaUYVYToRGBA( void* input, uchar4* output, size_t width, size_t height )
{
return launchYUYVToRGB<uchar8, true>(input, (uchar8*)output, width, height);
}

// cudaUYVYToRGBA (float4)
cudaError_t cudaUYVYToRGBA( void* input, float4* output, size_t width, size_t height )
{
return launchYUYVToRGB<float8, true>(input, (float8*)output, width, height);
}


80 changes: 65 additions & 15 deletions cuda/cudaYUV-YV12.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,14 @@
#include "cudaVector.h"


//-----------------------------------------------------------------------------------------------
// I420/YV12 to RGB
//-----------------------------------------------------------------------------------------------
static inline __device__ float clamp( float x ) { return fminf(fmaxf(x, 0.0f), 255.0f); }

//-----------------------------------------------------------------------------------
// YUV to RGB colorspace conversion
//-----------------------------------------------------------------------------------
static inline __device__ float clamp( float x )
{
return fminf(fmaxf(x, 0.0f), 255.0f);
}

static inline __device__ float3 YUV2RGB(float Y, float U, float V)
{
Expand All @@ -45,6 +49,9 @@ static inline __device__ float3 YUV2RGB(float Y, float U, float V)
#endif
}

//-------------------------------------------------------------------------------------
// I420/YV12 to RGB
//-------------------------------------------------------------------------------------
template <typename T, bool formatYV12>
__global__ void I420ToRGB(uint8_t* srcImage, int srcPitch,
T* dstImage, int dstPitch,
Expand Down Expand Up @@ -92,38 +99,81 @@ __global__ void I420ToRGB(uint8_t* srcImage, int srcPitch,
}

template <typename T, bool formatYV12>
cudaError_t launch420ToRGB(void* srcDev, size_t srcPitch, T* destDev,
size_t destPitch, size_t width, size_t height)
cudaError_t launch420ToRGB(void* srcDev, T* dstDev, size_t width, size_t height)
{
if( !srcDev || !destDev )
if( !srcDev || !dstDev )
return cudaErrorInvalidDevicePointer;

if( srcPitch == 0 || destPitch == 0 || width == 0 || height == 0 )
if( width == 0 || height == 0 )
return cudaErrorInvalidValue;

const int srcPitch = width * sizeof(uint8_t);
const int dstPitch = width * sizeof(T);

const dim3 blockDim(8,8);
//const dim3 gridDim((width+(2*blockDim.x-1))/(2*blockDim.x), (height+(blockDim.y-1))/blockDim.y, 1);
const dim3 gridDim(iDivUp(width,blockDim.x), iDivUp(height, blockDim.y));

I420ToRGB<T, formatYV12><<<gridDim, blockDim>>>( (uint8_t*)srcDev, srcPitch, destDev, destPitch, width, height );
I420ToRGB<T, formatYV12><<<gridDim, blockDim>>>( (uint8_t*)srcDev, srcPitch, dstDev, dstPitch, width, height );

return CUDA(cudaGetLastError());
}

cudaError_t cudaI420ToRGB(void* input, size_t inputPitch, uchar3* output, size_t outputPitch, size_t width, size_t height)

// cudaI420ToRGB (uchar3)
cudaError_t cudaI420ToRGB(void* input, uchar3* output, size_t width, size_t height)
{
return launch420ToRGB<uchar3, false>(input, inputPitch, output, outputPitch, width, height);
return launch420ToRGB<uchar3, false>(input, output, width, height);
}

cudaError_t cudaI420ToRGB(void* input, uchar3* output, size_t width, size_t height)
// cudaI420ToRGB (float3)
cudaError_t cudaI420ToRGB(void* input, float3* output, size_t width, size_t height)
{
return launch420ToRGB<float3, false>(input, output, width, height);
}

// cudaI420ToRGBA (uchar4)
cudaError_t cudaI420ToRGBA(void* input, uchar4* output, size_t width, size_t height)
{
return launch420ToRGB<uchar4, false>(input, output, width, height);
}

// cudaI420ToRGBA (float4)
cudaError_t cudaI420ToRGBA(void* input, float4* output, size_t width, size_t height)
{
return launch420ToRGB<float4, false>(input, output, width, height);
}

//-----------------------------------------------------------------------------------

// cudaYV12ToRGB (uchar3)
cudaError_t cudaYV12ToRGB(void* input, uchar3* output, size_t width, size_t height)
{
return launch420ToRGB<uchar3, true>(input, output, width, height);
}

// cudaYV12ToRGB (float3)
cudaError_t cudaYV12ToRGB(void* input, float3* output, size_t width, size_t height)
{
return launch420ToRGB<float3, true>(input, output, width, height);
}

// cudaYV12ToRGBA (uchar4)
cudaError_t cudaYV12ToRGBA(void* input, uchar4* output, size_t width, size_t height)
{
return launch420ToRGB<uchar4, true>(input, output, width, height);
}

// cudaYV12ToRGBA (float4)
cudaError_t cudaYV12ToRGBA(void* input, float4* output, size_t width, size_t height)
{
return launch420ToRGB<uchar3, false>(input, width * sizeof(uint8_t), output, width * sizeof(uchar3), width, height);
return launch420ToRGB<float4, true>(input, output, width, height);
}


//-----------------------------------------------------------------------------------------------
//-------------------------------------------------------------------------------------
// RGB to I420/YV12
//-----------------------------------------------------------------------------------------------
//-------------------------------------------------------------------------------------
inline __device__ void rgb_to_y(const uint8_t r, const uint8_t g, const uint8_t b, uint8_t& y)
{
y = static_cast<uint8_t>(((int)(30 * r) + (int)(59 * g) + (int)(11 * b)) / 100);
Expand Down
Loading

0 comments on commit 259e9fb

Please sign in to comment.