updated YUV conversion functions

dusty-nv · Jun 15, 2020 · 259e9fb · 259e9fb
1 parent 74da461
commit 259e9fb
Show file tree

Hide file tree

Showing 4 changed files with 269 additions and 101 deletions.
diff --git a/cuda/cudaYUV-NV12.cu b/cuda/cudaYUV-NV12.cu
@@ -42,12 +42,15 @@ static inline __device__ T YUV2RGB(const uint3& yuvi)
 	const float v    = float(yuvi.z) - 512.0f;
 	const float s    = 1.0f / 1024.0f * 255.0f;	// TODO clamp for uchar output?
 
-	// R = Y + 1.140V
-   	// G = Y - 0.395U - 0.581V
-   	// B = Y + 2.032U
+#if 1
+	return make_vec<T>(clamp((luma + 1.402f * v) * s),
+				    clamp((luma - 0.344f * u - 0.714f * v) * s),
+				    clamp((luma + 1.772f * u) * s), 255);
+#else
 	return make_vec<T>(clamp((luma + 1.140f * v) * s),
 				    clamp((luma - 0.395f * u - 0.581f * v) * s),
 				    clamp((luma + 2.032f * u) * s), 255);
+#endif
 }
 
 

diff --git a/cuda/cudaYUV-YUYV.cu b/cuda/cudaYUV-YUYV.cu
@@ -23,143 +23,172 @@
 #include "cudaYUV.h"
 
 
-inline __device__ __host__ float clamp(float f, float a, float b)
-{
-    return fmaxf(a, fminf(f, b));
-}
-
-
-/* From RGB to YUV
 
-   Y = 0.299R + 0.587G + 0.114B
-   U = 0.492 (B-Y)
-   V = 0.877 (R-Y)
-
-   It can also be represented as:
-
-   Y =  0.299R + 0.587G + 0.114B
-   U = -0.147R - 0.289G + 0.436B
-   V =  0.615R - 0.515G - 0.100B
+//-----------------------------------------------------------------------------------
+// YUV to RGB colorspace conversion
+//-----------------------------------------------------------------------------------
+static inline __device__ float clamp( float x )	
+{ 
+	return fminf(fmaxf(x, 0.0f), 255.0f); 
+}
 
-   From YUV to RGB
+static inline __device__ float3 YUV2RGB(float Y, float U, float V)
+{
+	U -= 128.0f;
+	V -= 128.0f;
+
+#if 1
+	return make_float3(clamp(Y + 1.4065f * V),
+    				    clamp(Y - 0.3455f * U - 0.7169f * V),
+				    clamp(Y + 1.7790f * U));
+#else
+	return make_float3(clamp(Y + 1.402f * V),
+    				    clamp(Y - 0.344f * U - 0.714f * V),
+				    clamp(Y + 1.772f * U));
+#endif
+}
 
-   R = Y + 1.140V
-   G = Y - 0.395U - 0.581V
-   B = Y + 2.032U
- */
+//-----------------------------------------------------------------------------------
+// YUYV/UYVY are macropixel formats, and two RGB pixels are output at once.
+// Define vectors with 6 and 8 elements so they can be written at one time.
+// These are similar to those from cudaVector.h, except for 6/8 elements.
+//-----------------------------------------------------------------------------------
+struct /*__align__(6)*/ uchar6
+{
+   uint8_t x0, y0, z0, x1, y1, z1;
+};
 
 struct __align__(8) uchar8
 {
-   uint8_t a0, a1, a2, a3, a4, a5, a6, a7;
+   uint8_t x0, y0, z0, w0, x1, y1, z1, w1;
 };
 
-struct __align__(32) float8
+struct /*__align__(24)*/ float6
 {
-   float a0, a1, a2, a3, a4, a5, a6, a7;
+   float x0, y0, z0, x1, y1, z1;
 };
 
-template<typename T, typename BaseT> __device__ __forceinline__ T make_vec8(BaseT a0, BaseT a1, BaseT a2, BaseT a3, BaseT a4, BaseT a5, BaseT a6, BaseT a7)
+struct __align__(32) float8
 {
-   T val = {a0, a1, a2, a3, a4, a5, a6, a7};
-   return val;
-}
+   float x0, y0, z0, w0, x1, y1, z1, w1;
+};
 
-/*static __host__ __device__ __forceinline__ uchar8 make_uchar8(uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7)
-{
-   uchar8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
-   return val;
-}
+template<class T> struct vecTypeInfo;
 
+template<> struct vecTypeInfo<uchar6> { typedef uint8_t Base; };
+template<> struct vecTypeInfo<uchar8> { typedef uint8_t Base; };
 
-static __host__ __device__ __forceinline__ float8 make_float8(float a0, float a1, float a2, float a3, float a4, float a5, float a6, float a7)
-{
-   float8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
-   return val;
-}*/
+template<> struct vecTypeInfo<float6> { typedef float Base; };
+template<> struct vecTypeInfo<float8> { typedef float Base; };
+
+template<typename T> struct vec_assert_false : std::false_type { };
+
+#define BaseType typename vecTypeInfo<T>::Base
+
+template<typename T> inline __host__ __device__ T make_vec(BaseType x0, BaseType y0, BaseType z0, BaseType w0, BaseType x1, BaseType y1, BaseType z1, BaseType w1) { static_assert(vec_assert_false<T>::value, "invalid vector type - supported types are uchar6, uchar8, float6, float8");  }
+
+template<> inline __host__ __device__ uchar6 make_vec( uint8_t x0, uint8_t y0, uint8_t z0, uint8_t w0, uint8_t x1, uint8_t y1, uint8_t z1, uint8_t w1 )	{ return {x0, y0, z0, x1, y1, z1}; }
+template<> inline __host__ __device__ uchar8 make_vec( uint8_t x0, uint8_t y0, uint8_t z0, uint8_t w0, uint8_t x1, uint8_t y1, uint8_t z1, uint8_t w1 )	{ return {x0, y0, z0, w1, x1, y1, z1, w1}; }
+
+template<> inline __host__ __device__ float6 make_vec( float x0, float y0, float z0, float w0, float x1, float y1, float z1, float w1 )				{ return {x0, y0, z0, x1, y1, z1}; }
+template<> inline __host__ __device__ float8 make_vec( float x0, float y0, float z0, float w0, float x1, float y1, float z1, float w1 )				{ return {x0, y0, z0, w1, x1, y1, z1, w1}; }
 
 
 //-----------------------------------------------------------------------------------
 // YUYV/UYVY to RGBA
 //-----------------------------------------------------------------------------------
-template <typename T, typename BaseT, bool formatUYVY>
-__global__ void yuyvToRgba( uchar4* src, int srcAlignedWidth, T* dst, int dstAlignedWidth, int width, int height )
+template <typename T, bool formatUYVY>
+__global__ void YUYVToRGBA( uchar4* src, T* dst, int halfWidth, int height )
 {
 	const int x = blockIdx.x * blockDim.x + threadIdx.x;
 	const int y = blockIdx.y * blockDim.y + threadIdx.y;
 
-	if( x >= srcAlignedWidth || y >= height )
+	if( x >= halfWidth || y >= height )
 		return;
 
-	const uchar4 macroPx = src[y * srcAlignedWidth + x];
+	const uchar4 macroPx = src[y * halfWidth + x];
 
 	// Y0 is the brightness of pixel 0, Y1 the brightness of pixel 1.
 	// U0 and V0 is the color of both pixels.
 	// UYVY [ U0 | Y0 | V0 | Y1 ] 
 	// YUYV [ Y0 | U0 | Y1 | V0 ]
 	const float y0 = formatUYVY ? macroPx.y : macroPx.x;
 	const float y1 = formatUYVY ? macroPx.w : macroPx.z; 
-	const float u = (formatUYVY ? macroPx.x : macroPx.y) - 128.0f;
-	const float v = (formatUYVY ? macroPx.z : macroPx.w) - 128.0f;
-
-	const float4 px0 = make_float4( y0 + 1.4065f * v,
-							  y0 - 0.3455f * u - 0.7169f * v,
-							  y0 + 1.7790f * u, 255.0f );
-
-	const float4 px1 = make_float4( y1 + 1.4065f * v,
-							  y1 - 0.3455f * u - 0.7169f * v,
-							  y1 + 1.7790f * u, 255.0f );
-
-	dst[y * dstAlignedWidth + x] = make_vec8<T, BaseT>(clamp(px0.x, 0.0f, 255.0f), 
-									    clamp(px0.y, 0.0f, 255.0f),
-									    clamp(px0.z, 0.0f, 255.0f),
-									    clamp(px0.w, 0.0f, 255.0f),
-									    clamp(px1.x, 0.0f, 255.0f),
-									    clamp(px1.y, 0.0f, 255.0f),
-									    clamp(px1.z, 0.0f, 255.0f),
-									    clamp(px1.w, 0.0f, 255.0f));
+	const float u =  formatUYVY ? macroPx.x : macroPx.y;
+	const float v =  formatUYVY ? macroPx.z : macroPx.w;
+
+	// this function outputs two pixels from one YUYV macropixel
+	const float3 px0 = YUV2RGB(y0, u, v);
+	const float3 px1 = YUV2RGB(y1, u, v);
+
+	dst[y * halfWidth + x] = make_vec<T>(px0.x, px0.y, px0.z, 255,
+								  px1.x, px1.y, px1.z, 255);
 } 
 
-template<typename T, typename BaseT, bool formatUYVY>
-cudaError_t launchYUYV( void* input, size_t inputPitch, T* output, size_t outputPitch, size_t width, size_t height)
+template<typename T, bool formatUYVY>
+cudaError_t launchYUYVToRGB( void* input, T* output, size_t width, size_t height)
 {
-	if( !input || !inputPitch || !output || !outputPitch || !width || !height )
+	if( !input || !output || !width || !height )
 		return cudaErrorInvalidValue;
 
-	const dim3 block(8,8);
-	const dim3 grid(iDivUp(width/2, block.x), iDivUp(height, block.y));
+	const int  halfWidth = width / 2;	// two pixels are output at once
+	const dim3 blockDim(8,8);
+	const dim3 gridDim(iDivUp(halfWidth, blockDim.x), iDivUp(height, blockDim.y));
 
-	const int srcAlignedWidth = inputPitch / sizeof(uchar4);	// normally would be uchar2, but we're doubling up pixels
-	const int dstAlignedWidth = outputPitch / sizeof(T);	// normally would be uchar4 ^^^
-
-	//printf("yuyvToRgba %zu %zu %i %i %i %i %i\n", width, height, (int)formatUYVY, srcAlignedWidth, dstAlignedWidth, grid.x, grid.y);
-
-	yuyvToRgba<T, BaseT, formatUYVY><<<grid, block>>>((uchar4*)input, srcAlignedWidth, output, dstAlignedWidth, width, height);
+	YUYVToRGBA<T, formatUYVY><<<gridDim, blockDim>>>((uchar4*)input, output, halfWidth, height);
 
 	return CUDA(cudaGetLastError());
 }
 
 
-// cudaUYVYToRGBA (uchar4)
-cudaError_t cudaUYVYToRGBA( void* input, uchar4* output, size_t width, size_t height )
+// cudaYUYVToRGB (uchar3)
+cudaError_t cudaYUYVToRGB( void* input, uchar3* output, size_t width, size_t height )
 {
-	return launchYUYV<uchar8, uint8_t, true>(input, width * sizeof(uchar2), (uchar8*)output, width * sizeof(uchar4), width, height);
+	return launchYUYVToRGB<uchar6, false>(input, (uchar6*)output, width, height);
 }
 
-// cudaYUYVToRGBA (uchar4)
-cudaError_t cudaYUYVToRGBA( void* input, uchar4* output, size_t width, size_t height )
+// cudaYUYVToRGB (float3)
+cudaError_t cudaYUYVToRGB( void* input, float3* output, size_t width, size_t height )
 {
-	return launchYUYV<uchar8, uint8_t, false>(input, width * sizeof(uchar2), (uchar8*)output, width * sizeof(uchar4), width, height);
+	return launchYUYVToRGB<float6, false>(input, (float6*)output, width, height);
 }
 
-// cudaUYVYToRGBA (float4)
-cudaError_t cudaUYVYToRGBA( void* input, float4* output, size_t width, size_t height )
+// cudaYUYVToRGBA (uchar4)
+cudaError_t cudaYUYVToRGBA( void* input, uchar4* output, size_t width, size_t height )
 {
-	return launchYUYV<float8, float, true>(input, width * sizeof(uchar2), (float8*)output, width * sizeof(float4), width, height);
+	return launchYUYVToRGB<uchar8, false>(input, (uchar8*)output, width, height);
 }
 
 // cudaYUYVToRGBA (float4)
 cudaError_t cudaYUYVToRGBA( void* input, float4* output, size_t width, size_t height )
 {
-	return launchYUYV<float8, float, false>(input, width * sizeof(uchar2), (float8*)output, width * sizeof(float4), width, height);
+	return launchYUYVToRGB<float8, false>(input, (float8*)output, width, height);
+}
+
+//-----------------------------------------------------------------------------------
+
+// cudaUYVYToRGB (uchar3)
+cudaError_t cudaUYVYToRGB( void* input, uchar3* output, size_t width, size_t height )
+{
+	return launchYUYVToRGB<uchar6, true>(input, (uchar6*)output, width, height);
 }
 
+// cudaUYVYToRGB (float3)
+cudaError_t cudaUYVYToRGB( void* input, float3* output, size_t width, size_t height )
+{
+	return launchYUYVToRGB<float6, true>(input, (float6*)output, width, height);
+}
+
+// cudaUYVYToRGBA (uchar4)
+cudaError_t cudaUYVYToRGBA( void* input, uchar4* output, size_t width, size_t height )
+{
+	return launchYUYVToRGB<uchar8, true>(input, (uchar8*)output, width, height);
+}
+
+// cudaUYVYToRGBA (float4)
+cudaError_t cudaUYVYToRGBA( void* input, float4* output, size_t width, size_t height )
+{
+	return launchYUYVToRGB<float8, true>(input, (float8*)output, width, height);
+}
+
+
diff --git a/cuda/cudaYUV-YV12.cu b/cuda/cudaYUV-YV12.cu
@@ -24,10 +24,14 @@
 #include "cudaVector.h"
 
 
-//-----------------------------------------------------------------------------------------------
-// I420/YV12 to RGB
-//-----------------------------------------------------------------------------------------------
-static inline __device__ float clamp( float x )	{ return fminf(fmaxf(x, 0.0f), 255.0f); }
+
+//-----------------------------------------------------------------------------------
+// YUV to RGB colorspace conversion
+//-----------------------------------------------------------------------------------
+static inline __device__ float clamp( float x )	
+{ 
+	return fminf(fmaxf(x, 0.0f), 255.0f); 
+}
 
 static inline __device__ float3 YUV2RGB(float Y, float U, float V)
 {
@@ -45,6 +49,9 @@ static inline __device__ float3 YUV2RGB(float Y, float U, float V)
 #endif
 }
 
+//-------------------------------------------------------------------------------------
+// I420/YV12 to RGB
+//-------------------------------------------------------------------------------------
 template <typename T, bool formatYV12>
 __global__ void I420ToRGB(uint8_t* srcImage, int srcPitch,
                           T* dstImage,     	int dstPitch,
@@ -92,38 +99,81 @@ __global__ void I420ToRGB(uint8_t* srcImage, int srcPitch,
 }
 
 template <typename T, bool formatYV12>
-cudaError_t launch420ToRGB(void* srcDev, size_t srcPitch, T* destDev,
-                           size_t destPitch, size_t width, size_t height) 
+cudaError_t launch420ToRGB(void* srcDev, T* dstDev, size_t width, size_t height) 
 {
-	if( !srcDev || !destDev )
+	if( !srcDev || !dstDev )
 		return cudaErrorInvalidDevicePointer;
 
-	if( srcPitch == 0 || destPitch == 0 || width == 0 || height == 0 )
+	if( width == 0 || height == 0 )
 		return cudaErrorInvalidValue;
 
+	const int srcPitch = width * sizeof(uint8_t);
+	const int dstPitch = width * sizeof(T);
+
 	const dim3 blockDim(8,8);
 	//const dim3 gridDim((width+(2*blockDim.x-1))/(2*blockDim.x), (height+(blockDim.y-1))/blockDim.y, 1);
 	const dim3 gridDim(iDivUp(width,blockDim.x), iDivUp(height, blockDim.y));
 
-	I420ToRGB<T, formatYV12><<<gridDim, blockDim>>>( (uint8_t*)srcDev, srcPitch, destDev, destPitch, width, height );
+	I420ToRGB<T, formatYV12><<<gridDim, blockDim>>>( (uint8_t*)srcDev, srcPitch, dstDev, dstPitch, width, height );
 
 	return CUDA(cudaGetLastError());
 }
 
-cudaError_t cudaI420ToRGB(void* input, size_t inputPitch, uchar3* output, size_t outputPitch, size_t width, size_t height) 
+
+// cudaI420ToRGB (uchar3)
+cudaError_t cudaI420ToRGB(void* input, uchar3* output, size_t width, size_t height) 
 {
-    return launch420ToRGB<uchar3, false>(input, inputPitch, output, outputPitch, width, height);
+    return launch420ToRGB<uchar3, false>(input, output, width, height);
 }
 
-cudaError_t cudaI420ToRGB(void* input, uchar3* output, size_t width, size_t height) 
+// cudaI420ToRGB (float3)
+cudaError_t cudaI420ToRGB(void* input, float3* output, size_t width, size_t height) 
+{
+    return launch420ToRGB<float3, false>(input, output, width, height);
+}
+
+// cudaI420ToRGBA (uchar4)
+cudaError_t cudaI420ToRGBA(void* input, uchar4* output, size_t width, size_t height) 
+{
+    return launch420ToRGB<uchar4, false>(input, output, width, height);
+}
+
+// cudaI420ToRGBA (float4)
+cudaError_t cudaI420ToRGBA(void* input, float4* output, size_t width, size_t height) 
+{
+    return launch420ToRGB<float4, false>(input, output, width, height);
+}
+
+//-----------------------------------------------------------------------------------
+
+// cudaYV12ToRGB (uchar3)
+cudaError_t cudaYV12ToRGB(void* input, uchar3* output, size_t width, size_t height) 
+{
+    return launch420ToRGB<uchar3, true>(input, output, width, height);
+}
+
+// cudaYV12ToRGB (float3)
+cudaError_t cudaYV12ToRGB(void* input, float3* output, size_t width, size_t height) 
+{
+    return launch420ToRGB<float3, true>(input, output, width, height);
+}
+
+// cudaYV12ToRGBA (uchar4)
+cudaError_t cudaYV12ToRGBA(void* input, uchar4* output, size_t width, size_t height) 
+{
+    return launch420ToRGB<uchar4, true>(input, output, width, height);
+}
+
+// cudaYV12ToRGBA (float4)
+cudaError_t cudaYV12ToRGBA(void* input, float4* output, size_t width, size_t height) 
 {
-    return launch420ToRGB<uchar3, false>(input, width * sizeof(uint8_t), output, width * sizeof(uchar3), width, height);
+    return launch420ToRGB<float4, true>(input, output, width, height);
 }
 
 
-//-----------------------------------------------------------------------------------------------
+//-------------------------------------------------------------------------------------
 // RGB to I420/YV12
-//-----------------------------------------------------------------------------------------------
+//-------------------------------------------------------------------------------------
 inline __device__ void rgb_to_y(const uint8_t r, const uint8_t g, const uint8_t b, uint8_t& y)
 {
 	y = static_cast<uint8_t>(((int)(30 * r) + (int)(59 * g) + (int)(11 * b)) / 100);