Next Event Estimation for wavefront pt.

jbikker · jbikker · Dec 26, 2024 · Dec 18, 2024 · Dec 18, 2024 · Dec 18, 2024
commit f8098a25346b422ac6b2d3734f2d28ab6b3c58f4
diff --git a/tiny_bvh_gpu.cpp b/tiny_bvh_gpu.cpp
@@ -30,9 +30,7 @@ static Buffer* pixels, * accumulator, * raysIn, * raysOut, * connections, * triD
 // View pyramid for a pinhole camera
 struct RenderData
 {
-	bvhvec4 eye = bvhvec4( 0, 30, 0, 0 );
-	bvhvec4 view = bvhvec4( -1, 0, 0, 0 );
-	bvhvec4 C, p0, p1, p2;
+	bvhvec4 eye = bvhvec4( 0, 30, 0, 0 ), view = bvhvec4( -1, 0, 0, 0 ), C, p0, p1, p2;
 	uint32_t frameIdx, dummy1, dummy2, dummy3;
 } rd;
 
@@ -46,6 +44,15 @@ void AddMesh( const char* file, float scale = 1, bvhvec3 pos = {}, int c = 0, in
 	for (int* b = (int*)tris + (triCount - N) * 12, i = 0; i < N * 3; i++)
 		*(bvhvec3*)b = *(bvhvec3*)b * scale + pos, b[3] = c ? c : b[3], b += 4;
 }
+void AddQuad( const bvhvec3 pos, const float w, const float d, int c )
+{
+	bvhvec4* data = (bvhvec4*)tinybvh::malloc64( (triCount + 2) * 48 );
+	if (tris) memcpy( data + 6, tris, triCount * 48 ), tinybvh::free64( tris );
+	data[0] = bvhvec3( -w, 0, -d ), data[1] = bvhvec3( w, 0, -d );
+	data[2] = bvhvec3( w, 0, d ), data[3] = bvhvec3( -w, 0, -d ), tris = data;
+	data[4] = bvhvec3( w, 0, d ), data[5] = bvhvec3( -w, 0, d ), triCount += 2;
+	for( int i = 0; i < 6; i++ ) data[i] = 0.5f * data[i] + pos, data[i].w = *(float*)&c; 
+}
 
 Buffer* cwbvhNodes = 0;
 Buffer* cwbvhTris = 0;
@@ -73,6 +80,7 @@ void Init()
 	// load raw vertex data
 	AddMesh( "./testdata/cryteksponza.bin", 1, bvhvec3( 0 ), 0xffffff );
 	AddMesh( "./testdata/lucy.bin", 1.1f, bvhvec3( -2, 4.1f, -3 ), 0xaaaaff );
+	AddQuad( bvhvec3( 0, 30, -1 ), 9, 5, 0x1ffffff ); 
 	// build bvh (here: 'compressed wide bvh', for efficient GPU rendering)
 	bvh.Build( tris, triCount );
 	// create gpu buffers
@@ -99,12 +107,9 @@ bool UpdateCamera( float delta_time_s, fenster& f )
 	bvhvec3 right = normalize( cross( bvhvec3( 0, 1, 0 ), rd.view ) ), up = 0.8f * cross( rd.view, right );
 	// get camera controls.
 	bool moved = false;
-	if (f.keys['A']) rd.eye += right * -1.0f * delta_time_s * 10, moved = true;
-	if (f.keys['D']) rd.eye += right * delta_time_s * 10, moved = true;
-	if (f.keys['W']) rd.eye += rd.view * delta_time_s * 10, moved = true;
-	if (f.keys['S']) rd.eye += rd.view * -1.0f * delta_time_s * 10, moved = true;
-	if (f.keys['R']) rd.eye += up * delta_time_s * 20, moved = true;
-	if (f.keys['F']) rd.eye += up * -1.0f * delta_time_s * 20, moved = true;
+	if (f.keys['A'] || f.keys['D']) rd.eye += right * delta_time_s * (f.keys['D'] ? 10 : -10), moved = true;
+	if (f.keys['W'] || f.keys['S']) rd.eye += rd.view * delta_time_s * (f.keys['W'] ? 10 : -10), moved = true;
+	if (f.keys['R'] || f.keys['F']) rd.eye += up * delta_time_s * (f.keys['R'] ? 20 : -20), moved = true;
 	if (f.keys[20]) rd.view = normalize( rd.view + right * -1.0f * delta_time_s ), moved = true;
 	if (f.keys[19]) rd.view = normalize( rd.view + right * delta_time_s ), moved = true;
 	if (f.keys[17]) rd.view = normalize( rd.view + up * -1.0f * delta_time_s ), moved = true;
@@ -132,7 +137,7 @@ void Tick( float delta_time_s, fenster& f, uint32_t* buf )
 	init->Run( 1 ); // init atomic counters, set buffer ptrs etc.
 	generate->SetArguments( raysOut );
 	generate->Run2D( oclint2( SCRWIDTH, SCRHEIGHT ) );
-	for (int i = 0; i < 4; i++)
+	for (int i = 0; i < 2; i++)
 	{
 		swap( raysOut, raysIn );
 		extend->SetArguments( raysIn );

diff --git a/traverse.cl b/traverse.cl
@@ -832,6 +832,254 @@ float4 traverse_cwbvh( global const float4* cwbvhNodes, global const float4* cwb
 	return hit;
 }
 
+bool isoccluded_cwbvh( global const float4* cwbvhNodes, global const float4* cwbvhTris, const float3 O, const float3 D, const float3 rD, const float t )
+{
+	// initialize ray
+	const unsigned threadId = get_global_id( 0 );
+#ifdef SIMD_AABBTEST
+	const float4 O4 = (float4)( O, 1 ); // rayData[threadId].O;
+	const float4 D4 = (float4)( D, 0 ); // rayData[threadId].D;
+	const float4 rD4 = (float4)( rD, 1 ); // rayData[threadId].rD;
+#endif // otherwise, we'll use the float3 input directly.
+	// prepare traversal
+	uint2 stack[STACK_SIZE];
+	uint stackPtr = 0;
+	float tmax = t;
+#ifdef SIMD_AABBTEST
+	const uint octinv4 = (7 - ((D4.x < 0 ? 4 : 0) | (D4.y < 0 ? 2 : 0) | (D4.z < 0 ? 1 : 0))) * 0x1010101;
+#else
+	const uint octinv4 = (7 - ((D.x < 0 ? 4 : 0) | (D.y < 0 ? 2 : 0) | (D.z < 0 ? 1 : 0))) * 0x1010101;
+#endif
+	uint2 ngroup = (uint2)(0, 0b10000000000000000000000000000000), tgroup = (uint2)(0);
+	do
+	{
+		if (ngroup.y > 0x00FFFFFF)
+		{
+			const unsigned hits = ngroup.y, imask = ngroup.y;
+			const unsigned child_bit_index = __bfind( hits );
+			const unsigned child_node_base_index = ngroup.x;
+			ngroup.y &= ~(1 << child_bit_index);
+			if (ngroup.y > 0x00FFFFFF) { STACK_PUSH( ngroup ); }
+			{
+				const unsigned slot_index = (child_bit_index - 24) ^ (octinv4 & 255);
+				const unsigned relative_index = __popc( imask & ~(0xFFFFFFFF << slot_index) );
+				const unsigned child_node_index = child_node_base_index + relative_index;
+			#ifdef USE_VLOAD_VSTORE
+				const float* p = (float*)&cwbvhNodes[child_node_index * 5 + 0];
+				float4 n0 = vload4( 0, p ), n1 = vload4( 1, p ), n2 = vload4( 2, p );
+				float4 n3 = vload4( 3, p ), n4 = vload4( 4, p );
+			#else
+				float4 n0 = cwbvhNodes[child_node_index * 5 + 0], n1 = cwbvhNodes[child_node_index * 5 + 1];
+				float4 n2 = cwbvhNodes[child_node_index * 5 + 2], n3 = cwbvhNodes[child_node_index * 5 + 3];
+				float4 n4 = cwbvhNodes[child_node_index * 5 + 4];
+			#endif
+				const char4 e = as_char4( n0.w );
+				ngroup.x = as_uint( n1.x ), tgroup = (uint2)(as_uint( n1.y ), 0);
+				unsigned hitmask = 0;
+			#ifdef SIMD_AABBTEST
+				const float4 idir4 = (float4)(
+					as_float( (e.x + 127) << 23 ) * rD4.x, as_float( (e.y + 127) << 23 ) * rD4.y,
+					as_float( (e.z + 127) << 23 ) * rD4.z, 1
+				);
+				const float4 orig4 = (n0 - O4) * rD4;
+			#else
+				const float idirx = as_float( (e.x + 127) << 23 ) * rD.x;
+				const float idiry = as_float( (e.y + 127) << 23 ) * rD.y;
+				const float idirz = as_float( (e.z + 127) << 23 ) * rD.z;
+				const float origx = (n0.x - O.x) * rD.x;
+				const float origy = (n0.y - O.y) * rD.y;
+				const float origz = (n0.z - O.z) * rD.z;
+			#endif
+				{	// first 4
+					const unsigned meta4 = as_uint( n1.z ), is_inner4 = (meta4 & (meta4 << 1)) & 0x10101010;
+					const unsigned inner_mask4 = sign_extend_s8x4( is_inner4 << 3 );
+					const unsigned bit_index4 = (meta4 ^ (octinv4 & inner_mask4)) & 0x1F1F1F1F;
+					const unsigned child_bits4 = (meta4 >> 5) & 0x07070707;
+					const float4 lox4 = convert_float4( as_uchar4( rD.x < 0 ? n3.z : n2.x ) ), hix4 = convert_float4( as_uchar4( rD.x < 0 ? n2.x : n3.z ) );
+					const float4 loy4 = convert_float4( as_uchar4( rD.y < 0 ? n4.x : n2.z ) ), hiy4 = convert_float4( as_uchar4( rD.y < 0 ? n2.z : n4.x ) );
+					const float4 loz4 = convert_float4( as_uchar4( rD.z < 0 ? n4.z : n3.x ) ), hiz4 = convert_float4( as_uchar4( rD.z < 0 ? n3.x : n4.z ) );
+					{
+					#ifdef SIMD_AABBTEST
+						const float4 tminx4 = lox4 * idir4.xxxx + orig4.xxxx, tmaxx4 = hix4 * idir4.xxxx + orig4.xxxx;
+						const float4 tminy4 = loy4 * idir4.yyyy + orig4.yyyy, tmaxy4 = hiy4 * idir4.yyyy + orig4.yyyy;
+						const float4 tminz4 = loz4 * idir4.zzzz + orig4.zzzz, tmaxz4 = hiz4 * idir4.zzzz + orig4.zzzz;
+						const float cmina = fmax( fmax( fmax( tminx4.x, tminy4.x ), tminz4.x ), 0 );
+						const float cmaxa = fmin( fmin( fmin( tmaxx4.x, tmaxy4.x ), tmaxz4.x ), tmax );
+						const float cminb = fmax( fmax( fmax( tminx4.y, tminy4.y ), tminz4.y ), 0 );
+						const float cmaxb = fmin( fmin( fmin( tmaxx4.y, tmaxy4.y ), tmaxz4.y ), tmax );
+						if (cmina <= cmaxa) UPDATE_HITMASK;
+						if (cminb <= cmaxb) UPDATE_HITMASK1;
+					#else
+						float tminx0 = _native_fma( lox4.x, idirx, origx ), tminx1 = _native_fma( lox4.y, idirx, origx );
+						float tminy0 = _native_fma( loy4.x, idiry, origy ), tminy1 = _native_fma( loy4.y, idiry, origy );
+						float tminz0 = _native_fma( loz4.x, idirz, origz ), tminz1 = _native_fma( loz4.y, idirz, origz );
+						float tmaxx0 = _native_fma( hix4.x, idirx, origx ), tmaxx1 = _native_fma( hix4.y, idirx, origx );
+						float tmaxy0 = _native_fma( hiy4.x, idiry, origy ), tmaxy1 = _native_fma( hiy4.y, idiry, origy );
+						float tmaxz0 = _native_fma( hiz4.x, idirz, origz ), tmaxz1 = _native_fma( hiz4.y, idirz, origz );
+						n0.x = fmax( fmax_fmax( tminx0, tminy0, tminz0 ), 0 );
+						n0.y = fmin( fmin_fmin( tmaxx0, tmaxy0, tmaxz0 ), tmax );
+						n1.x = fmax( fmax_fmax( tminx1, tminy1, tminz1 ), 0 );
+						n1.y = fmin( fmin_fmin( tmaxx1, tmaxy1, tmaxz1 ), tmax );
+						if (n0.x <= n0.y) UPDATE_HITMASK;
+						if (n1.x <= n1.y) UPDATE_HITMASK1;
+					#endif
+					#ifdef SIMD_AABBTEST
+						const float cminc = fmax( fmax( fmax( tminx4.z, tminy4.z ), tminz4.z ), 0 );
+						const float cmaxc = fmin( fmin( fmin( tmaxx4.z, tmaxy4.z ), tmaxz4.z ), tmax );
+						const float cmind = fmax( fmax( fmax( tminx4.w, tminy4.w ), tminz4.w ), 0 );
+						const float cmaxd = fmin( fmin( fmin( tmaxx4.w, tmaxy4.w ), tmaxz4.w ), tmax );
+						if (cminc <= cmaxc) UPDATE_HITMASK2;
+						if (cmind <= cmaxd) UPDATE_HITMASK3;
+					#else
+						tminx0 = _native_fma( lox4.z, idirx, origx ), tminx1 = _native_fma( lox4.w, idirx, origx );
+						tminy0 = _native_fma( loy4.z, idiry, origy ), tminy1 = _native_fma( loy4.w, idiry, origy );
+						tminz0 = _native_fma( loz4.z, idirz, origz ), tminz1 = _native_fma( loz4.w, idirz, origz );
+						tmaxx0 = _native_fma( hix4.z, idirx, origx ), tmaxx1 = _native_fma( hix4.w, idirx, origx );
+						tmaxy0 = _native_fma( hiy4.z, idiry, origy ), tmaxy1 = _native_fma( hiy4.w, idiry, origy );
+						tmaxz0 = _native_fma( hiz4.z, idirz, origz ), tmaxz1 = _native_fma( hiz4.w, idirz, origz );
+						n0.x = fmax( fmax_fmax( tminx0, tminy0, tminz0 ), 0 );
+						n0.y = fmin( fmin_fmin( tmaxx0, tmaxy0, tmaxz0 ), tmax );
+						n1.x = fmax( fmax_fmax( tminx1, tminy1, tminz1 ), 0 );
+						n1.y = fmin( fmin_fmin( tmaxx1, tmaxy1, tmaxz1 ), tmax );
+						if (n0.x <= n0.y) UPDATE_HITMASK2;
+						if (n1.x <= n1.y) UPDATE_HITMASK3;
+					#endif
+					}
+				}
+				{	// second 4
+					const unsigned meta4 = as_uint( n1.w ), is_inner4 = (meta4 & (meta4 << 1)) & 0x10101010;
+					const unsigned inner_mask4 = sign_extend_s8x4( is_inner4 << 3 );
+					const unsigned bit_index4 = (meta4 ^ (octinv4 & inner_mask4)) & 0x1F1F1F1F;
+					const unsigned child_bits4 = (meta4 >> 5) & 0x07070707;
+					const float4 lox4 = convert_float4( as_uchar4( rD.x < 0 ? n3.w : n2.y ) ), hix4 = convert_float4( as_uchar4( rD.x < 0 ? n2.y : n3.w ) );
+					const float4 loy4 = convert_float4( as_uchar4( rD.y < 0 ? n4.y : n2.w ) ), hiy4 = convert_float4( as_uchar4( rD.y < 0 ? n2.w : n4.y ) );
+					const float4 loz4 = convert_float4( as_uchar4( rD.z < 0 ? n4.w : n3.y ) ), hiz4 = convert_float4( as_uchar4( rD.z < 0 ? n3.y : n4.w ) );
+					{
+					#ifdef SIMD_AABBTEST
+						const float4 tminx4 = lox4 * idir4.xxxx + orig4.xxxx, tmaxx4 = hix4 * idir4.xxxx + orig4.xxxx;
+						const float4 tminy4 = loy4 * idir4.yyyy + orig4.yyyy, tmaxy4 = hiy4 * idir4.yyyy + orig4.yyyy;
+						const float4 tminz4 = loz4 * idir4.zzzz + orig4.zzzz, tmaxz4 = hiz4 * idir4.zzzz + orig4.zzzz;
+						const float cmina = fmax( fmax( fmax( tminx4.x, tminy4.x ), tminz4.x ), 0 );
+						const float cmaxa = fmin( fmin( fmin( tmaxx4.x, tmaxy4.x ), tmaxz4.x ), tmax );
+						const float cminb = fmax( fmax( fmax( tminx4.y, tminy4.y ), tminz4.y ), 0 );
+						const float cmaxb = fmin( fmin( fmin( tmaxx4.y, tmaxy4.y ), tmaxz4.y ), tmax );
+						if (cmina <= cmaxa) UPDATE_HITMASK0;
+						if (cminb <= cmaxb) UPDATE_HITMASK1;
+					#else
+						float tminx0 = _native_fma( lox4.x, idirx, origx ), tminx1 = _native_fma( lox4.y, idirx, origx );
+						float tminy0 = _native_fma( loy4.x, idiry, origy ), tminy1 = _native_fma( loy4.y, idiry, origy );
+						float tminz0 = _native_fma( loz4.x, idirz, origz ), tminz1 = _native_fma( loz4.y, idirz, origz );
+						float tmaxx0 = _native_fma( hix4.x, idirx, origx ), tmaxx1 = _native_fma( hix4.y, idirx, origx );
+						float tmaxy0 = _native_fma( hiy4.x, idiry, origy ), tmaxy1 = _native_fma( hiy4.y, idiry, origy );
+						float tmaxz0 = _native_fma( hiz4.x, idirz, origz ), tmaxz1 = _native_fma( hiz4.y, idirz, origz );
+						n0.x = fmax( fmax_fmax( tminx0, tminy0, tminz0 ), 0 );
+						n0.y = fmin( fmin_fmin( tmaxx0, tmaxy0, tmaxz0 ), tmax );
+						n1.x = fmax( fmax_fmax( tminx1, tminy1, tminz1 ), 0 );
+						n1.y = fmin( fmin_fmin( tmaxx1, tmaxy1, tmaxz1 ), tmax );
+						if (n0.x <= n0.y) UPDATE_HITMASK0;
+						if (n1.x <= n1.y) UPDATE_HITMASK1;
+					#endif
+					#ifdef SIMD_AABBTEST
+						const float cminc = fmax( fmax( fmax( tminx4.z, tminy4.z ), tminz4.z ), 0 );
+						const float cmaxc = fmin( fmin( fmin( tmaxx4.z, tmaxy4.z ), tmaxz4.z ), tmax );
+						const float cmind = fmax( fmax( fmax( tminx4.w, tminy4.w ), tminz4.w ), 0 );
+						const float cmaxd = fmin( fmin( fmin( tmaxx4.w, tmaxy4.w ), tmaxz4.w ), tmax );
+						if (cminc <= cmaxc) UPDATE_HITMASK2;
+						if (cmind <= cmaxd) UPDATE_HITMASK3;
+					#else
+						tminx0 = _native_fma( lox4.z, idirx, origx ), tminx1 = _native_fma( lox4.w, idirx, origx );
+						tminy0 = _native_fma( loy4.z, idiry, origy ), tminy1 = _native_fma( loy4.w, idiry, origy );
+						tminz0 = _native_fma( loz4.z, idirz, origz ), tminz1 = _native_fma( loz4.w, idirz, origz );
+						tmaxx0 = _native_fma( hix4.z, idirx, origx ), tmaxx1 = _native_fma( hix4.w, idirx, origx );
+						tmaxy0 = _native_fma( hiy4.z, idiry, origy ), tmaxy1 = _native_fma( hiy4.w, idiry, origy );
+						tmaxz0 = _native_fma( hiz4.z, idirz, origz ), tmaxz1 = _native_fma( hiz4.w, idirz, origz );
+						n0.x = fmax( fmax_fmax( tminx0, tminy0, tminz0 ), 0 );
+						n0.y = fmin( fmin_fmin( tmaxx0, tmaxy0, tmaxz0 ), tmax );
+						n1.x = fmax( fmax_fmax( tminx1, tminy1, tminz1 ), 0 );
+						n1.y = fmin( fmin_fmin( tmaxx1, tmaxy1, tmaxz1 ), tmax );
+						if (n0.x <= n0.y) UPDATE_HITMASK2;
+						if (n1.x <= n1.y) UPDATE_HITMASK3;
+					#endif
+					}
+				}
+				ngroup.y = (hitmask & 0xFF000000) | (as_uint( n0.w ) >> 24), tgroup.y = hitmask & 0x00FFFFFF;
+			}
+		}
+		else tgroup = ngroup, ngroup = (uint2)(0);
+		while (tgroup.y != 0)
+		{
+		#ifdef CWBVH_COMPRESSED_TRIS
+			// Fast intersection of triangle data for the algorithm in:
+			// "Fast Ray-Triangle Intersections by Coordinate Transformation"
+			// Baldwin & Weber, 2016.
+			const unsigned triangleIndex = __bfind( tgroup.y ), triAddr = tgroup.x + triangleIndex * 4;
+			const float4 T2 = cwbvhTris[triAddr + 2];
+			const float transS = T2.x * O.x + T2.y * O.y + T2.z * O.z + T2.w;
+			const float transD = T2.x * D.x + T2.y * D.y + T2.z * D.z;
+			const float d = -transS / transD;
+			if (d > 0 && d < tmax)
+			{
+				const float4 T0 = cwbvhTris[triAddr + 0];
+				const float4 T1 = cwbvhTris[triAddr + 1];
+			#ifdef SIMD_AABBTEST
+				const float4 I = O4 + d * D4;
+			#else
+				const float3 I = O + d * D;
+			#endif
+				const float u = T0.x * I.x + T0.y * I.y + T0.z * I.z + T0.w;
+				const float v = T1.x * I.x + T1.y * I.y + T1.z * I.z + T1.w;
+				const bool hit = u >= 0 && v >= 0 && u + v < 1;
+				if (hit) return true;
+			}
+		#else
+			// Möller-Trumbore intersection; triangles are stored as 3x16 bytes,
+			// with the original primitive index in the (otherwise unused) w 
+			// component of vertex 0.
+			const int triangleIndex = __bfind( tgroup.y ), triAddr = tgroup.x + triangleIndex * 3;
+			const float3 v0 = cwbvhTris[triAddr].xyz;
+			const float3 e1 = cwbvhTris[triAddr + 1].xyz - v0;
+			const float3 e2 = cwbvhTris[triAddr + 2].xyz - v0;
+		#ifdef SIMD_AABBTEST
+			const float3 r = cross( D4.xyz, e2 );
+		#else
+			const float3 r = cross( D, e2 );
+		#endif
+			const float a = dot( e1, r );
+			if (fabs( a ) > 0.0000001f)
+			{
+				const float f = 1 / a;
+			#ifdef SIMD_AABBTEST
+				const float3 s = O4.xyz - v0;
+			#else
+				const float3 s = O - v0;
+			#endif
+				const float u = f * dot( s, r );
+				if (u >= 0 && u <= 1)
+				{
+					const float3 q = cross( s, e1 );
+				#ifdef SIMD_AABBTEST
+					const float v = f * dot( D4.xyz, q );
+				#else
+					const float v = f * dot( D, q );
+				#endif
+					if (v >= 0 && u + v <= 1)
+					{
+						const float d = f * dot( e2, q );
+						if (d > 0.0f && d < tmax) return true;
+					}
+				}
+			}
+		#endif
+			tgroup.y -= 1 << triangleIndex;
+		}
+		if (ngroup.y <= 0x00FFFFFF)
+		{
+			if (stackPtr > 0) { STACK_POP( ngroup ); } else break;
+		}
+	} while (true);
+	return false; // no occlusion found.
+}
+
 void kernel batch_cwbvh( global const float4* cwbvhNodes, global const float4* cwbvhTris, global struct Ray* rayData )
 {
 	// initialize ray