Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merging dev into main #74

Merged
merged 35 commits into from
Dec 26, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
e91fbe3
Version bump before sync to main.
jbikker Dec 18, 2024
115f2a3
Update README.md
jbikker Dec 18, 2024
2aaff3e
Merge branch 'dev' of https://github.com/jbikker/tinybvh into dev
jbikker Dec 18, 2024
e89bf4c
Minimalist wavefront code.
jbikker Dec 18, 2024
2a49133
Error in windows window.
jbikker Dec 18, 2024
dd8dc99
Disabling BuildNEON for now.
jbikker Dec 19, 2024
299bc27
Functional basic gpu wavefront path tracer.
jbikker Dec 19, 2024
f8098a2
Next Event Estimation for wavefront pt.
jbikker Dec 19, 2024
e554bbf
New screenshot.
jbikker Dec 19, 2024
e711d22
Update README.md
jbikker Dec 19, 2024
2affe64
Fix for second bounce.
jbikker Dec 20, 2024
c8c6093
NEE now accounts for BRDF.
jbikker Dec 20, 2024
f204b7f
Specular support in wavefront pt.
jbikker Dec 20, 2024
2372756
native_recip and fast_normalize
benanil Dec 20, 2024
9bca1e9
Merge pull request #71 from benanil/dev
jbikker Dec 20, 2024
71e50d4
Blue noise to play with.
jbikker Dec 20, 2024
c86ac2a
Blue noise sampling for NEE & bounce.
jbikker Dec 20, 2024
a848a16
Update README.md
jbikker Dec 21, 2024
392d450
Optimized cwbvh for ~10% faster traversal.
jbikker Dec 21, 2024
8feb5c2
Merge branch 'dev' of https://github.com/jbikker/tinybvh into dev
jbikker Dec 21, 2024
d73d549
Merge branch 'dev' of https://github.com/jbikker/tinybvh into dev
jbikker Dec 21, 2024
4e51514
Save/load for cwbvh, proper path state in wavefront.
jbikker Dec 22, 2024
8846bbe
Merged.
jbikker Dec 22, 2024
d04ae10
MIS for wavefront path tracer.
jbikker Dec 22, 2024
91d1a1a
Better documented code.
jbikker Dec 22, 2024
ed3451c
Minor comments.
jbikker Dec 22, 2024
cc624f9
Fixed MIS.
jbikker Dec 23, 2024
473fbc4
Update README.md
jbikker Dec 23, 2024
b1db88a
Update README.md
jbikker Dec 23, 2024
e22f885
Update README.md
jbikker Dec 23, 2024
36caa36
Update README.md
jbikker Dec 23, 2024
38e50d5
More MIS fixes.
jbikker Dec 23, 2024
a659545
Merge branch 'dev' of https://github.com/jbikker/tinybvh into dev
jbikker Dec 23, 2024
87b9189
tinby_bvh_gpu now compiles on Linux; CodeMaid applied.
jbikker Dec 26, 2024
437ce4f
Update README.md
jbikker Dec 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Next Event Estimation for wavefront pt.
  • Loading branch information
jbikker committed Dec 19, 2024
commit f8098a25346b422ac6b2d3734f2d28ab6b3c58f4
25 changes: 15 additions & 10 deletions tiny_bvh_gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,7 @@ static Buffer* pixels, * accumulator, * raysIn, * raysOut, * connections, * triD
// View pyramid for a pinhole camera
struct RenderData
{
bvhvec4 eye = bvhvec4( 0, 30, 0, 0 );
bvhvec4 view = bvhvec4( -1, 0, 0, 0 );
bvhvec4 C, p0, p1, p2;
bvhvec4 eye = bvhvec4( 0, 30, 0, 0 ), view = bvhvec4( -1, 0, 0, 0 ), C, p0, p1, p2;
uint32_t frameIdx, dummy1, dummy2, dummy3;
} rd;

Expand All @@ -46,6 +44,15 @@ void AddMesh( const char* file, float scale = 1, bvhvec3 pos = {}, int c = 0, in
for (int* b = (int*)tris + (triCount - N) * 12, i = 0; i < N * 3; i++)
*(bvhvec3*)b = *(bvhvec3*)b * scale + pos, b[3] = c ? c : b[3], b += 4;
}
void AddQuad( const bvhvec3 pos, const float w, const float d, int c )
{
bvhvec4* data = (bvhvec4*)tinybvh::malloc64( (triCount + 2) * 48 );
if (tris) memcpy( data + 6, tris, triCount * 48 ), tinybvh::free64( tris );
data[0] = bvhvec3( -w, 0, -d ), data[1] = bvhvec3( w, 0, -d );
data[2] = bvhvec3( w, 0, d ), data[3] = bvhvec3( -w, 0, -d ), tris = data;
data[4] = bvhvec3( w, 0, d ), data[5] = bvhvec3( -w, 0, d ), triCount += 2;
for( int i = 0; i < 6; i++ ) data[i] = 0.5f * data[i] + pos, data[i].w = *(float*)&c;
}

Buffer* cwbvhNodes = 0;
Buffer* cwbvhTris = 0;
Expand Down Expand Up @@ -73,6 +80,7 @@ void Init()
// load raw vertex data
AddMesh( "./testdata/cryteksponza.bin", 1, bvhvec3( 0 ), 0xffffff );
AddMesh( "./testdata/lucy.bin", 1.1f, bvhvec3( -2, 4.1f, -3 ), 0xaaaaff );
AddQuad( bvhvec3( 0, 30, -1 ), 9, 5, 0x1ffffff );
// build bvh (here: 'compressed wide bvh', for efficient GPU rendering)
bvh.Build( tris, triCount );
// create gpu buffers
Expand All @@ -99,12 +107,9 @@ bool UpdateCamera( float delta_time_s, fenster& f )
bvhvec3 right = normalize( cross( bvhvec3( 0, 1, 0 ), rd.view ) ), up = 0.8f * cross( rd.view, right );
// get camera controls.
bool moved = false;
if (f.keys['A']) rd.eye += right * -1.0f * delta_time_s * 10, moved = true;
if (f.keys['D']) rd.eye += right * delta_time_s * 10, moved = true;
if (f.keys['W']) rd.eye += rd.view * delta_time_s * 10, moved = true;
if (f.keys['S']) rd.eye += rd.view * -1.0f * delta_time_s * 10, moved = true;
if (f.keys['R']) rd.eye += up * delta_time_s * 20, moved = true;
if (f.keys['F']) rd.eye += up * -1.0f * delta_time_s * 20, moved = true;
if (f.keys['A'] || f.keys['D']) rd.eye += right * delta_time_s * (f.keys['D'] ? 10 : -10), moved = true;
if (f.keys['W'] || f.keys['S']) rd.eye += rd.view * delta_time_s * (f.keys['W'] ? 10 : -10), moved = true;
if (f.keys['R'] || f.keys['F']) rd.eye += up * delta_time_s * (f.keys['R'] ? 20 : -20), moved = true;
if (f.keys[20]) rd.view = normalize( rd.view + right * -1.0f * delta_time_s ), moved = true;
if (f.keys[19]) rd.view = normalize( rd.view + right * delta_time_s ), moved = true;
if (f.keys[17]) rd.view = normalize( rd.view + up * -1.0f * delta_time_s ), moved = true;
Expand Down Expand Up @@ -132,7 +137,7 @@ void Tick( float delta_time_s, fenster& f, uint32_t* buf )
init->Run( 1 ); // init atomic counters, set buffer ptrs etc.
generate->SetArguments( raysOut );
generate->Run2D( oclint2( SCRWIDTH, SCRHEIGHT ) );
for (int i = 0; i < 4; i++)
for (int i = 0; i < 2; i++)
{
swap( raysOut, raysIn );
extend->SetArguments( raysIn );
Expand Down
248 changes: 248 additions & 0 deletions traverse.cl
Original file line number Diff line number Diff line change
Expand Up @@ -832,6 +832,254 @@ float4 traverse_cwbvh( global const float4* cwbvhNodes, global const float4* cwb
return hit;
}

bool isoccluded_cwbvh( global const float4* cwbvhNodes, global const float4* cwbvhTris, const float3 O, const float3 D, const float3 rD, const float t )
{
// initialize ray
const unsigned threadId = get_global_id( 0 );
#ifdef SIMD_AABBTEST
const float4 O4 = (float4)( O, 1 ); // rayData[threadId].O;
const float4 D4 = (float4)( D, 0 ); // rayData[threadId].D;
const float4 rD4 = (float4)( rD, 1 ); // rayData[threadId].rD;
#endif // otherwise, we'll use the float3 input directly.
// prepare traversal
uint2 stack[STACK_SIZE];
uint stackPtr = 0;
float tmax = t;
#ifdef SIMD_AABBTEST
const uint octinv4 = (7 - ((D4.x < 0 ? 4 : 0) | (D4.y < 0 ? 2 : 0) | (D4.z < 0 ? 1 : 0))) * 0x1010101;
#else
const uint octinv4 = (7 - ((D.x < 0 ? 4 : 0) | (D.y < 0 ? 2 : 0) | (D.z < 0 ? 1 : 0))) * 0x1010101;
#endif
uint2 ngroup = (uint2)(0, 0b10000000000000000000000000000000), tgroup = (uint2)(0);
do
{
if (ngroup.y > 0x00FFFFFF)
{
const unsigned hits = ngroup.y, imask = ngroup.y;
const unsigned child_bit_index = __bfind( hits );
const unsigned child_node_base_index = ngroup.x;
ngroup.y &= ~(1 << child_bit_index);
if (ngroup.y > 0x00FFFFFF) { STACK_PUSH( ngroup ); }
{
const unsigned slot_index = (child_bit_index - 24) ^ (octinv4 & 255);
const unsigned relative_index = __popc( imask & ~(0xFFFFFFFF << slot_index) );
const unsigned child_node_index = child_node_base_index + relative_index;
#ifdef USE_VLOAD_VSTORE
const float* p = (float*)&cwbvhNodes[child_node_index * 5 + 0];
float4 n0 = vload4( 0, p ), n1 = vload4( 1, p ), n2 = vload4( 2, p );
float4 n3 = vload4( 3, p ), n4 = vload4( 4, p );
#else
float4 n0 = cwbvhNodes[child_node_index * 5 + 0], n1 = cwbvhNodes[child_node_index * 5 + 1];
float4 n2 = cwbvhNodes[child_node_index * 5 + 2], n3 = cwbvhNodes[child_node_index * 5 + 3];
float4 n4 = cwbvhNodes[child_node_index * 5 + 4];
#endif
const char4 e = as_char4( n0.w );
ngroup.x = as_uint( n1.x ), tgroup = (uint2)(as_uint( n1.y ), 0);
unsigned hitmask = 0;
#ifdef SIMD_AABBTEST
const float4 idir4 = (float4)(
as_float( (e.x + 127) << 23 ) * rD4.x, as_float( (e.y + 127) << 23 ) * rD4.y,
as_float( (e.z + 127) << 23 ) * rD4.z, 1
);
const float4 orig4 = (n0 - O4) * rD4;
#else
const float idirx = as_float( (e.x + 127) << 23 ) * rD.x;
const float idiry = as_float( (e.y + 127) << 23 ) * rD.y;
const float idirz = as_float( (e.z + 127) << 23 ) * rD.z;
const float origx = (n0.x - O.x) * rD.x;
const float origy = (n0.y - O.y) * rD.y;
const float origz = (n0.z - O.z) * rD.z;
#endif
{ // first 4
const unsigned meta4 = as_uint( n1.z ), is_inner4 = (meta4 & (meta4 << 1)) & 0x10101010;
const unsigned inner_mask4 = sign_extend_s8x4( is_inner4 << 3 );
const unsigned bit_index4 = (meta4 ^ (octinv4 & inner_mask4)) & 0x1F1F1F1F;
const unsigned child_bits4 = (meta4 >> 5) & 0x07070707;
const float4 lox4 = convert_float4( as_uchar4( rD.x < 0 ? n3.z : n2.x ) ), hix4 = convert_float4( as_uchar4( rD.x < 0 ? n2.x : n3.z ) );
const float4 loy4 = convert_float4( as_uchar4( rD.y < 0 ? n4.x : n2.z ) ), hiy4 = convert_float4( as_uchar4( rD.y < 0 ? n2.z : n4.x ) );
const float4 loz4 = convert_float4( as_uchar4( rD.z < 0 ? n4.z : n3.x ) ), hiz4 = convert_float4( as_uchar4( rD.z < 0 ? n3.x : n4.z ) );
{
#ifdef SIMD_AABBTEST
const float4 tminx4 = lox4 * idir4.xxxx + orig4.xxxx, tmaxx4 = hix4 * idir4.xxxx + orig4.xxxx;
const float4 tminy4 = loy4 * idir4.yyyy + orig4.yyyy, tmaxy4 = hiy4 * idir4.yyyy + orig4.yyyy;
const float4 tminz4 = loz4 * idir4.zzzz + orig4.zzzz, tmaxz4 = hiz4 * idir4.zzzz + orig4.zzzz;
const float cmina = fmax( fmax( fmax( tminx4.x, tminy4.x ), tminz4.x ), 0 );
const float cmaxa = fmin( fmin( fmin( tmaxx4.x, tmaxy4.x ), tmaxz4.x ), tmax );
const float cminb = fmax( fmax( fmax( tminx4.y, tminy4.y ), tminz4.y ), 0 );
const float cmaxb = fmin( fmin( fmin( tmaxx4.y, tmaxy4.y ), tmaxz4.y ), tmax );
if (cmina <= cmaxa) UPDATE_HITMASK;
if (cminb <= cmaxb) UPDATE_HITMASK1;
#else
float tminx0 = _native_fma( lox4.x, idirx, origx ), tminx1 = _native_fma( lox4.y, idirx, origx );
float tminy0 = _native_fma( loy4.x, idiry, origy ), tminy1 = _native_fma( loy4.y, idiry, origy );
float tminz0 = _native_fma( loz4.x, idirz, origz ), tminz1 = _native_fma( loz4.y, idirz, origz );
float tmaxx0 = _native_fma( hix4.x, idirx, origx ), tmaxx1 = _native_fma( hix4.y, idirx, origx );
float tmaxy0 = _native_fma( hiy4.x, idiry, origy ), tmaxy1 = _native_fma( hiy4.y, idiry, origy );
float tmaxz0 = _native_fma( hiz4.x, idirz, origz ), tmaxz1 = _native_fma( hiz4.y, idirz, origz );
n0.x = fmax( fmax_fmax( tminx0, tminy0, tminz0 ), 0 );
n0.y = fmin( fmin_fmin( tmaxx0, tmaxy0, tmaxz0 ), tmax );
n1.x = fmax( fmax_fmax( tminx1, tminy1, tminz1 ), 0 );
n1.y = fmin( fmin_fmin( tmaxx1, tmaxy1, tmaxz1 ), tmax );
if (n0.x <= n0.y) UPDATE_HITMASK;
if (n1.x <= n1.y) UPDATE_HITMASK1;
#endif
#ifdef SIMD_AABBTEST
const float cminc = fmax( fmax( fmax( tminx4.z, tminy4.z ), tminz4.z ), 0 );
const float cmaxc = fmin( fmin( fmin( tmaxx4.z, tmaxy4.z ), tmaxz4.z ), tmax );
const float cmind = fmax( fmax( fmax( tminx4.w, tminy4.w ), tminz4.w ), 0 );
const float cmaxd = fmin( fmin( fmin( tmaxx4.w, tmaxy4.w ), tmaxz4.w ), tmax );
if (cminc <= cmaxc) UPDATE_HITMASK2;
if (cmind <= cmaxd) UPDATE_HITMASK3;
#else
tminx0 = _native_fma( lox4.z, idirx, origx ), tminx1 = _native_fma( lox4.w, idirx, origx );
tminy0 = _native_fma( loy4.z, idiry, origy ), tminy1 = _native_fma( loy4.w, idiry, origy );
tminz0 = _native_fma( loz4.z, idirz, origz ), tminz1 = _native_fma( loz4.w, idirz, origz );
tmaxx0 = _native_fma( hix4.z, idirx, origx ), tmaxx1 = _native_fma( hix4.w, idirx, origx );
tmaxy0 = _native_fma( hiy4.z, idiry, origy ), tmaxy1 = _native_fma( hiy4.w, idiry, origy );
tmaxz0 = _native_fma( hiz4.z, idirz, origz ), tmaxz1 = _native_fma( hiz4.w, idirz, origz );
n0.x = fmax( fmax_fmax( tminx0, tminy0, tminz0 ), 0 );
n0.y = fmin( fmin_fmin( tmaxx0, tmaxy0, tmaxz0 ), tmax );
n1.x = fmax( fmax_fmax( tminx1, tminy1, tminz1 ), 0 );
n1.y = fmin( fmin_fmin( tmaxx1, tmaxy1, tmaxz1 ), tmax );
if (n0.x <= n0.y) UPDATE_HITMASK2;
if (n1.x <= n1.y) UPDATE_HITMASK3;
#endif
}
}
{ // second 4
const unsigned meta4 = as_uint( n1.w ), is_inner4 = (meta4 & (meta4 << 1)) & 0x10101010;
const unsigned inner_mask4 = sign_extend_s8x4( is_inner4 << 3 );
const unsigned bit_index4 = (meta4 ^ (octinv4 & inner_mask4)) & 0x1F1F1F1F;
const unsigned child_bits4 = (meta4 >> 5) & 0x07070707;
const float4 lox4 = convert_float4( as_uchar4( rD.x < 0 ? n3.w : n2.y ) ), hix4 = convert_float4( as_uchar4( rD.x < 0 ? n2.y : n3.w ) );
const float4 loy4 = convert_float4( as_uchar4( rD.y < 0 ? n4.y : n2.w ) ), hiy4 = convert_float4( as_uchar4( rD.y < 0 ? n2.w : n4.y ) );
const float4 loz4 = convert_float4( as_uchar4( rD.z < 0 ? n4.w : n3.y ) ), hiz4 = convert_float4( as_uchar4( rD.z < 0 ? n3.y : n4.w ) );
{
#ifdef SIMD_AABBTEST
const float4 tminx4 = lox4 * idir4.xxxx + orig4.xxxx, tmaxx4 = hix4 * idir4.xxxx + orig4.xxxx;
const float4 tminy4 = loy4 * idir4.yyyy + orig4.yyyy, tmaxy4 = hiy4 * idir4.yyyy + orig4.yyyy;
const float4 tminz4 = loz4 * idir4.zzzz + orig4.zzzz, tmaxz4 = hiz4 * idir4.zzzz + orig4.zzzz;
const float cmina = fmax( fmax( fmax( tminx4.x, tminy4.x ), tminz4.x ), 0 );
const float cmaxa = fmin( fmin( fmin( tmaxx4.x, tmaxy4.x ), tmaxz4.x ), tmax );
const float cminb = fmax( fmax( fmax( tminx4.y, tminy4.y ), tminz4.y ), 0 );
const float cmaxb = fmin( fmin( fmin( tmaxx4.y, tmaxy4.y ), tmaxz4.y ), tmax );
if (cmina <= cmaxa) UPDATE_HITMASK0;
if (cminb <= cmaxb) UPDATE_HITMASK1;
#else
float tminx0 = _native_fma( lox4.x, idirx, origx ), tminx1 = _native_fma( lox4.y, idirx, origx );
float tminy0 = _native_fma( loy4.x, idiry, origy ), tminy1 = _native_fma( loy4.y, idiry, origy );
float tminz0 = _native_fma( loz4.x, idirz, origz ), tminz1 = _native_fma( loz4.y, idirz, origz );
float tmaxx0 = _native_fma( hix4.x, idirx, origx ), tmaxx1 = _native_fma( hix4.y, idirx, origx );
float tmaxy0 = _native_fma( hiy4.x, idiry, origy ), tmaxy1 = _native_fma( hiy4.y, idiry, origy );
float tmaxz0 = _native_fma( hiz4.x, idirz, origz ), tmaxz1 = _native_fma( hiz4.y, idirz, origz );
n0.x = fmax( fmax_fmax( tminx0, tminy0, tminz0 ), 0 );
n0.y = fmin( fmin_fmin( tmaxx0, tmaxy0, tmaxz0 ), tmax );
n1.x = fmax( fmax_fmax( tminx1, tminy1, tminz1 ), 0 );
n1.y = fmin( fmin_fmin( tmaxx1, tmaxy1, tmaxz1 ), tmax );
if (n0.x <= n0.y) UPDATE_HITMASK0;
if (n1.x <= n1.y) UPDATE_HITMASK1;
#endif
#ifdef SIMD_AABBTEST
const float cminc = fmax( fmax( fmax( tminx4.z, tminy4.z ), tminz4.z ), 0 );
const float cmaxc = fmin( fmin( fmin( tmaxx4.z, tmaxy4.z ), tmaxz4.z ), tmax );
const float cmind = fmax( fmax( fmax( tminx4.w, tminy4.w ), tminz4.w ), 0 );
const float cmaxd = fmin( fmin( fmin( tmaxx4.w, tmaxy4.w ), tmaxz4.w ), tmax );
if (cminc <= cmaxc) UPDATE_HITMASK2;
if (cmind <= cmaxd) UPDATE_HITMASK3;
#else
tminx0 = _native_fma( lox4.z, idirx, origx ), tminx1 = _native_fma( lox4.w, idirx, origx );
tminy0 = _native_fma( loy4.z, idiry, origy ), tminy1 = _native_fma( loy4.w, idiry, origy );
tminz0 = _native_fma( loz4.z, idirz, origz ), tminz1 = _native_fma( loz4.w, idirz, origz );
tmaxx0 = _native_fma( hix4.z, idirx, origx ), tmaxx1 = _native_fma( hix4.w, idirx, origx );
tmaxy0 = _native_fma( hiy4.z, idiry, origy ), tmaxy1 = _native_fma( hiy4.w, idiry, origy );
tmaxz0 = _native_fma( hiz4.z, idirz, origz ), tmaxz1 = _native_fma( hiz4.w, idirz, origz );
n0.x = fmax( fmax_fmax( tminx0, tminy0, tminz0 ), 0 );
n0.y = fmin( fmin_fmin( tmaxx0, tmaxy0, tmaxz0 ), tmax );
n1.x = fmax( fmax_fmax( tminx1, tminy1, tminz1 ), 0 );
n1.y = fmin( fmin_fmin( tmaxx1, tmaxy1, tmaxz1 ), tmax );
if (n0.x <= n0.y) UPDATE_HITMASK2;
if (n1.x <= n1.y) UPDATE_HITMASK3;
#endif
}
}
ngroup.y = (hitmask & 0xFF000000) | (as_uint( n0.w ) >> 24), tgroup.y = hitmask & 0x00FFFFFF;
}
}
else tgroup = ngroup, ngroup = (uint2)(0);
while (tgroup.y != 0)
{
#ifdef CWBVH_COMPRESSED_TRIS
// Fast intersection of triangle data for the algorithm in:
// "Fast Ray-Triangle Intersections by Coordinate Transformation"
// Baldwin & Weber, 2016.
const unsigned triangleIndex = __bfind( tgroup.y ), triAddr = tgroup.x + triangleIndex * 4;
const float4 T2 = cwbvhTris[triAddr + 2];
const float transS = T2.x * O.x + T2.y * O.y + T2.z * O.z + T2.w;
const float transD = T2.x * D.x + T2.y * D.y + T2.z * D.z;
const float d = -transS / transD;
if (d > 0 && d < tmax)
{
const float4 T0 = cwbvhTris[triAddr + 0];
const float4 T1 = cwbvhTris[triAddr + 1];
#ifdef SIMD_AABBTEST
const float4 I = O4 + d * D4;
#else
const float3 I = O + d * D;
#endif
const float u = T0.x * I.x + T0.y * I.y + T0.z * I.z + T0.w;
const float v = T1.x * I.x + T1.y * I.y + T1.z * I.z + T1.w;
const bool hit = u >= 0 && v >= 0 && u + v < 1;
if (hit) return true;
}
#else
// Möller-Trumbore intersection; triangles are stored as 3x16 bytes,
// with the original primitive index in the (otherwise unused) w
// component of vertex 0.
const int triangleIndex = __bfind( tgroup.y ), triAddr = tgroup.x + triangleIndex * 3;
const float3 v0 = cwbvhTris[triAddr].xyz;
const float3 e1 = cwbvhTris[triAddr + 1].xyz - v0;
const float3 e2 = cwbvhTris[triAddr + 2].xyz - v0;
#ifdef SIMD_AABBTEST
const float3 r = cross( D4.xyz, e2 );
#else
const float3 r = cross( D, e2 );
#endif
const float a = dot( e1, r );
if (fabs( a ) > 0.0000001f)
{
const float f = 1 / a;
#ifdef SIMD_AABBTEST
const float3 s = O4.xyz - v0;
#else
const float3 s = O - v0;
#endif
const float u = f * dot( s, r );
if (u >= 0 && u <= 1)
{
const float3 q = cross( s, e1 );
#ifdef SIMD_AABBTEST
const float v = f * dot( D4.xyz, q );
#else
const float v = f * dot( D, q );
#endif
if (v >= 0 && u + v <= 1)
{
const float d = f * dot( e2, q );
if (d > 0.0f && d < tmax) return true;
}
}
}
#endif
tgroup.y -= 1 << triangleIndex;
}
if (ngroup.y <= 0x00FFFFFF)
{
if (stackPtr > 0) { STACK_POP( ngroup ); } else break;
}
} while (true);
return false; // no occlusion found.
}

void kernel batch_cwbvh( global const float4* cwbvhNodes, global const float4* cwbvhTris, global struct Ray* rayData )
{
// initialize ray
Expand Down
Loading
Loading