diff --git a/bindings/torch/tinycudann/modules.py b/bindings/torch/tinycudann/modules.py
index f4be29e8..639b6a91 100644
--- a/bindings/torch/tinycudann/modules.py
+++ b/bindings/torch/tinycudann/modules.py
@@ -39,6 +39,10 @@ def backward(ctx, doutput):
 		if doutput is None:
 			return None, None, None, None
 
+		if not doutput.is_cuda:
+			print("TCNN WARNING: doutput must be a CUDA tensor, but isn't. This indicates suboptimal performance.")
+			doutput = doutput.cuda()
+
 		input, params, output = ctx.saved_tensors
 		with torch.no_grad():
 			scaled_grad = doutput * ctx.loss_scale
@@ -63,7 +67,10 @@ def __init__(self, seed=1337):
 		self.loss_scale = 128.0 if self.native_tcnn_module.param_precision() == _C.Precision.Fp16 else 1.0
 
 	def forward(self, x):
-		# TCNN only supports batch sizes that are a multiple of 128. Apply the corresponding padding here.
+		if not x.is_cuda:
+			print("TCNN WARNING: input must be a CUDA tensor, but isn't. This indicates suboptimal performance.")
+			x = x.cuda()
+
 		batch_size = x.shape[0]
 		batch_size_granularity = int(_C.batch_size_granularity())
 		padded_batch_size = (batch_size + batch_size_granularity-1) // batch_size_granularity * batch_size_granularity
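
For context, a minimal sketch (not part of the diff) of how the new fallback behaves from the caller's side. The `tcnn.Encoding` construction and its `HashGrid` config below are illustrative assumptions, not taken from this change:

```python
# Illustrative only: assumes tinycudann is built and a CUDA device is available.
import torch
import tinycudann as tcnn

# Any tcnn module routed through _module_function behaves the same; Encoding is just an example.
encoding = tcnn.Encoding(n_input_dims=3, encoding_config={"otype": "HashGrid"})

x_cpu = torch.rand(128, 3)   # CPU tensor: forward() now warns and copies it to the GPU
y = encoding(x_cpu)          # works, but pays a host-to-device transfer every call

x_gpu = torch.rand(128, 3, device="cuda")
y = encoding(x_gpu)          # preferred path: no warning, no extra copy
```

The new branches only warn and copy the tensor to the device; passing CUDA tensors directly remains the fast path.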