diff --git a/kernel_tuner/backends/compiler.py b/kernel_tuner/backends/compiler.py
index b5724a1a..154f501b 100644
--- a/kernel_tuner/backends/compiler.py
+++ b/kernel_tuner/backends/compiler.py
@@ -265,12 +265,23 @@ def compile(self, kernel_instance):
             if platform.system() == "Darwin":
                 lib_extension = ".dylib"
 
-            subprocess.check_call([self.compiler, "-c", source_file] + compiler_options + ["-o", filename + ".o"])
-            subprocess.check_call(
+            subprocess.run(
+                [self.compiler, "-c", source_file] + compiler_options + ["-o", filename + ".o"],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                check=True
+            )
+
+            subprocess.run(
                 [self.compiler, filename + ".o"]
                 + compiler_options
                 + ["-shared", "-o", filename + lib_extension]
-                + lib_args
+                + lib_args,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                check=True
             )
 
             self.lib = np.ctypeslib.load_library(filename, ".")
@@ -396,10 +407,16 @@ def memcpy_htod(self, dest, src):
 
     def cleanup_lib(self):
         """unload the previously loaded shared library"""
+        if self.lib is None:  # no library is held (e.g. already cleaned up), nothing to close
+            return
+        
         if not self.using_openmp and not self.using_openacc:
             # this if statement is necessary because shared libraries that use
             # OpenMP will core dump when unloaded, this is a well-known issue with OpenMP
             logging.debug("unloading shared library")
-            _ctypes.dlclose(self.lib._handle)
+            try:
+                _ctypes.dlclose(self.lib._handle)
+            finally:  # reset even if dlclose raises, so a repeated call returns early
+                self.lib = None
 
     units = {}
diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py
index abd4a017..4323c411 100644
--- a/kernel_tuner/core.py
+++ b/kernel_tuner/core.py
@@ -647,8 +647,10 @@ def compile_kernel(self, instance, verbose):
             shared_mem_error_messages = [
                 "uses too much shared data",
                 "local memory limit exceeded",
+                r"local memory \(\d+\) exceeds limit \(\d+\)",
             ]
-            if any(msg in str(e) for msg in shared_mem_error_messages):
+            error_message = str(e.stderr) if hasattr(e, "stderr") else str(e)
+            if any(re.search(msg, error_message) for msg in shared_mem_error_messages):
                 logging.debug(
                     "compile_kernel failed due to kernel using too much shared memory"
                 )
@@ -715,7 +717,7 @@ def create_kernel_instance(self, kernel_source, kernel_options, params, verbose)
         )
 
         # check for templated kernel
-        if kernel_source.lang in ["CUDA", "NVCUDA"] and "<" in name and ">" in name:
+        if kernel_source.lang in ["CUDA", "NVCUDA", "HIP"] and "<" in name and ">" in name:
             kernel_string, name = wrap_templated_kernel(kernel_string, name)
 
         # Preprocess GPU arguments. Require for handling `Tunable` arguments
diff --git a/test/test_compiler_functions.py b/test/test_compiler_functions.py
index 0c9d7f86..913fee85 100644
--- a/test/test_compiler_functions.py
+++ b/test/test_compiler_functions.py
@@ -198,11 +198,11 @@ def test_compile_detects_device_code(npct, subprocess):
     cfunc = CompilerFunctions()
     cfunc.compile(kernel_instance)
 
-    print(subprocess.check_call.call_args_list)
+    print(subprocess.run.call_args_list)
 
     # assert the filename suffix used for source compilation is .cu
     dot_cu_used = False
-    for call in subprocess.check_call.call_args_list:
+    for call in subprocess.run.call_args_list:
         args, kwargs = call
         args = args[0]
         print(args)