diff --git a/kernel_tuner/backends/compiler.py b/kernel_tuner/backends/compiler.py index b5724a1a..154f501b 100644 --- a/kernel_tuner/backends/compiler.py +++ b/kernel_tuner/backends/compiler.py @@ -265,12 +265,23 @@ def compile(self, kernel_instance): if platform.system() == "Darwin": lib_extension = ".dylib" - subprocess.check_call([self.compiler, "-c", source_file] + compiler_options + ["-o", filename + ".o"]) - subprocess.check_call( + subprocess.run( + [self.compiler, "-c", source_file] + compiler_options + ["-o", filename + ".o"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=True + ) + + subprocess.run( [self.compiler, filename + ".o"] + compiler_options + ["-shared", "-o", filename + lib_extension] - + lib_args + + lib_args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=True ) self.lib = np.ctypeslib.load_library(filename, ".") @@ -396,10 +407,16 @@ def memcpy_htod(self, dest, src): def cleanup_lib(self): """unload the previously loaded shared library""" + if self.lib is None: + return + if not self.using_openmp and not self.using_openacc: # this if statement is necessary because shared libraries that use # OpenMP will core dump when unloaded, this is a well-known issue with OpenMP logging.debug("unloading shared library") - _ctypes.dlclose(self.lib._handle) + try: + _ctypes.dlclose(self.lib._handle) + finally: + self.lib = None units = {} diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index abd4a017..4323c411 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -647,8 +647,10 @@ def compile_kernel(self, instance, verbose): shared_mem_error_messages = [ "uses too much shared data", "local memory limit exceeded", + r"local memory \(\d+\) exceeds limit \(\d+\)", ] - if any(msg in str(e) for msg in shared_mem_error_messages): + error_message = str(e.stderr) if hasattr(e, "stderr") else str(e) + if any(re.search(msg, error_message) for msg in shared_mem_error_messages): logging.debug( "compile_kernel failed due to kernel using too much shared memory" ) @@ -715,7 +717,7 @@ def create_kernel_instance(self, kernel_source, kernel_options, params, verbose) ) # check for templated kernel - if kernel_source.lang in ["CUDA", "NVCUDA"] and "<" in name and ">" in name: + if kernel_source.lang in ["CUDA", "NVCUDA", "HIP"] and "<" in name and ">" in name: kernel_string, name = wrap_templated_kernel(kernel_string, name) # Preprocess GPU arguments. Require for handling `Tunable` arguments diff --git a/test/test_compiler_functions.py b/test/test_compiler_functions.py index 0c9d7f86..913fee85 100644 --- a/test/test_compiler_functions.py +++ b/test/test_compiler_functions.py @@ -198,11 +198,11 @@ def test_compile_detects_device_code(npct, subprocess): cfunc = CompilerFunctions() cfunc.compile(kernel_instance) - print(subprocess.check_call.call_args_list) + print(subprocess.run.call_args_list) # assert the filename suffix used for source compilation is .cu dot_cu_used = False - for call in subprocess.check_call.call_args_list: + for call in subprocess.run.call_args_list: args, kwargs = call args = args[0] print(args)