Created
September 1, 2020 21:16
-
-
Save glubsy/8ce410ee0304569d69e1f1f6d2b18c25 to your computer and use it in GitHub Desktop.
Test regex union versus separate regex matching in a loop for possible performance gain
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Measure performance difference between two regex matching methods. | |
# We test each line in a data set (fake virtual directory paths in this case). | |
# 1) by looping through each compiled regex and testing each against each | |
# line, which should be something like O(n*m). | |
# 2) by combining all the regex expressions into one and matching that single | |
# expression against each line, which should be 0(n) probably... | |
# Note: we stop testing as soon as there is one match. If any of the regex | |
# expression matches, we don't need to continue | |
import re | |
import random | |
import os | |
import string | |
import functools | |
import time | |
#import cProfile | |
times = {} | |
default_regexes = [ | |
r".*\w{5}\/\w{3}.*", | |
r"^\/\w{4}.*$", | |
r".*\/\w{2}.*\/.*", | |
r".*blah.*", | |
r".*pwaio.*", | |
r".*azb$"] | |
def timer(func): | |
@functools.wraps(func) | |
def wrapper_timer(*args): | |
start = time.perf_counter_ns() | |
value = func(*args) | |
end = time.perf_counter_ns() | |
total = end - start | |
print(f"TIMER: function {func.__name__!r} took {total} ns.") | |
times[func.__name__] = total | |
return value | |
return wrapper_timer | |
def get_random_string(length): | |
letters = string.ascii_letters + string.digits + string.whitespace + r"!#$%&\'()*+,-./:;<=>@[]^_`{}~" | |
result_str = ''.join(random.choice(letters) for _ in range(length)) | |
return result_str | |
def create_random_data(num_lines): | |
random_fs = [] | |
for _ in range(num_lines): | |
random_path = str() | |
random_dirs = [] | |
# depth of virtual path | |
random_depth = random.choice([i for i in range(3, 10)]) | |
for i in range(random_depth): | |
# length of string | |
random_num = random.choice([i for i in range(3, 20)]) | |
random_dirs.append(get_random_string(random_num)) | |
random_path = os.sep + os.sep.join(random_dirs) | |
random_fs.append(random_path) | |
return random_fs | |
def compile_re(): | |
compiled_re_set = set() | |
for expr in default_regexes: | |
compiled_re_set.add(re.compile(expr)) | |
return compiled_re_set | |
def compile_re_combined(): | |
return re.compile('|'.join(expr for expr in default_regexes)) | |
@timer | |
def match_loop(data_set, compiled_re_set): | |
matches = 0 | |
for line in data_set: | |
for expr in compiled_re_set: | |
if expr.match(line): | |
matches += 1 | |
# no need to continue with further expressions | |
break | |
return matches | |
@timer | |
def match_combined(data_set, compiled_re_set_combined): | |
matches = 0 | |
for line in data_set: | |
if compiled_re_set_combined.match(line): | |
matches += 1 | |
return matches | |
def do_tests(test_data): | |
""" Do tests several times in case of caching...""" | |
loop_matches = match_loop(random_data, compiled_re_set) | |
combined_matches = match_combined(random_data, compiled_re_set_combined) | |
print(f"Number of matches with loop: {loop_matches}") | |
print(f"Number of matches with combined RE: {combined_matches}") | |
combined_time = times[match_combined.__name__] | |
loop_time = times[match_loop.__name__] | |
if combined_time > loop_time: | |
print(f"\033[93mLoop was faster than combined by \ | |
{combined_time - loop_time} ns!\033[0m\n--------------") | |
else: | |
print(f"Combined was faster than loop by \ | |
{loop_time - combined_time} ns.\n--------------") | |
if __name__ == "__main__": | |
random_data = create_random_data(10000) | |
# print(f"Random dataset:\n{random_data}") | |
compiled_re_set = compile_re() | |
compiled_re_set_combined = compile_re_combined() | |
for _ in range(10): | |
do_tests(random_data) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment