Skip to content

Instantly share code, notes, and snippets.

@glubsy
Created September 1, 2020 21:16
Show Gist options
  • Save glubsy/8ce410ee0304569d69e1f1f6d2b18c25 to your computer and use it in GitHub Desktop.
Save glubsy/8ce410ee0304569d69e1f1f6d2b18c25 to your computer and use it in GitHub Desktop.
Test regex union versus separate regex matching in a loop for possible performance gain
# Measure performance difference between two regex matching methods.
# We test each line in a data set (fake virtual directory paths in this case).
# 1) by looping through each compiled regex and testing each against each
# line, which should be something like O(n*m).
# 2) by combining all the regex expressions into one and matching that single
# expression against each line, which should be 0(n) probably...
# Note: we stop testing as soon as there is one match. If any of the regex
# expression matches, we don't need to continue
import re
import random
import os
import string
import functools
import time
#import cProfile
times = {}
default_regexes = [
r".*\w{5}\/\w{3}.*",
r"^\/\w{4}.*$",
r".*\/\w{2}.*\/.*",
r".*blah.*",
r".*pwaio.*",
r".*azb$"]
def timer(func):
@functools.wraps(func)
def wrapper_timer(*args):
start = time.perf_counter_ns()
value = func(*args)
end = time.perf_counter_ns()
total = end - start
print(f"TIMER: function {func.__name__!r} took {total} ns.")
times[func.__name__] = total
return value
return wrapper_timer
def get_random_string(length):
letters = string.ascii_letters + string.digits + string.whitespace + r"!#$%&\'()*+,-./:;<=>@[]^_`{}~"
result_str = ''.join(random.choice(letters) for _ in range(length))
return result_str
def create_random_data(num_lines):
random_fs = []
for _ in range(num_lines):
random_path = str()
random_dirs = []
# depth of virtual path
random_depth = random.choice([i for i in range(3, 10)])
for i in range(random_depth):
# length of string
random_num = random.choice([i for i in range(3, 20)])
random_dirs.append(get_random_string(random_num))
random_path = os.sep + os.sep.join(random_dirs)
random_fs.append(random_path)
return random_fs
def compile_re():
compiled_re_set = set()
for expr in default_regexes:
compiled_re_set.add(re.compile(expr))
return compiled_re_set
def compile_re_combined():
return re.compile('|'.join(expr for expr in default_regexes))
@timer
def match_loop(data_set, compiled_re_set):
matches = 0
for line in data_set:
for expr in compiled_re_set:
if expr.match(line):
matches += 1
# no need to continue with further expressions
break
return matches
@timer
def match_combined(data_set, compiled_re_set_combined):
matches = 0
for line in data_set:
if compiled_re_set_combined.match(line):
matches += 1
return matches
def do_tests(test_data):
""" Do tests several times in case of caching..."""
loop_matches = match_loop(random_data, compiled_re_set)
combined_matches = match_combined(random_data, compiled_re_set_combined)
print(f"Number of matches with loop: {loop_matches}")
print(f"Number of matches with combined RE: {combined_matches}")
combined_time = times[match_combined.__name__]
loop_time = times[match_loop.__name__]
if combined_time > loop_time:
print(f"\033[93mLoop was faster than combined by \
{combined_time - loop_time} ns!\033[0m\n--------------")
else:
print(f"Combined was faster than loop by \
{loop_time - combined_time} ns.\n--------------")
if __name__ == "__main__":
random_data = create_random_data(10000)
# print(f"Random dataset:\n{random_data}")
compiled_re_set = compile_re()
compiled_re_set_combined = compile_re_combined()
for _ in range(10):
do_tests(random_data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment