Transformations lookup #19

Merged · 14 commits · Feb 2, 2024
Changes from 1 commit

fixed issue with potentially overwritten values in expansion_to_original_dict; fixed formatting
LydiaMennesHealth committed Jan 17, 2024
commit 3c95c187b37754382ac1433da5d73c2fbcbb289d

docdeid/ds/lookup.py (4 changes: 2 additions & 2 deletions)

@@ -323,7 +323,6 @@ def longest_matching_prefix(
 
         longest_match = None
         current_node = self
-        # create match on the fly to maintain the property that the match that fit the trie is returned
         match = []
 
         for i in itertools.count():
@@ -340,7 +339,8 @@ def longest_matching_prefix(
                 self._apply_matching_pipeline(item[start_i + i])
             )
 
-            # get the value that matches the trie if any. Same as an any() call but returns the value that matched
+            # Get the value that matches the trie if any.
+            # This is lazy like an any() call but returns the value that matched
            matched = next((t for t in cur_items if t in current_node.children), None)
            if matched is None:
                break
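
Reviewer note: the next() idiom in this hunk is worth spelling out. A minimal standalone sketch (hypothetical data, not part of the diff) of how next() with a generator and a default scans lazily like any() but returns the matching element itself:

# Hypothetical stand-ins for current_node.children and cur_items.
children = {"b", "c"}
cur_items = ["a", "b", "c"]

# Stops at the first hit, like any(), but yields the element
# that matched (or the default None if nothing did).
matched = next((t for t in cur_items if t in children), None)
assert matched == "b"

# any() only reports that some element matched, not which one.
assert any(t in children for t in cur_items)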

docdeid/process/doc_processor.py (2 changes: 1 addition & 1 deletion)

@@ -28,7 +28,7 @@ class DocProcessorGroup:
 
     def __init__(self) -> None:
         self._processors: OrderedDict[
-            str, Union[DocProcessor | DocProcessorGroup]
+            str, Union[DocProcessor, DocProcessorGroup]
         ] = OrderedDict()
 
     def get_names(self, recursive: bool = True) -> list[str]:
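
For context on this one-character fix: Union[...] takes comma-separated members, while | is the PEP 604 spelling; Union[DocProcessor | DocProcessorGroup] mixes the two, which is redundant on Python 3.10+ and a TypeError on older versions. A small sketch with stand-in classes (the real docdeid types are assumed, not imported):

from typing import Union

class DocProcessor: ...
class DocProcessorGroup: ...

# The two equivalent spellings of the same union type.
Classic = Union[DocProcessor, DocProcessorGroup]
Pep604 = DocProcessor | DocProcessorGroup  # requires Python 3.10+

assert Classic == Pep604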

docdeid/str/expander.py (10 changes: 6 additions & 4 deletions)

@@ -1,4 +1,5 @@
 from abc import ABC, abstractmethod
+from collections import defaultdict
 from typing import Iterable
 
 from docdeid.str.processor import StringModifier
@@ -35,13 +36,14 @@ def expand_item_iterable(self, items: Iterable[str]) -> set[str]:
 
     def get_expansion_to_original_dict(self, items: Iterable[str]) -> dict[str, str]:
         """Expand a set of strings into a dictionary where the keys are results from
-        expand_item and values the original text."""
+        expand_item and values a set of original text(s)."""
 
-        # This can get overwritten if different original texts map to the same expansion due to multiple operations...
-        result_dict = {}
+        # Need a set because otherwise value might be overwritten
+        # when multiple items expand to the same value
+        result_dict = defaultdict(set)
         for item in items:
             for expansion in self.expand_item(item):
-                result_dict[expansion] = item
+                result_dict[expansion].add(item)
         return result_dict
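
Why defaultdict(set) here: with a plain dict, two originals that share an expansion silently overwrite each other. A self-contained sketch with a hypothetical expand_item (lowercasing stands in for docdeid's real string modifiers):

from collections import defaultdict

def expand_item(item: str) -> set[str]:
    # Hypothetical stand-in for the real expansion logic.
    return {item, item.lower()}

items = ["Jose", "JOSE"]

# Old behaviour: last writer wins, so "Jose" is lost for key "jose".
plain: dict[str, str] = {}
for item in items:
    for expansion in expand_item(item):
        plain[expansion] = item
assert plain["jose"] == "JOSE"

# New behaviour: every original that produced the expansion is kept.
grouped: defaultdict[str, set[str]] = defaultdict(set)
for item in items:
    for expansion in expand_item(item):
        grouped[expansion].add(item)
assert grouped["jose"] == {"Jose", "JOSE"}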

docdeid/tokenizer.py (7 changes: 4 additions & 3 deletions)

@@ -241,10 +241,11 @@ def token_lookup(
         else:
             expansion_dict = expander.get_expansion_to_original_dict(words)
             # get the original words of which the expansion matched the lookup values
-            matched_words = [
-                expansion_dict[m]
+            matched_words = set(
+                elem
                 for m in set(expansion_dict.keys()).intersection(lookup_values)
-            ]
+                for elem in expansion_dict[m]
+            )
 
         for word in matched_words:
             tokens.update(self._text_to_tokens[pipe_key][word])
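
Putting the two changes together, a sketch with hypothetical data of what this branch of token_lookup now computes: expansion_dict maps each expansion to the set of originals it came from, and the matched expansions are flattened back to original words.

expansion_dict = {
    "jose": {"Jose", "José"},
    "jan": {"Jan"},
}
lookup_values = {"jose"}

# Intersect expansions with the lookup values, then flatten the
# sets of originals into one set of matched words.
matched_words = set(
    elem
    for m in set(expansion_dict.keys()).intersection(lookup_values)
    for elem in expansion_dict[m]
)
assert matched_words == {"Jose", "José"}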