Transformations lookup #19

Merged · 14 commits · Feb 2, 2024
Changes from 1 commit
integrated expander into TokenList and added testing
LydiaMennesHealth committed Jan 11, 2024
commit 07fe44ccc7f63c5cb6ec3cf6add29084b27597bb
41 changes: 21 additions & 20 deletions docdeid/ds/lookup.py
@@ -5,6 +5,7 @@
from typing import Iterable, Iterator, Optional, Union

from docdeid.ds.ds import Datastructure
from docdeid.str.expander import Expander
from docdeid.str.processor import StringModifier, StringProcessor, StripString


@@ -100,7 +101,6 @@ def remove_items_from_iterable(self, items: Iterable[str]) -> None:
"""

for item in items:

item = self._apply_matching_pipeline(item)

if item in self._items:
@@ -277,7 +277,6 @@ def add_item(self, item: list[str]) -> None:
self.is_terminal = True

else:

head, tail = self._apply_matching_pipeline(item[0]), item[1:]

if head not in self.children:
@@ -304,7 +303,7 @@ def __contains__(self, item: list[str]) -> bool:
return (head in self.children) and tail in self.children[head]

def longest_matching_prefix(
self, item: list[str], start_i: int = 0
self, item: list[str], start_i: int = 0, expander: Optional[Expander] = None
) -> Union[list[str], None]:
"""
Finds the longest matching prefix of a list of strings. This is used to find the
@@ -324,27 +323,29 @@ def longest_matching_prefix(

longest_match = None
current_node = self
# build the match on the fly so that the sequence that actually matched the trie is returned
match = []

for i in itertools.count():

if current_node.is_terminal:
longest_match = i

if start_i + i >= len(item) or (
self._apply_matching_pipeline(item[start_i + i])
not in current_node.children
):
if start_i + i >= len(item):
break

current_node = current_node.children[
self._apply_matching_pipeline(item[start_i + i])
]

return (
[
self._apply_matching_pipeline(item)
for item in item[start_i : start_i + longest_match]
]
if longest_match
else None
)
if expander is None:
cur_items = [self._apply_matching_pipeline(item[start_i + i])]
else:
cur_items = expander.expand_item(
self._apply_matching_pipeline(item[start_i + i])
)

# take the first expansion present in the trie, if any (like any(), but returning the matching value)
matched = next((t for t in cur_items if t in current_node.children), None)
if matched is None:
break

match.append(matched)
current_node = current_node.children[matched]

return match[:longest_match] if longest_match else None
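A rough usage sketch of the new expander argument (not part of the diff). The ToyExpander below is hypothetical and only overrides expand_item; it also assumes a LookupTrie can be constructed without arguments, as in the rest of docdeid.

from docdeid.ds.lookup import LookupTrie
from docdeid.str.expander import Expander


class ToyExpander(Expander):
    """Toy expander (illustration only): the item plus its titlecased form."""

    def expand_item(self, item: str) -> set[str]:
        return {item, item.title()}


trie = LookupTrie()
trie.add_item(["Jane", "Doe"])

# Without an expander, the lowercased token does not match the stored "Jane".
print(trie.longest_matching_prefix(["jane", "Doe"]))  # None

# With the expander, "jane" also offers "Jane", so the prefix matches; the
# returned match contains the keys stored in the trie.
print(trie.longest_matching_prefix(["jane", "Doe"], expander=ToyExpander()))  # ['Jane', 'Doe']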
26 changes: 10 additions & 16 deletions docdeid/process/annotator.py
@@ -8,6 +8,7 @@
from docdeid.ds.lookup import LookupSet, LookupTrie
from docdeid.pattern import TokenPattern
from docdeid.process.doc_processor import DocProcessor
from docdeid.str.expander import Expander
from docdeid.str.processor import StringModifier
from docdeid.tokenizer import Token, Tokenizer

@@ -68,14 +69,12 @@ def __init__(
tokenizer_name: str = "default",
**kwargs,
) -> None:

self.lookup_set = LookupSet(matching_pipeline=matching_pipeline)
self.lookup_set.add_items_from_iterable(items=lookup_values)
self._tokenizer_name = tokenizer_name
super().__init__(*args, **kwargs)

def _tokens_to_annotations(self, tokens: Iterable[Token]) -> list[Annotation]:

return [
Annotation(
text=token.text,
@@ -90,7 +89,6 @@ def _tokens_to_annotations(self, tokens: Iterable[Token]) -> list[Annotation]:
]

def annotate(self, doc: Document) -> list[Annotation]:

tokens = doc.get_tokens(tokenizer_name=self._tokenizer_name)

annotate_tokens = tokens.token_lookup(
@@ -131,13 +129,12 @@ def __init__(
tokenizer: Optional[Tokenizer] = None,
trie: Optional[LookupTrie] = None,
overlapping: bool = False,
expander: Optional[Expander] = None,
**kwargs,
) -> None:

self._start_words: set[str] = set()

if (trie is not None) and (lookup_values is None) and (tokenizer is None):

self._trie = trie
self._matching_pipeline = trie.matching_pipeline or []
self._start_words = set(trie.children.keys())
@@ -153,15 +150,14 @@ def __init__(
)

self.overlapping = overlapping
self.expander = expander

super().__init__(*args, **kwargs)

def _init_lookup_structures(
self, lookup_values: Iterable[str], tokenizer: Tokenizer
) -> None:

for val in lookup_values:

texts = [token.text for token in tokenizer.tokenize(val)]

if len(texts) > 0:
@@ -175,12 +171,13 @@ def annotate(self, doc: Document) -> list[Annotation]:
self._start_words.add(start_token)

def annotate(self, doc: Document) -> list[Annotation]:

tokens = doc.get_tokens()

start_tokens = sorted(
tokens.token_lookup(
self._start_words, matching_pipeline=self._matching_pipeline
self._start_words,
matching_pipeline=self._matching_pipeline,
expander=self.expander,
),
key=lambda token: token.start_char,
)
@@ -192,12 +189,11 @@ def annotate(self, doc: Document) -> list[Annotation]:
min_i = 0

for i in start_indices:

if i < min_i:
continue

longest_matching_prefix = self._trie.longest_matching_prefix(
tokens_text, start_i=i
tokens_text, start_i=i, expander=self.expander
)

if longest_matching_prefix is None:
@@ -247,7 +243,6 @@ def __init__(
pre_match_words: Optional[list[str]] = None,
**kwargs,
) -> None:

if isinstance(regexp_pattern, str):
regexp_pattern = re.compile(regexp_pattern)

@@ -264,12 +259,13 @@ def __init__(
super().__init__(*args, **kwargs)

def _validate_match(
self, match: re.Match, doc: Document # pylint: disable=W0613
self,
match: re.Match,
doc: Document, # pylint: disable=W0613
) -> bool:
return True

def annotate(self, doc: Document) -> list[Annotation]:

if self.pre_match_words is not None:
try:
if (
@@ -284,7 +280,6 @@ def annotate(self, doc: Document) -> list[Annotation]:
annotations = []

for match in self.regexp_pattern.finditer(doc.text):

if not self._validate_match(match, doc):
continue

@@ -324,7 +319,6 @@ def annotate(self, doc: Document) -> list[Annotation]:
return annotations

for token in doc.get_tokens():

if not self.pattern.token_precondition(token):
continue

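A hedged construction sketch for the new expander parameter of MultiTokenLookupAnnotator (not part of the diff). It assumes the base Annotator accepts a tag argument and that WordBoundaryTokenizer lives in docdeid.tokenizer, as used elsewhere in docdeid; the ToyExpander is hypothetical.

from docdeid.process.annotator import MultiTokenLookupAnnotator
from docdeid.str.expander import Expander
from docdeid.tokenizer import WordBoundaryTokenizer


class ToyExpander(Expander):
    """Toy expander (illustration only): the item plus its titlecased form."""

    def expand_item(self, item: str) -> set[str]:
        return {item, item.title()}


annotator = MultiTokenLookupAnnotator(
    lookup_values=["Jane Doe"],
    tokenizer=WordBoundaryTokenizer(),
    expander=ToyExpander(),
    tag="name",
)
# annotate() now passes the expander both to token_lookup (to find candidate
# start tokens) and to longest_matching_prefix (to walk the trie), so a
# lowercased mention such as "jane doe" can still be annotated.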
25 changes: 24 additions & 1 deletion docdeid/str/expander.py
@@ -1,4 +1,5 @@
from abc import ABC, abstractmethod
from typing import Iterable

from docdeid.str.processor import StringModifier

@@ -7,7 +8,7 @@ class Expander(ABC):
"""Abstract class for string expansion."""

@abstractmethod
def expand_item(self, item: str) -> list[str]:
def expand_item(self, item: str) -> set[str]:
"""
Expand a string into a set of strings that contains the original and possibly additional strings.

@@ -18,6 +19,28 @@ def expand_item(self, item: str) -> list[str]:
The expanded items.
"""

def expand_item_iterable(self, items: Iterable[str]) -> set[str]:
"""
Expand an iterable of strings into a set that contains the originals and possibly additional strings.

Args:
items: The input strings.

Returns:
The expanded set of strings.
"""
return set.union(*(self.expand_item(item) for item in items))

def get_expansion_to_original_dict(self, items: Iterable[str]) -> dict[str, str]:
"""Expand a set of strings into a dictionary where the keys are results from expand_item and values the original text."""

# This can get overwritten if different original texts map to the same expansion due to multiple operations...
result_dict = {}
for item in items:
for expansion in self.expand_item(item):
result_dict[expansion] = item
return result_dict


class MinimumLengthExpander(Expander):
"""Expands a string by applying the lists of string processors. These are only applied to tokens whose length >= minimum length"""
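For illustration (not part of the diff), a sketch of how the two new helper methods behave with a hypothetical subclass, including the documented caveat that colliding expansions overwrite earlier dictionary entries.

from docdeid.str.expander import Expander


class ToyExpander(Expander):
    """Toy expander (illustration only): the item plus its titlecased form."""

    def expand_item(self, item: str) -> set[str]:
        return {item, item.title()}


expander = ToyExpander()

# expand_item_iterable unions the expansions of all items.
print(expander.expand_item_iterable(["jane", "doe"]))
# {'jane', 'Jane', 'doe', 'Doe'} (set order may vary)

# get_expansion_to_original_dict maps each expansion back to its original item;
# "Jane" is produced by both inputs, so the later original overwrites the earlier one.
print(expander.get_expansion_to_original_dict(["jane", "Jane"]))
# {'jane': 'jane', 'Jane': 'Jane'}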
7 changes: 7 additions & 0 deletions docdeid/str/processor.py
@@ -168,3 +168,10 @@

def filter(self, item: str) -> bool:
return len(item) >= self.min_len


class TitleCase(StringModifier):
"""Titlecase string."""

def process(self, item: str) -> str:
return item.title()
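A quick illustration of the new modifier (assuming, as with the other modifiers in this module, that no constructor arguments are needed):

from docdeid.str.processor import TitleCase

# str.title() uppercases the first letter of every word.
print(TitleCase().process("jane van der doe"))  # "Jane Van Der Doe"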
24 changes: 16 additions & 8 deletions docdeid/tokenizer.py
@@ -7,6 +7,7 @@
from typing import Iterator, Literal, Optional

from docdeid.str import StringModifier
from docdeid.str.expander import Expander


@dataclass(frozen=True)
@@ -149,7 +150,6 @@ def __init__(self, tokens: list[Token], link_tokens: bool = True) -> None:
self._text_to_tokens: dict[str, defaultdict[str, list[Token]]] = {}

def _link_tokens(self) -> None:

for i in range(len(self._tokens) - 1):
self._tokens[i].set_next_token(self._tokens[i + 1])
self._tokens[i + 1].set_previous_token(self._tokens[i])
@@ -168,15 +168,13 @@ def token_index(self, token: Token) -> int:
def _init_token_lookup(
self, matching_pipeline: Optional[list[StringModifier]] = None
) -> None:

matching_pipeline = matching_pipeline or []
pipe_key = str(matching_pipeline)

words = set()
text_to_tokens = defaultdict(list)

for token in self._tokens:

text = token.text

for string_modifier in matching_pipeline:
@@ -213,6 +211,7 @@ def token_lookup(
self,
lookup_values: set[str],
matching_pipeline: Optional[list[StringModifier]] = None,
expander: Optional[Expander] = None,
) -> set[Token]:
"""
Lookup all tokens of which the text matches a certain set of lookup values.
@@ -229,27 +228,37 @@
matching_pipeline = matching_pipeline or []
pipe_key = str(matching_pipeline)

# TODO: this initialisation is already part of get_words; consider removing it here.
if pipe_key not in self._text_to_tokens:
self._init_token_lookup(matching_pipeline)

tokens = set()
words = self.get_words(matching_pipeline)

for word in words.intersection(lookup_values):
if expander is None:
matched_words = words.intersection(lookup_values)

else:
# make expansions if expander is provided
expansion_dict = expander.get_expansion_to_original_dict(words)
# collect the original words whose expansion (or the original itself) matched the lookup values
matched_words = [
expansion_dict[m]
for m in set(expansion_dict.keys()).intersection(lookup_values)
]

for word in matched_words:
tokens.update(self._text_to_tokens[pipe_key][word])

return tokens

def __iter__(self) -> Iterator[Token]:

return iter(self._tokens)

def __len__(self) -> int:

return len(self._tokens)

def __getitem__(self, index: int) -> Token:

return self._tokens[index]

def __eq__(self, other: object) -> bool:
@@ -343,7 +352,6 @@ def _split_text(self, text: str) -> list[Token]:
matches = [*re.finditer(r"\b", text)]

for start_match, end_match in zip(matches, matches[1:]):

start_char = start_match.span(0)[0]
end_char = end_match.span(0)[0]

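Finally, a hedged sketch of the expander-aware token_lookup (not part of the diff). It assumes Token takes text, start_char and end_char and that TokenList is built from a list of tokens, as elsewhere in docdeid; the ToyExpander is again hypothetical.

from docdeid.str.expander import Expander
from docdeid.tokenizer import Token, TokenList


class ToyExpander(Expander):
    """Toy expander (illustration only): the item plus its titlecased form."""

    def expand_item(self, item: str) -> set[str]:
        return {item, item.title()}


tokens = TokenList(
    [
        Token(text="jane", start_char=0, end_char=4),
        Token(text="visited", start_char=5, end_char=12),
    ]
)

# Without an expander, only the (pipeline-modified) token text is compared,
# so the lowercased token does not match the lookup value.
print(tokens.token_lookup({"Jane"}))  # set()

# With an expander, expansions of each word are compared as well, and the
# matching token is returned under its original text.
print(tokens.token_lookup({"Jane"}, expander=ToyExpander()))
# roughly: {Token(text='jane', start_char=0, end_char=4)}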