Transformations lookup #19

Merged · 14 commits · Feb 2, 2024
Changes from 1 commit
cleaned up accidentally duplicated test
LydiaMennesHealth committed Jan 11, 2024
commit 03d8b8524861eec8c3ca774fd8668b980e82e528
3 changes: 1 addition & 2 deletions docdeid/tokenizer.py
@@ -239,9 +239,8 @@ def token_lookup(
         matched_words = words.intersection(lookup_values)
 
     else:
         # make expansions if expander is provided
         expansion_dict = expander.get_expansion_to_original_dict(words)
-        # get the original words of which the expansion or original matched the lookup values
+        # get the original words of which the expansion matched the lookup values
         matched_words = [
             expansion_dict[m]
             for m in set(expansion_dict.keys()).intersection(lookup_values)
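
For readers skimming the hunk, a minimal runnable sketch of what the else-branch computes: expansions are generated for the observed words, and a hit on an expansion is traced back to the original word. Only get_expansion_to_original_dict and the list comprehension come from the diff; ToyExpander and its suffix rule are hypothetical stand-ins, not docdeid's actual expander.

# Sketch of the expander branch of token_lookup (see diff above).
# ToyExpander is a hypothetical stand-in; only the dict's contract and the
# matching logic are taken from the diff.
class ToyExpander:
    def get_expansion_to_original_dict(self, words: set[str]) -> dict[str, str]:
        # Map every expansion back to the word it came from, so a hit on an
        # expansion can be traced to the original token.
        return {w + "s": w for w in words}  # toy rule: one expansion per word

words = {"name", "wife"}
lookup_values = {"wifes"}  # only the expansion of "wife" is in the lookup set

expansion_dict = ToyExpander().get_expansion_to_original_dict(words)
matched_words = [
    expansion_dict[m]
    for m in set(expansion_dict.keys()).intersection(lookup_values)
]
assert matched_words == ["wife"]  # the original word, not its expansion
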
308 changes: 0 additions & 308 deletions tests/unit/process/test_annotator.py
@@ -1,7 +1,6 @@
 import re
 from unittest.mock import patch
 
-import docdeid.ds
 from docdeid.annotation import Annotation
 from docdeid.document import Document
 from docdeid.pattern import TokenPattern
@@ -101,163 +100,6 @@ def test_multi_token_lookup_with_overlap(self, long_text, long_tokenlist):
Annotation(text="dr. John", start_char=11, end_char=19, tag="prefix"),
Annotation(text="John Smith", start_char=15, end_char=25, tag="prefix"),
]


class TestMultiTokenLookupAnnotator:
def test_multi_token(self, long_text, long_tokenlist):
doc = Document(long_text)
annotator = MultiTokenLookupAnnotator(
lookup_values=["my name", "my wife"],
tokenizer=WordBoundaryTokenizer(),
tag="prefix",
)
expected_annotations = [
Annotation(text="my wife", start_char=39, end_char=46, tag="prefix"),
]

with patch.object(doc, "get_tokens", return_value=long_tokenlist):
annotations = annotator.annotate(doc)

assert annotations == expected_annotations

def test_multi_token_with_matching_pipeline(self, long_text, long_tokenlist):
doc = Document(long_text)

annotator = MultiTokenLookupAnnotator(
lookup_values=["my name", "my wife"],
tokenizer=WordBoundaryTokenizer(),
matching_pipeline=[LowercaseString()],
tag="prefix",
)
expected_annotations = [
Annotation(text="My name", start_char=0, end_char=7, tag="prefix"),
Annotation(text="my wife", start_char=39, end_char=46, tag="prefix"),
]

with patch.object(doc, "get_tokens", return_value=long_tokenlist):
annotations = annotator.annotate(doc)

assert annotations == expected_annotations

def test_multi_token_lookup_with_overlap(self, long_text, long_tokenlist):
doc = Document(long_text)

annotator = MultiTokenLookupAnnotator(
lookup_values=["dr. John", "John Smith"],
tokenizer=WordBoundaryTokenizer(),
tag="prefix",
overlapping=True,
)

expected_annotations = [
Annotation(text="dr. John", start_char=11, end_char=19, tag="prefix"),
Annotation(text="John Smith", start_char=15, end_char=25, tag="prefix"),
]


class TestMultiTokenLookupAnnotator:
def test_multi_token(self, long_text, long_tokenlist):
doc = Document(long_text)
annotator = MultiTokenLookupAnnotator(
lookup_values=["my name", "my wife"],
tokenizer=WordBoundaryTokenizer(),
tag="prefix",
)
expected_annotations = [
Annotation(text="my wife", start_char=39, end_char=46, tag="prefix"),
]

with patch.object(doc, "get_tokens", return_value=long_tokenlist):
annotations = annotator.annotate(doc)

assert annotations == expected_annotations

def test_multi_token_with_matching_pipeline(self, long_text, long_tokenlist):
doc = Document(long_text)

annotator = MultiTokenLookupAnnotator(
lookup_values=["my name", "my wife"],
tokenizer=WordBoundaryTokenizer(),
matching_pipeline=[LowercaseString()],
tag="prefix",
)
expected_annotations = [
Annotation(text="My name", start_char=0, end_char=7, tag="prefix"),
Annotation(text="my wife", start_char=39, end_char=46, tag="prefix"),
]

with patch.object(doc, "get_tokens", return_value=long_tokenlist):
annotations = annotator.annotate(doc)

assert annotations == expected_annotations

def test_multi_token_lookup_with_overlap(self, long_text, long_tokenlist):
doc = Document(long_text)

annotator = MultiTokenLookupAnnotator(
lookup_values=["dr. John", "John Smith"],
tokenizer=WordBoundaryTokenizer(),
tag="prefix",
overlapping=True,
)

expected_annotations = [
Annotation(text="dr. John", start_char=11, end_char=19, tag="prefix"),
Annotation(text="John Smith", start_char=15, end_char=25, tag="prefix"),
]


class TestMultiTokenLookupAnnotator:
def test_multi_token(self, long_text, long_tokenlist):
doc = Document(long_text)
annotator = MultiTokenLookupAnnotator(
lookup_values=["my name", "my wife"],
tokenizer=WordBoundaryTokenizer(),
tag="prefix",
)
expected_annotations = [
Annotation(text="my wife", start_char=39, end_char=46, tag="prefix"),
]

with patch.object(doc, "get_tokens", return_value=long_tokenlist):
annotations = annotator.annotate(doc)

assert annotations == expected_annotations

def test_multi_token_with_matching_pipeline(self, long_text, long_tokenlist):
doc = Document(long_text)

annotator = MultiTokenLookupAnnotator(
lookup_values=["my name", "my wife"],
tokenizer=WordBoundaryTokenizer(),
matching_pipeline=[LowercaseString()],
tag="prefix",
)
expected_annotations = [
Annotation(text="My name", start_char=0, end_char=7, tag="prefix"),
Annotation(text="my wife", start_char=39, end_char=46, tag="prefix"),
]

with patch.object(doc, "get_tokens", return_value=long_tokenlist):
annotations = annotator.annotate(doc)

assert annotations == expected_annotations

def test_multi_token_lookup_with_overlap(self, long_text, long_tokenlist):
doc = Document(long_text)

annotator = MultiTokenLookupAnnotator(
lookup_values=["dr. John", "John Smith"],
tokenizer=WordBoundaryTokenizer(),
tag="prefix",
overlapping=True,
)

expected_annotations = [
Annotation(text="dr. John", start_char=11, end_char=19, tag="prefix"),
Annotation(text="John Smith", start_char=15, end_char=25, tag="prefix"),
]

with patch.object(doc, "get_tokens", return_value=long_tokenlist):
annotations = annotator.annotate(doc)

@@ -286,156 +128,6 @@ def test_multi_token_with_expander(self, long_text, long_tokenlist):
         assert annotations == expected_annotations
 
-
-class TestMultiTokenLookupAnnotator:
-    def test_multi_token(self, long_text, long_tokenlist):
-        doc = Document(long_text)
-        annotator = MultiTokenLookupAnnotator(
-            lookup_values=["my name", "my wife"],
-            tokenizer=WordBoundaryTokenizer(),
-            tag="prefix",
-        )
-        expected_annotations = [
-            Annotation(text="my wife", start_char=39, end_char=46, tag="prefix"),
-        ]
-
-        with patch.object(doc, "get_tokens", return_value=long_tokenlist):
-            annotations = annotator.annotate(doc)
-
-        assert annotations == expected_annotations
-
-    def test_multi_token_with_matching_pipeline(self, long_text, long_tokenlist):
-        doc = Document(long_text)
-
-        annotator = MultiTokenLookupAnnotator(
-            lookup_values=["my name", "my wife"],
-            tokenizer=WordBoundaryTokenizer(),
-            matching_pipeline=[LowercaseString()],
-            tag="prefix",
-        )
-        expected_annotations = [
-            Annotation(text="My name", start_char=0, end_char=7, tag="prefix"),
-            Annotation(text="my wife", start_char=39, end_char=46, tag="prefix"),
-        ]
-
-        with patch.object(doc, "get_tokens", return_value=long_tokenlist):
-            annotations = annotator.annotate(doc)
-
-        assert annotations == expected_annotations
-
-    def test_multi_token_lookup_with_overlap(self, long_text, long_tokenlist):
-        doc = Document(long_text)
-
-        annotator = MultiTokenLookupAnnotator(
-            lookup_values=["dr. John", "John Smith"],
-            tokenizer=WordBoundaryTokenizer(),
-            tag="prefix",
-            overlapping=True,
-        )
-
-        expected_annotations = [
-            Annotation(text="dr. John", start_char=11, end_char=19, tag="prefix"),
-            Annotation(text="John Smith", start_char=15, end_char=25, tag="prefix"),
-        ]
-
-
-class TestMultiTokenLookupAnnotator:
-    def test_multi_token(self, long_text, long_tokenlist):
-        doc = Document(long_text)
-        annotator = MultiTokenLookupAnnotator(
-            lookup_values=["my name", "my wife"],
-            tokenizer=WordBoundaryTokenizer(),
-            tag="prefix",
-        )
-        expected_annotations = [
-            Annotation(text="my wife", start_char=39, end_char=46, tag="prefix"),
-        ]
-
-        with patch.object(doc, "get_tokens", return_value=long_tokenlist):
-            annotations = annotator.annotate(doc)
-
-        assert annotations == expected_annotations
-
-    def test_multi_token_with_matching_pipeline(self, long_text, long_tokenlist):
-        doc = Document(long_text)
-
-        annotator = MultiTokenLookupAnnotator(
-            lookup_values=["my name", "my wife"],
-            tokenizer=WordBoundaryTokenizer(),
-            matching_pipeline=[LowercaseString()],
-            tag="prefix",
-        )
-        expected_annotations = [
-            Annotation(text="My name", start_char=0, end_char=7, tag="prefix"),
-            Annotation(text="my wife", start_char=39, end_char=46, tag="prefix"),
-        ]
-
-        with patch.object(doc, "get_tokens", return_value=long_tokenlist):
-            annotations = annotator.annotate(doc)
-
-        assert annotations == expected_annotations
-
-    def test_multi_token_lookup_with_overlap(self, long_text, long_tokenlist):
-        doc = Document(long_text)
-
-        annotator = MultiTokenLookupAnnotator(
-            lookup_values=["dr. John", "John Smith"],
-            tokenizer=WordBoundaryTokenizer(),
-            tag="prefix",
-            overlapping=True,
-        )
-
-        expected_annotations = [
-            Annotation(text="dr. John", start_char=11, end_char=19, tag="prefix"),
-            Annotation(text="John Smith", start_char=15, end_char=25, tag="prefix"),
-        ]
-
-        with patch.object(doc, "get_tokens", return_value=long_tokenlist):
-            annotations = annotator.annotate(doc)
-
-        assert annotations == expected_annotations
-
-    def test_multi_token_lookup_no_overlap(self, long_text, long_tokenlist):
-        doc = Document(long_text)
-
-        annotator = MultiTokenLookupAnnotator(
-            lookup_values=["dr. John", "John Smith"],
-            tokenizer=WordBoundaryTokenizer(),
-            tag="prefix",
-            overlapping=False,
-        )
-
-        expected_annotations = [
-            Annotation(text="dr. John", start_char=11, end_char=19, tag="prefix"),
-        ]
-
-        with patch.object(doc, "get_tokens", return_value=long_tokenlist):
-            annotations = annotator.annotate(doc)
-
-        assert annotations == expected_annotations
-
-    def test_multi_token_lookup_with_trie(self, long_text, long_tokenlist):
-        doc = Document(long_text)
-
-        trie = docdeid.ds.LookupTrie(matching_pipeline=[LowercaseString()])
-        trie.add_item(["my", " ", "name"])
-        trie.add_item(["my", " ", "wife"])
-
-        annotator = MultiTokenLookupAnnotator(
-            trie=trie,
-            tag="prefix",
-        )
-
-        expected_annotations = [
-            Annotation(text="My name", start_char=0, end_char=7, tag="prefix"),
-            Annotation(text="my wife", start_char=39, end_char=46, tag="prefix"),
-        ]
-
-        with patch.object(doc, "get_tokens", return_value=long_tokenlist):
-            annotations = annotator.annotate(doc)
-
-        assert annotations == expected_annotations
-
 
 class TestRegexpAnnotator:
     def test_regexp_annotator(self, long_text):
         doc = Document(long_text)
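
As background for the overlap tests above (both the deleted duplicates and their kept originals), a minimal usage sketch of the overlapping flag. The import paths, the Document constructor arguments, and the input text are assumptions for illustration; the lookup values, tag, and expected spans come from the tests.

# Usage sketch of MultiTokenLookupAnnotator's overlapping flag. Import paths
# and the tokenizers argument are assumptions, not confirmed by this diff.
from docdeid.document import Document
from docdeid.process import MultiTokenLookupAnnotator
from docdeid.tokenizer import WordBoundaryTokenizer

# Chosen so "dr. John" spans chars 11-19 and "John Smith" spans 15-25,
# matching the fixtures in the tests above.
text = "a visit of dr. John Smith"

for overlapping in (True, False):
    annotator = MultiTokenLookupAnnotator(
        lookup_values=["dr. John", "John Smith"],
        tokenizer=WordBoundaryTokenizer(),
        tag="prefix",
        overlapping=overlapping,
    )
    doc = Document(text, tokenizers={"default": WordBoundaryTokenizer()})
    annotations = annotator.annotate(doc)
    # overlapping=True  -> both "dr. John" and "John Smith" match, even
    #                      though they share the token "John"
    # overlapping=False -> the earlier match wins; only "dr. John" is kept
    print(overlapping, sorted(a.text for a in annotations))
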