Transformations lookup #19

Merged · 14 commits · Feb 2, 2024
Changes from 1 commit
cleaned up accidentally duplicated test
LydiaMennesHealth committed Jan 11, 2024
commit 03d8b8524861eec8c3ca774fd8668b980e82e528
3 changes: 1 addition & 2 deletions docdeid/tokenizer.py
@@ -239,9 +239,8 @@ def token_lookup(
         matched_words = words.intersection(lookup_values)
 
     else:
         # make expansions if expander is provided
         expansion_dict = expander.get_expansion_to_original_dict(words)
-        # get the original words of which the expansion or original matched the lookup values
+        # get the original words of which the expansion matched the lookup values
         matched_words = [
             expansion_dict[m]
             for m in set(expansion_dict.keys()).intersection(lookup_values)
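
For readers skimming the hunk, a minimal runnable sketch of what the else-branch computes: expansions are generated for the observed words, and a hit on an expansion is traced back to the original word. Only get_expansion_to_original_dict and the list comprehension come from the diff; ToyExpander and its suffix rule are hypothetical stand-ins, not docdeid's actual expander.

# Sketch of the expander branch of token_lookup (see diff above).
# ToyExpander is a hypothetical stand-in; only the dict's contract and the
# matching logic are taken from the diff.
class ToyExpander:
    def get_expansion_to_original_dict(self, words: set[str]) -> dict[str, str]:
        # Map every expansion back to the word it came from, so a hit on an
        # expansion can be traced to the original token.
        return {w + "s": w for w in words}  # toy rule: one expansion per word

words = {"name", "wife"}
lookup_values = {"wifes"}  # only the expansion of "wife" is in the lookup set

expansion_dict = ToyExpander().get_expansion_to_original_dict(words)
matched_words = [
    expansion_dict[m]
    for m in set(expansion_dict.keys()).intersection(lookup_values)
]
assert matched_words == ["wife"]  # the original word, not its expansion
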
308 changes: 0 additions & 308 deletions tests/unit/process/test_annotator.py
@@ -1,7 +1,6 @@
 import re
 from unittest.mock import patch
 
-import docdeid.ds
 from docdeid.annotation import Annotation
 from docdeid.document import Document
 from docdeid.pattern import TokenPattern
@@ -101,163 +100,6 @@ def test_multi_token_lookup_with_overlap(self, long_text, long_tokenlist):
Annotation(text="dr. John", start_char=11, end_char=19, tag="prefix"),
Annotation(text="John Smith", start_char=15, end_char=25, tag="prefix"),
]


class TestMultiTokenLookupAnnotator:
def test_multi_token(self, long_text, long_tokenlist):
doc = Document(long_text)
annotator = MultiTokenLookupAnnotator(
lookup_values=["my name", "my wife"],
tokenizer=WordBoundaryTokenizer(),
tag="prefix",
)
expected_annotations = [
Annotation(text="my wife", start_char=39, end_char=46, tag="prefix"),
]

with patch.object(doc, "get_tokens", return_value=long_tokenlist):
annotations = annotator.annotate(doc)

assert annotations == expected_annotations

def test_multi_token_with_matching_pipeline(self, long_text, long_tokenlist):
doc = Document(long_text)

annotator = MultiTokenLookupAnnotator(
lookup_values=["my name", "my wife"],
tokenizer=WordBoundaryTokenizer(),
matching_pipeline=[LowercaseString()],
tag="prefix",
)
expected_annotations = [
Annotation(text="My name", start_char=0, end_char=7, tag="prefix"),
Annotation(text="my wife", start_char=39, end_char=46, tag="prefix"),
]

with patch.object(doc, "get_tokens", return_value=long_tokenlist):
annotations = annotator.annotate(doc)

assert annotations == expected_annotations

def test_multi_token_lookup_with_overlap(self, long_text, long_tokenlist):
doc = Document(long_text)

annotator = MultiTokenLookupAnnotator(
lookup_values=["dr. John", "John Smith"],
tokenizer=WordBoundaryTokenizer(),
tag="prefix",
overlapping=True,
)

expected_annotations = [
Annotation(text="dr. John", start_char=11, end_char=19, tag="prefix"),
Annotation(text="John Smith", start_char=15, end_char=25, tag="prefix"),
]


class TestMultiTokenLookupAnnotator:
def test_multi_token(self, long_text, long_tokenlist):
doc = Document(long_text)
annotator = MultiTokenLookupAnnotator(
lookup_values=["my name", "my wife"],
tokenizer=WordBoundaryTokenizer(),
tag="prefix",
)
expected_annotations = [
Annotation(text="my wife", start_char=39, end_char=46, tag="prefix"),
]

with patch.object(doc, "get_tokens", return_value=long_tokenlist):
annotations = annotator.annotate(doc)

assert annotations == expected_annotations

def test_multi_token_with_matching_pipeline(self, long_text, long_tokenlist):
doc = Document(long_text)

annotator = MultiTokenLookupAnnotator(
lookup_values=["my name", "my wife"],
tokenizer=WordBoundaryTokenizer(),
matching_pipeline=[LowercaseString()],
tag="prefix",
)
expected_annotations = [
Annotation(text="My name", start_char=0, end_char=7, tag="prefix"),
Annotation(text="my wife", start_char=39, end_char=46, tag="prefix"),
]

with patch.object(doc, "get_tokens", return_value=long_tokenlist):
annotations = annotator.annotate(doc)

assert annotations == expected_annotations

def test_multi_token_lookup_with_overlap(self, long_text, long_tokenlist):
doc = Document(long_text)

annotator = MultiTokenLookupAnnotator(
lookup_values=["dr. John", "John Smith"],
tokenizer=WordBoundaryTokenizer(),
tag="prefix",
overlapping=True,
)

expected_annotations = [
Annotation(text="dr. John", start_char=11, end_char=19, tag="prefix"),
Annotation(text="John Smith", start_char=15, end_char=25, tag="prefix"),
]


class TestMultiTokenLookupAnnotator:
def test_multi_token(self, long_text, long_tokenlist):
doc = Document(long_text)
annotator = MultiTokenLookupAnnotator(
lookup_values=["my name", "my wife"],
tokenizer=WordBoundaryTokenizer(),
tag="prefix",
)
expected_annotations = [
Annotation(text="my wife", start_char=39, end_char=46, tag="prefix"),
]

with patch.object(doc, "get_tokens", return_value=long_tokenlist):
annotations = annotator.annotate(doc)

assert annotations == expected_annotations

def test_multi_token_with_matching_pipeline(self, long_text, long_tokenlist):
doc = Document(long_text)

annotator = MultiTokenLookupAnnotator(
lookup_values=["my name", "my wife"],
tokenizer=WordBoundaryTokenizer(),
matching_pipeline=[LowercaseString()],
tag="prefix",
)
expected_annotations = [
Annotation(text="My name", start_char=0, end_char=7, tag="prefix"),
Annotation(text="my wife", start_char=39, end_char=46, tag="prefix"),
]

with patch.object(doc, "get_tokens", return_value=long_tokenlist):
annotations = annotator.annotate(doc)

assert annotations == expected_annotations

def test_multi_token_lookup_with_overlap(self, long_text, long_tokenlist):
doc = Document(long_text)

annotator = MultiTokenLookupAnnotator(
lookup_values=["dr. John", "John Smith"],
tokenizer=WordBoundaryTokenizer(),
tag="prefix",
overlapping=True,
)

expected_annotations = [
Annotation(text="dr. John", start_char=11, end_char=19, tag="prefix"),
Annotation(text="John Smith", start_char=15, end_char=25, tag="prefix"),
]

with patch.object(doc, "get_tokens", return_value=long_tokenlist):
annotations = annotator.annotate(doc)

@@ -286,156 +128,6 @@ def test_multi_token_with_expander(self, long_text, long_tokenlist):
         assert annotations == expected_annotations
 
-
-class TestMultiTokenLookupAnnotator:
-    def test_multi_token(self, long_text, long_tokenlist):
-        doc = Document(long_text)
-        annotator = MultiTokenLookupAnnotator(
-            lookup_values=["my name", "my wife"],
-            tokenizer=WordBoundaryTokenizer(),
-            tag="prefix",
-        )
-        expected_annotations = [
-            Annotation(text="my wife", start_char=39, end_char=46, tag="prefix"),
-        ]
-
-        with patch.object(doc, "get_tokens", return_value=long_tokenlist):
-            annotations = annotator.annotate(doc)
-
-        assert annotations == expected_annotations
-
-    def test_multi_token_with_matching_pipeline(self, long_text, long_tokenlist):
-        doc = Document(long_text)
-
-        annotator = MultiTokenLookupAnnotator(
-            lookup_values=["my name", "my wife"],
-            tokenizer=WordBoundaryTokenizer(),
-            matching_pipeline=[LowercaseString()],
-            tag="prefix",
-        )
-        expected_annotations = [
-            Annotation(text="My name", start_char=0, end_char=7, tag="prefix"),
-            Annotation(text="my wife", start_char=39, end_char=46, tag="prefix"),
-        ]
-
-        with patch.object(doc, "get_tokens", return_value=long_tokenlist):
-            annotations = annotator.annotate(doc)
-
-        assert annotations == expected_annotations
-
-    def test_multi_token_lookup_with_overlap(self, long_text, long_tokenlist):
-        doc = Document(long_text)
-
-        annotator = MultiTokenLookupAnnotator(
-            lookup_values=["dr. John", "John Smith"],
-            tokenizer=WordBoundaryTokenizer(),
-            tag="prefix",
-            overlapping=True,
-        )
-
-        expected_annotations = [
-            Annotation(text="dr. John", start_char=11, end_char=19, tag="prefix"),
-            Annotation(text="John Smith", start_char=15, end_char=25, tag="prefix"),
-        ]
-
-
-class TestMultiTokenLookupAnnotator:
-    def test_multi_token(self, long_text, long_tokenlist):
-        doc = Document(long_text)
-        annotator = MultiTokenLookupAnnotator(
-            lookup_values=["my name", "my wife"],
-            tokenizer=WordBoundaryTokenizer(),
-            tag="prefix",
-        )
-        expected_annotations = [
-            Annotation(text="my wife", start_char=39, end_char=46, tag="prefix"),
-        ]
-
-        with patch.object(doc, "get_tokens", return_value=long_tokenlist):
-            annotations = annotator.annotate(doc)
-
-        assert annotations == expected_annotations
-
-    def test_multi_token_with_matching_pipeline(self, long_text, long_tokenlist):
-        doc = Document(long_text)
-
-        annotator = MultiTokenLookupAnnotator(
-            lookup_values=["my name", "my wife"],
-            tokenizer=WordBoundaryTokenizer(),
-            matching_pipeline=[LowercaseString()],
-            tag="prefix",
-        )
-        expected_annotations = [
-            Annotation(text="My name", start_char=0, end_char=7, tag="prefix"),
-            Annotation(text="my wife", start_char=39, end_char=46, tag="prefix"),
-        ]
-
-        with patch.object(doc, "get_tokens", return_value=long_tokenlist):
-            annotations = annotator.annotate(doc)
-
-        assert annotations == expected_annotations
-
-    def test_multi_token_lookup_with_overlap(self, long_text, long_tokenlist):
-        doc = Document(long_text)
-
-        annotator = MultiTokenLookupAnnotator(
-            lookup_values=["dr. John", "John Smith"],
-            tokenizer=WordBoundaryTokenizer(),
-            tag="prefix",
-            overlapping=True,
-        )
-
-        expected_annotations = [
-            Annotation(text="dr. John", start_char=11, end_char=19, tag="prefix"),
-            Annotation(text="John Smith", start_char=15, end_char=25, tag="prefix"),
-        ]
-
-        with patch.object(doc, "get_tokens", return_value=long_tokenlist):
-            annotations = annotator.annotate(doc)
-
-        assert annotations == expected_annotations
-
-    def test_multi_token_lookup_no_overlap(self, long_text, long_tokenlist):
-        doc = Document(long_text)
-
-        annotator = MultiTokenLookupAnnotator(
-            lookup_values=["dr. John", "John Smith"],
-            tokenizer=WordBoundaryTokenizer(),
-            tag="prefix",
-            overlapping=False,
-        )
-
-        expected_annotations = [
-            Annotation(text="dr. John", start_char=11, end_char=19, tag="prefix"),
-        ]
-
-        with patch.object(doc, "get_tokens", return_value=long_tokenlist):
-            annotations = annotator.annotate(doc)
-
-        assert annotations == expected_annotations
-
-    def test_multi_token_lookup_with_trie(self, long_text, long_tokenlist):
-        doc = Document(long_text)
-
-        trie = docdeid.ds.LookupTrie(matching_pipeline=[LowercaseString()])
-        trie.add_item(["my", " ", "name"])
-        trie.add_item(["my", " ", "wife"])
-
-        annotator = MultiTokenLookupAnnotator(
-            trie=trie,
-            tag="prefix",
-        )
-
-        expected_annotations = [
-            Annotation(text="My name", start_char=0, end_char=7, tag="prefix"),
-            Annotation(text="my wife", start_char=39, end_char=46, tag="prefix"),
-        ]
-
-        with patch.object(doc, "get_tokens", return_value=long_tokenlist):
-            annotations = annotator.annotate(doc)
-
-        assert annotations == expected_annotations
-
 
 class TestRegexpAnnotator:
     def test_regexp_annotator(self, long_text):
         doc = Document(long_text)
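
As background for the overlap tests above (both the deleted duplicates and their kept originals), a minimal usage sketch of the overlapping flag. The import paths, the Document constructor arguments, and the input text are assumptions for illustration; the lookup values, tag, and expected spans come from the tests.

# Usage sketch of MultiTokenLookupAnnotator's overlapping flag. Import paths
# and the tokenizers argument are assumptions, not confirmed by this diff.
from docdeid.document import Document
from docdeid.process import MultiTokenLookupAnnotator
from docdeid.tokenizer import WordBoundaryTokenizer

# Chosen so "dr. John" spans chars 11-19 and "John Smith" spans 15-25,
# matching the fixtures in the tests above.
text = "a visit of dr. John Smith"

for overlapping in (True, False):
    annotator = MultiTokenLookupAnnotator(
        lookup_values=["dr. John", "John Smith"],
        tokenizer=WordBoundaryTokenizer(),
        tag="prefix",
        overlapping=overlapping,
    )
    doc = Document(text, tokenizers={"default": WordBoundaryTokenizer()})
    annotations = annotator.annotate(doc)
    # overlapping=True  -> both "dr. John" and "John Smith" match, even
    #                      though they share the token "John"
    # overlapping=False -> the earlier match wins; only "dr. John" is kept
    print(overlapping, sorted(a.text for a in annotations))
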