Transformations lookup #19

Merged · 14 commits · Feb 2, 2024
Changes from 1 commit

fixed issue with potentially overwritten values in expansion_to_original_dict; fixed formatting
LydiaMennesHealth committed Jan 17, 2024
commit 3c95c187b37754382ac1433da5d73c2fbcbb289d

docdeid/ds/lookup.py (4 changes: 2 additions & 2 deletions)

@@ -323,7 +323,6 @@ def longest_matching_prefix(
 
         longest_match = None
         current_node = self
-        # create match on the fly to maintain the property that the match that fit the trie is returned
         match = []
 
         for i in itertools.count():
@@ -340,7 +339,8 @@ def longest_matching_prefix(
                 self._apply_matching_pipeline(item[start_i + i])
             )
 
-            # get the value that matches the trie if any. Same as an any() call but returns the value that matched
+            # Get the value that matches the trie if any.
+            # This is lazy like an any() call but returns the value that matched
            matched = next((t for t in cur_items if t in current_node.children), None)
            if matched is None:
                break
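
Reviewer note: the next() idiom in this hunk is worth spelling out. A minimal standalone sketch (hypothetical data, not part of the diff) of how next() with a generator and a default scans lazily like any() but returns the matching element itself:

# Hypothetical stand-ins for current_node.children and cur_items.
children = {"b", "c"}
cur_items = ["a", "b", "c"]

# Stops at the first hit, like any(), but yields the element
# that matched (or the default None if nothing did).
matched = next((t for t in cur_items if t in children), None)
assert matched == "b"

# any() only reports that some element matched, not which one.
assert any(t in children for t in cur_items)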

docdeid/process/doc_processor.py (2 changes: 1 addition & 1 deletion)

@@ -28,7 +28,7 @@ class DocProcessorGroup:
 
     def __init__(self) -> None:
         self._processors: OrderedDict[
-            str, Union[DocProcessor | DocProcessorGroup]
+            str, Union[DocProcessor, DocProcessorGroup]
         ] = OrderedDict()
 
     def get_names(self, recursive: bool = True) -> list[str]:
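
For context on this one-character fix: Union[...] takes comma-separated members, while | is the PEP 604 spelling; Union[DocProcessor | DocProcessorGroup] mixes the two, which is redundant on Python 3.10+ and a TypeError on older versions. A small sketch with stand-in classes (the real docdeid types are assumed, not imported):

from typing import Union

class DocProcessor: ...
class DocProcessorGroup: ...

# The two equivalent spellings of the same union type.
Classic = Union[DocProcessor, DocProcessorGroup]
Pep604 = DocProcessor | DocProcessorGroup  # requires Python 3.10+

assert Classic == Pep604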

docdeid/str/expander.py (10 changes: 6 additions & 4 deletions)

@@ -1,4 +1,5 @@
 from abc import ABC, abstractmethod
+from collections import defaultdict
 from typing import Iterable
 
 from docdeid.str.processor import StringModifier
@@ -35,13 +36,14 @@ def expand_item_iterable(self, items: Iterable[str]) -> set[str]:
 
     def get_expansion_to_original_dict(self, items: Iterable[str]) -> dict[str, str]:
         """Expand a set of strings into a dictionary where the keys are results from
-        expand_item and values the original text."""
+        expand_item and values a set of original text(s)."""
 
-        # This can get overwritten if different original texts map to the same expansion due to multiple operations...
-        result_dict = {}
+        # Need a set because otherwise value might be overwritten
+        # when multiple items expand to the same value
+        result_dict = defaultdict(set)
         for item in items:
             for expansion in self.expand_item(item):
-                result_dict[expansion] = item
+                result_dict[expansion].add(item)
         return result_dict
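
Why defaultdict(set) here: with a plain dict, two originals that share an expansion silently overwrite each other. A self-contained sketch with a hypothetical expand_item (lowercasing stands in for docdeid's real string modifiers):

from collections import defaultdict

def expand_item(item: str) -> set[str]:
    # Hypothetical stand-in for the real expansion logic.
    return {item, item.lower()}

items = ["Jose", "JOSE"]

# Old behaviour: last writer wins, so "Jose" is lost for key "jose".
plain: dict[str, str] = {}
for item in items:
    for expansion in expand_item(item):
        plain[expansion] = item
assert plain["jose"] == "JOSE"

# New behaviour: every original that produced the expansion is kept.
grouped: defaultdict[str, set[str]] = defaultdict(set)
for item in items:
    for expansion in expand_item(item):
        grouped[expansion].add(item)
assert grouped["jose"] == {"Jose", "JOSE"}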

docdeid/tokenizer.py (7 changes: 4 additions & 3 deletions)

@@ -241,10 +241,11 @@ def token_lookup(
         else:
             expansion_dict = expander.get_expansion_to_original_dict(words)
             # get the original words of which the expansion matched the lookup values
-            matched_words = [
-                expansion_dict[m]
+            matched_words = set(
+                elem
                 for m in set(expansion_dict.keys()).intersection(lookup_values)
-            ]
+                for elem in expansion_dict[m]
+            )
 
         for word in matched_words:
             tokens.update(self._text_to_tokens[pipe_key][word])
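
Putting the two changes together, a sketch with hypothetical data of what this branch of token_lookup now computes: expansion_dict maps each expansion to the set of originals it came from, and the matched expansions are flattened back to original words.

expansion_dict = {
    "jose": {"Jose", "José"},
    "jan": {"Jan"},
}
lookup_values = {"jose"}

# Intersect expansions with the lookup values, then flatten the
# sets of originals into one set of matched words.
matched_words = set(
    elem
    for m in set(expansion_dict.keys()).intersection(lookup_values)
    for elem in expansion_dict[m]
)
assert matched_words == {"Jose", "José"}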