Transformations lookup #19

Merged · 14 commits · Feb 2, 2024
Changes from 1 commit
integrated expander into TokenList and added testing
LydiaMennesHealth committed Jan 11, 2024
commit 07fe44ccc7f63c5cb6ec3cf6add29084b27597bb
41 changes: 21 additions & 20 deletions docdeid/ds/lookup.py
@@ -5,6 +5,7 @@
from typing import Iterable, Iterator, Optional, Union

from docdeid.ds.ds import Datastructure
from docdeid.str.expander import Expander
from docdeid.str.processor import StringModifier, StringProcessor, StripString


@@ -100,7 +101,6 @@ def remove_items_from_iterable(self, items: Iterable[str]) -> None:
"""

for item in items:

item = self._apply_matching_pipeline(item)

if item in self._items:
@@ -277,7 +277,6 @@ def add_item(self, item: list[str]) -> None:
self.is_terminal = True

else:

head, tail = self._apply_matching_pipeline(item[0]), item[1:]

if head not in self.children:
@@ -304,7 +303,7 @@ def __contains__(self, item: list[str]) -> bool:
return (head in self.children) and tail in self.children[head]

def longest_matching_prefix(
self, item: list[str], start_i: int = 0
self, item: list[str], start_i: int = 0, expander: Optional[Expander] = None
) -> Union[list[str], None]:
"""
Finds the longest matching prefix of a list of strings. This is used to find the
@@ -324,27 +323,29 @@ def longest_matching_prefix(

longest_match = None
current_node = self
# build the match on the fly so that the sequence that actually matched the trie is returned
match = []

for i in itertools.count():

if current_node.is_terminal:
longest_match = i

if start_i + i >= len(item) or (
self._apply_matching_pipeline(item[start_i + i])
not in current_node.children
):
if start_i + i >= len(item):
break

current_node = current_node.children[
self._apply_matching_pipeline(item[start_i + i])
]

return (
[
self._apply_matching_pipeline(item)
for item in item[start_i : start_i + longest_match]
]
if longest_match
else None
)
if expander is None:
cur_items = [self._apply_matching_pipeline(item[start_i + i])]
else:
cur_items = expander.expand_item(
self._apply_matching_pipeline(item[start_i + i])
)

# take the first expansion present in the trie, if any (like any(), but returning the matching value)
matched = next((t for t in cur_items if t in current_node.children), None)
if matched is None:
break

match.append(matched)
current_node = current_node.children[matched]

return match[:longest_match] if longest_match else None
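A rough usage sketch of the new expander argument (not part of the diff). The ToyExpander below is hypothetical and only overrides expand_item; it also assumes a LookupTrie can be constructed without arguments, as in the rest of docdeid.

from docdeid.ds.lookup import LookupTrie
from docdeid.str.expander import Expander


class ToyExpander(Expander):
    """Toy expander (illustration only): the item plus its titlecased form."""

    def expand_item(self, item: str) -> set[str]:
        return {item, item.title()}


trie = LookupTrie()
trie.add_item(["Jane", "Doe"])

# Without an expander, the lowercased token does not match the stored "Jane".
print(trie.longest_matching_prefix(["jane", "Doe"]))  # None

# With the expander, "jane" also offers "Jane", so the prefix matches; the
# returned match contains the keys stored in the trie.
print(trie.longest_matching_prefix(["jane", "Doe"], expander=ToyExpander()))  # ['Jane', 'Doe']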
26 changes: 10 additions & 16 deletions docdeid/process/annotator.py
@@ -8,6 +8,7 @@
from docdeid.ds.lookup import LookupSet, LookupTrie
from docdeid.pattern import TokenPattern
from docdeid.process.doc_processor import DocProcessor
from docdeid.str.expander import Expander
from docdeid.str.processor import StringModifier
from docdeid.tokenizer import Token, Tokenizer

@@ -68,14 +69,12 @@ def __init__(
tokenizer_name: str = "default",
**kwargs,
) -> None:

self.lookup_set = LookupSet(matching_pipeline=matching_pipeline)
self.lookup_set.add_items_from_iterable(items=lookup_values)
self._tokenizer_name = tokenizer_name
super().__init__(*args, **kwargs)

def _tokens_to_annotations(self, tokens: Iterable[Token]) -> list[Annotation]:

return [
Annotation(
text=token.text,
@@ -90,7 +89,6 @@ def _tokens_to_annotations(self, tokens: Iterable[Token]) -> list[Annotation]:
]

def annotate(self, doc: Document) -> list[Annotation]:

tokens = doc.get_tokens(tokenizer_name=self._tokenizer_name)

annotate_tokens = tokens.token_lookup(
@@ -131,13 +129,12 @@ def __init__(
tokenizer: Optional[Tokenizer] = None,
trie: Optional[LookupTrie] = None,
overlapping: bool = False,
expander: Optional[Expander] = None,
**kwargs,
) -> None:

self._start_words: set[str] = set()

if (trie is not None) and (lookup_values is None) and (tokenizer is None):

self._trie = trie
self._matching_pipeline = trie.matching_pipeline or []
self._start_words = set(trie.children.keys())
@@ -153,15 +150,14 @@ def __init__(
)

self.overlapping = overlapping
self.expander = expander

super().__init__(*args, **kwargs)

def _init_lookup_structures(
self, lookup_values: Iterable[str], tokenizer: Tokenizer
) -> None:

for val in lookup_values:

texts = [token.text for token in tokenizer.tokenize(val)]

if len(texts) > 0:
@@ -175,12 +171,13 @@ def annotate(self, doc: Document) -> list[Annotation]:
self._start_words.add(start_token)

def annotate(self, doc: Document) -> list[Annotation]:

tokens = doc.get_tokens()

start_tokens = sorted(
tokens.token_lookup(
self._start_words, matching_pipeline=self._matching_pipeline
self._start_words,
matching_pipeline=self._matching_pipeline,
expander=self.expander,
),
key=lambda token: token.start_char,
)
@@ -192,12 +189,11 @@ def annotate(self, doc: Document) -> list[Annotation]:
min_i = 0

for i in start_indices:

if i < min_i:
continue

longest_matching_prefix = self._trie.longest_matching_prefix(
tokens_text, start_i=i
tokens_text, start_i=i, expander=self.expander
)

if longest_matching_prefix is None:
@@ -247,7 +243,6 @@ def __init__(
pre_match_words: Optional[list[str]] = None,
**kwargs,
) -> None:

if isinstance(regexp_pattern, str):
regexp_pattern = re.compile(regexp_pattern)

@@ -264,12 +259,13 @@ def __init__(
super().__init__(*args, **kwargs)

def _validate_match(
self, match: re.Match, doc: Document # pylint: disable=W0613
self,
match: re.Match,
doc: Document, # pylint: disable=W0613
) -> bool:
return True

def annotate(self, doc: Document) -> list[Annotation]:

if self.pre_match_words is not None:
try:
if (
@@ -284,7 +280,6 @@ def annotate(self, doc: Document) -> list[Annotation]:
annotations = []

for match in self.regexp_pattern.finditer(doc.text):

if not self._validate_match(match, doc):
continue

@@ -324,7 +319,6 @@ def annotate(self, doc: Document) -> list[Annotation]:
return annotations

for token in doc.get_tokens():

if not self.pattern.token_precondition(token):
continue

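A hedged construction sketch for the new expander parameter of MultiTokenLookupAnnotator (not part of the diff). It assumes the base Annotator accepts a tag argument and that WordBoundaryTokenizer lives in docdeid.tokenizer, as used elsewhere in docdeid; the ToyExpander is hypothetical.

from docdeid.process.annotator import MultiTokenLookupAnnotator
from docdeid.str.expander import Expander
from docdeid.tokenizer import WordBoundaryTokenizer


class ToyExpander(Expander):
    """Toy expander (illustration only): the item plus its titlecased form."""

    def expand_item(self, item: str) -> set[str]:
        return {item, item.title()}


annotator = MultiTokenLookupAnnotator(
    lookup_values=["Jane Doe"],
    tokenizer=WordBoundaryTokenizer(),
    expander=ToyExpander(),
    tag="name",
)
# annotate() now passes the expander both to token_lookup (to find candidate
# start tokens) and to longest_matching_prefix (to walk the trie), so a
# lowercased mention such as "jane doe" can still be annotated.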
25 changes: 24 additions & 1 deletion docdeid/str/expander.py
@@ -1,4 +1,5 @@
from abc import ABC, abstractmethod
from typing import Iterable

from docdeid.str.processor import StringModifier

@@ -7,7 +8,7 @@ class Expander(ABC):
"""Abstract class for string expansion."""

@abstractmethod
def expand_item(self, item: str) -> list[str]:
def expand_item(self, item: str) -> set[str]:
"""
Expand a string into a set of strings that contains the original and possibly additional strings.

@@ -18,6 +19,28 @@ def expand_item(self, item: str) -> list[str]:
The expanded items.
"""

def expand_item_iterable(self, items: Iterable[str]) -> set[str]:
"""
Expand an iterable of strings into a set that contains the originals and possibly additional strings.

Args:
items: The input strings.

Returns:
The expanded set of strings.
"""
return set.union(*(self.expand_item(item) for item in items))

def get_expansion_to_original_dict(self, items: Iterable[str]) -> dict[str, str]:
"""Expand a set of strings into a dictionary where the keys are results from expand_item and values the original text."""

# This can get overwritten if different original texts map to the same expansion due to multiple operations...
result_dict = {}
for item in items:
for expansion in self.expand_item(item):
result_dict[expansion] = item
return result_dict


class MinimumLengthExpander(Expander):
"""Expands a string by applying the lists of string processors. These are only applied to tokens whose length >= minimum length"""
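For illustration (not part of the diff), a sketch of how the two new helper methods behave with a hypothetical subclass, including the documented caveat that colliding expansions overwrite earlier dictionary entries.

from docdeid.str.expander import Expander


class ToyExpander(Expander):
    """Toy expander (illustration only): the item plus its titlecased form."""

    def expand_item(self, item: str) -> set[str]:
        return {item, item.title()}


expander = ToyExpander()

# expand_item_iterable unions the expansions of all items.
print(expander.expand_item_iterable(["jane", "doe"]))
# {'jane', 'Jane', 'doe', 'Doe'} (set order may vary)

# get_expansion_to_original_dict maps each expansion back to its original item;
# "Jane" is produced by both inputs, so the later original overwrites the earlier one.
print(expander.get_expansion_to_original_dict(["jane", "Jane"]))
# {'jane': 'jane', 'Jane': 'Jane'}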
7 changes: 7 additions & 0 deletions docdeid/str/processor.py
@@ -168,3 +168,10 @@

def filter(self, item: str) -> bool:
return len(item) >= self.min_len


class TitleCase(StringModifier):
"""Titlecase string."""

def process(self, item: str) -> str:
return item.title()
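A quick illustration of the new modifier (assuming, as with the other modifiers in this module, that no constructor arguments are needed):

from docdeid.str.processor import TitleCase

# str.title() uppercases the first letter of every word.
print(TitleCase().process("jane van der doe"))  # "Jane Van Der Doe"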
24 changes: 16 additions & 8 deletions docdeid/tokenizer.py
@@ -7,6 +7,7 @@
from typing import Iterator, Literal, Optional

from docdeid.str import StringModifier
from docdeid.str.expander import Expander


@dataclass(frozen=True)
@@ -149,7 +150,6 @@ def __init__(self, tokens: list[Token], link_tokens: bool = True) -> None:
self._text_to_tokens: dict[str, defaultdict[str, list[Token]]] = {}

def _link_tokens(self) -> None:

for i in range(len(self._tokens) - 1):
self._tokens[i].set_next_token(self._tokens[i + 1])
self._tokens[i + 1].set_previous_token(self._tokens[i])
@@ -168,15 +168,13 @@ def token_index(self, token: Token) -> int:
def _init_token_lookup(
self, matching_pipeline: Optional[list[StringModifier]] = None
) -> None:

matching_pipeline = matching_pipeline or []
pipe_key = str(matching_pipeline)

words = set()
text_to_tokens = defaultdict(list)

for token in self._tokens:

text = token.text

for string_modifier in matching_pipeline:
@@ -213,6 +211,7 @@ def token_lookup(
self,
lookup_values: set[str],
matching_pipeline: Optional[list[StringModifier]] = None,
expander: Optional[Expander] = None,
) -> set[Token]:
"""
Lookup all tokens of which the text matches a certain set of lookup values.
@@ -229,27 +228,37 @@
matching_pipeline = matching_pipeline or []
pipe_key = str(matching_pipeline)

# TODO: this initialisation is already part of get_words; consider removing it here.
if pipe_key not in self._text_to_tokens:
self._init_token_lookup(matching_pipeline)

tokens = set()
words = self.get_words(matching_pipeline)

for word in words.intersection(lookup_values):
if expander is None:
matched_words = words.intersection(lookup_values)

else:
# make expansions if expander is provided
expansion_dict = expander.get_expansion_to_original_dict(words)
# collect the original words whose expansion (or the original itself) matched the lookup values
matched_words = [
expansion_dict[m]
for m in set(expansion_dict.keys()).intersection(lookup_values)
]

for word in matched_words:
tokens.update(self._text_to_tokens[pipe_key][word])

return tokens

def __iter__(self) -> Iterator[Token]:

return iter(self._tokens)

def __len__(self) -> int:

return len(self._tokens)

def __getitem__(self, index: int) -> Token:

return self._tokens[index]

def __eq__(self, other: object) -> bool:
@@ -343,7 +352,6 @@ def _split_text(self, text: str) -> list[Token]:
matches = [*re.finditer(r"\b", text)]

for start_match, end_match in zip(matches, matches[1:]):

start_char = start_match.span(0)[0]
end_char = end_match.span(0)[0]

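Finally, a hedged sketch of the expander-aware token_lookup (not part of the diff). It assumes Token takes text, start_char and end_char and that TokenList is built from a list of tokens, as elsewhere in docdeid; the ToyExpander is again hypothetical.

from docdeid.str.expander import Expander
from docdeid.tokenizer import Token, TokenList


class ToyExpander(Expander):
    """Toy expander (illustration only): the item plus its titlecased form."""

    def expand_item(self, item: str) -> set[str]:
        return {item, item.title()}


tokens = TokenList(
    [
        Token(text="jane", start_char=0, end_char=4),
        Token(text="visited", start_char=5, end_char=12),
    ]
)

# Without an expander, only the (pipeline-modified) token text is compared,
# so the lowercased token does not match the lookup value.
print(tokens.token_lookup({"Jane"}))  # set()

# With an expander, expansions of each word are compared as well, and the
# matching token is returned under its original text.
print(tokens.token_lookup({"Jane"}, expander=ToyExpander()))
# roughly: {Token(text='jane', start_char=0, end_char=4)}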