Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

v1.0.0 #18

Merged
merged 45 commits into from
Dec 20, 2023
Merged
Changes from 1 commit
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
a684463
Rename tokenize module
vmenger Nov 30, 2023
4b02432
Add token index
vmenger Nov 30, 2023
6fb99e0
Add frozendict dependency
vmenger Dec 1, 2023
a178872
Speedup annotation sorting
vmenger Dec 1, 2023
2e01493
Speedup regexp annotator
vmenger Dec 1, 2023
903f540
Speedup single token annotator
vmenger Dec 1, 2023
949ec06
Fix test
vmenger Dec 1, 2023
1dae5f3
Speedup multi token lookup annotator
vmenger Dec 1, 2023
7e91a06
Add offset option
vmenger Dec 1, 2023
e66c038
Add repr for string processor
vmenger Dec 1, 2023
dde29cb
Move link token logic
vmenger Dec 1, 2023
ab5cc70
Add token lookup logic with matching pipeline
vmenger Dec 1, 2023
b0c493f
Update offset to start_i
vmenger Dec 1, 2023
608c3f5
Update previous and next tests
vmenger Dec 1, 2023
3ef03ee
Update get_words and token_lookup tests
vmenger Dec 1, 2023
79f6190
Formatting
vmenger Dec 1, 2023
eeffb20
Cleanup annotator code
vmenger Dec 1, 2023
280e7a8
Cleanup tokenizer code
vmenger Dec 1, 2023
f4d21ef
Formatting
vmenger Dec 1, 2023
fcc8873
Rename instance var
vmenger Dec 1, 2023
798ddfc
Add info when not presenting callbacks as frozendict
vmenger Dec 1, 2023
60f58b6
Update changelog
vmenger Dec 1, 2023
b43fc0f
Update dependencies
vmenger Dec 1, 2023
4079155
Optimize caching
vmenger Dec 4, 2023
bd65b0a
Optimize caching
vmenger Dec 4, 2023
b03108b
Add option to directly add trie to multi token lookup
vmenger Dec 5, 2023
61d6909
Update formatting and linting
vmenger Dec 7, 2023
51aea0f
Update formatting and linting
vmenger Dec 7, 2023
0a0624b
Linting
vmenger Dec 7, 2023
e4998ad
Formatting
vmenger Dec 8, 2023
a72ba4b
Improve pattern
vmenger Dec 8, 2023
fbdbd55
Improve processor and processor group abstraction
vmenger Dec 8, 2023
936a990
Rename files
vmenger Dec 8, 2023
8b486a7
Formatting
vmenger Dec 8, 2023
7945cb0
Order test classes
vmenger Dec 8, 2023
9587342
Move dev dependency
vmenger Dec 8, 2023
0a2b8f2
Update changelog
vmenger Dec 8, 2023
43c8241
Remove lcov file
vmenger Dec 8, 2023
46de845
Update docs
vmenger Dec 8, 2023
a9b9bd7
Update changelog
vmenger Dec 8, 2023
f3c15d2
Update token serializing
vmenger Dec 8, 2023
528a5b9
Rename pre_match_tokens internally
vmenger Dec 11, 2023
1263c37
Fix typo
vmenger Dec 12, 2023
dcb36bc
Use casefold instead of lower
vmenger Dec 13, 2023
a12775c
Prepare release
vmenger Dec 19, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Improve processor and processor group abstraction
  • Loading branch information
vmenger committed Dec 8, 2023
commit fbdbd55bc94d1d94147ad8073e0333c804598c2c
36 changes: 23 additions & 13 deletions docdeid/process/doc.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from abc import ABC, abstractmethod
from collections import OrderedDict
from typing import Iterator, Optional
from typing import Iterator, Optional, Union

from docdeid.document import Document

Expand All @@ -19,15 +19,17 @@ def process(self, doc: Document, **kwargs) -> None:
"""


class DocProcessorGroup(DocProcessor):
class DocProcessorGroup:
"""
A group of :class:`.DocProcessor`, that executes the containing processors in order.

A :class:`.DocProcessorGroup` can itself be part of a :class:`.DocProcessorGroup`.
"""

def __init__(self) -> None:
self._processors: OrderedDict[str, DocProcessor] = OrderedDict()
self._processors: OrderedDict[
str, Union[DocProcessor | DocProcessorGroup]
] = OrderedDict()

def get_names(self, recursive: bool = True) -> list[str]:
"""
Expand All @@ -52,7 +54,10 @@ def get_names(self, recursive: bool = True) -> list[str]:
return names

def add_processor(
self, name: str, processor: DocProcessor, position: Optional[int] = None
self,
name: str,
processor: Union[DocProcessor, "DocProcessorGroup"],
position: Optional[int] = None,
) -> None:
"""
Add a document processor to the group.
Expand Down Expand Up @@ -90,7 +95,7 @@ def remove_processor(self, name: str) -> None:
"""
del self._processors[name]

def __getitem__(self, name: str) -> DocProcessor:
def __getitem__(self, name: str) -> Union[DocProcessor, "DocProcessorGroup"]:
"""
Get a document processor by name.

Expand All @@ -103,23 +108,25 @@ def __getitem__(self, name: str) -> DocProcessor:

return self._processors[name]

def process(self, doc: Document, **kwargs) -> None:
def process(
self,
doc: Document,
enabled: Optional[set[str]] = None,
disabled: Optional[set[str]] = None,
) -> None:
"""
Process a document, by passing it to this groups processors.
Process a document, by passing it to this group's processors.

Args:
doc: The document to be processed.
enabled: A set of strings, indicating which document processors to run for
this document. By default all document processors are used. In case of
this document. By default, all document processors are used. In case of
nested, it's necessary to supply both the name of the processor group,
as well as all of its containing processors (or a subset thereof).
and all of its containing processors (or a subset thereof).
disabled: A set of strings, indicating which document processors not to
run for this document. Cannot be used together with `enabled`.
"""

enabled = kwargs.get("enabled", None)
disabled = kwargs.get("disabled", None)

if (enabled is not None) and (disabled is not None):
raise RuntimeError("Cannot use enabled and disabled simultaneously")

Expand All @@ -131,7 +138,10 @@ def process(self, doc: Document, **kwargs) -> None:
if (disabled is not None) and (name in disabled):
continue

proc.process(doc, enabled=enabled, disabled=disabled)
if isinstance(proc, DocProcessor):
proc.process(doc)
elif isinstance(proc, DocProcessorGroup):
proc.process(doc, enabled=enabled, disabled=disabled)

def __iter__(self) -> Iterator:
"""
Expand Down