
Improve PATIENT/PERSOON processing and more #20

Status: Open. This pull request wants to merge 41 commits into main; the view below shows changes from 1 commit.

Commits (41):
All commits are by matej-ibis-ai.

30ce936  Make MultiTok..Annotator notice changes in the trie (Mar 1, 2024)
e12d0d0  Provide `LowercaseTail` string modifier (Mar 1, 2024)
82aab5f  Enable specifying the lang for titlecasing (Mar 4, 2024)
156e201  Minimize data fixtures for tests (Mar 4, 2024)
c7b4c89  Log annotated text after every processor (Mar 6, 2024)
4459c14  Update documentation slightly (Mar 6, 2024)
810b8b3  Expose `Document.token_lists` as a property (Mar 6, 2024)
5002696  (Almost) automatically format code (Mar 7, 2024)
7d2d866  Simplify `MultiTokenLookupAnnotator`... (Mar 7, 2024)
762866a  Update the `MultiTok...Annotator` docstring (Mar 8, 2024)
1ae6846  Test user additions to the lookup trie (Mar 8, 2024)
ae1f93e  Test the `tokenizers` and `token_lists` props (Mar 8, 2024)
d415f51  Remove and ignore the IDEA project file (Mar 8, 2024)
d8e8ed3  Annotate docs for logging only if level is DEBUG (Mar 8, 2024)
03fc99d  Cosmetics (Mar 8, 2024)
5d188cd  Support whitespace trimming in `WordBoundaryTokenizer` (Mar 11, 2024)
6ea9b74  Move `SequenceTokenizer` to Docdeid (Mar 11, 2024)
4110a53  Format code (Mar 11, 2024)
df73e54  Replace `_DIRECTION_MAP` with an enum (Mar 11, 2024)
99163d6  Improve and test `annos_by_token()` (Mar 11, 2024)
c7ba5bc  Drop `Token.get_nth`, simplify `Token.iter_to` (Mar 12, 2024)
c80e2ad  Format code (Mar 12, 2024)
40fcd62  Test and fix `Direction` (Mar 12, 2024)
15b8648  Fix Flake8-reported errors (Mar 12, 2024)
ebdefa4  Address most non-Mypy lint issues (Mar 12, 2024)
4a082b8  Address easy and valid Mypy issues (Mar 12, 2024)
3319df1  Add a test for keep_blanks=False in WBTokenizer (Jul 12, 2024)
1afb16f  Document how to run tests better + cosmetics (Jul 12, 2024)
53db956  Drop the `Document.token_lists` property (Jan 7, 2025)
230c507  Avoid "|" for union types (Jan 8, 2025)
25cbcfd  Move `annos_by_token` to `Document` (Jan 8, 2025)
36eb1e3  Simplify `Direction.from_string` (Jan 8, 2025)
573deff  Rename `SequenceAnnotator.dicts` to `ds` (Jan 8, 2025)
a2704c5  Replace `list(map(f, xs))` with list comprehension (Jan 8, 2025)
3ca37aa  Re-add `MultiTokenLookupAnnotator` accepting a `LookupSet` (Jan 8, 2025)
68f4afb  Add a test for matching multi-word phrases (Jan 9, 2025)
fb3cbd8  Try to support multi-word matching in SequenceAnnotator (Jan 9, 2025)
0c04a78  Give up multi-word matching in SequenceAnnotator (Jan 9, 2025)
82c52fc  Move seq pattern validation to a new method (Jan 9, 2025)
9dcc4f0  Polish the code a little (Jan 9, 2025)
659a694  Don't fail validation on refs to metadata (Jan 10, 2025)
Commit c7b4c896f1ed3718a118d70cae5156d7b481b1ff: Log annotated text after every processor (committed by matej-ibis-ai, Mar 6, 2024)
4 changes: 4 additions & 0 deletions docdeid/process/doc_processor.py
@@ -1,8 +1,10 @@
import logging
from abc import ABC, abstractmethod
from collections import OrderedDict
from typing import Iterator, Optional, Union

from docdeid.document import Document
from docdeid.utils import annotate_doc


class DocProcessor(ABC): # pylint: disable=R0903
@@ -143,6 +145,8 @@ def process(
            elif isinstance(proc, DocProcessorGroup):
                proc.process(doc, enabled=enabled, disabled=disabled)

            logging.debug("after %s: %s", name, annotate_doc(doc))

    def __iter__(self) -> Iterator:

        return iter(self._processors.items())
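Note that this `logging.debug` call builds the annotated text on every processor run, even when debug logging is disabled; a later commit in this PR ("Annotate docs for logging only if level is DEBUG") addresses exactly that. A minimal sketch of the guard pattern, using a hypothetical `render` callable in place of `annotate_doc`:

```python
import logging

logger = logging.getLogger("docdeid.sketch")


def log_after_processor(name: str, render) -> None:
    """Log the annotated text after a processor ran, building it only
    when DEBUG logging is actually enabled (rendering can be expensive)."""
    if logger.isEnabledFor(logging.DEBUG):
        # %-style formatting defers string interpolation, but the render()
        # argument itself would be evaluated eagerly without this guard.
        logger.debug("after %s: %s", name, render())
```

The `isEnabledFor` check is the standard idiom from the `logging` documentation for skipping expensive log-argument computation.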
31 changes: 31 additions & 0 deletions docdeid/utils.py
@@ -1,3 +1,5 @@
from collections import defaultdict

from frozendict import frozendict

from docdeid.document import Document
@@ -32,3 +34,32 @@ def annotate_intext(doc: Document) -> str:
    )

    return text


def annotate_doc(doc: Document) -> str:
    """\
    Adds XML-like markup for annotations into the text of a document.

    Also handles nested mentions and, to some extent, overlapping mentions,
    even though this kind of markup cannot really represent them.
    """
    annos_from_shortest = sorted(
        doc.annotations,
        key=lambda anno: anno.end_char - anno.start_char)
    idx_to_anno_starts = defaultdict(list)
    idx_to_anno_ends = defaultdict(list)
    for anno in annos_from_shortest:
        idx_to_anno_starts[anno.start_char].append(anno)
        idx_to_anno_ends[anno.end_char].append(anno)
    markup_indices = sorted(set(idx_to_anno_starts).union(idx_to_anno_ends))
    chunks = []
    last_idx = 0
    for idx in markup_indices:
        chunks.append(doc.text[last_idx:idx])
        for ending_anno in idx_to_anno_ends[idx]:
            chunks.append(f'</{ending_anno.tag.upper()}>')
        for starting_anno in reversed(idx_to_anno_starts[idx]):
            chunks.append(f'<{starting_anno.tag.upper()}>')
        last_idx = idx
    chunks.append(doc.text[last_idx:])
    return ''.join(chunks)

Review comment (Owner), on `def annotate_doc`:

    OK, so this is like the above function `annotate_intext`, but a bit more elaborate? Should we just merge them? In any case, this needs a more descriptive name.

Reply (Author):

    Yeah, I think that's the case. Let me replace the body of `annotate_intext` with a call to `annotate_doc`. Or would you rather deprecate the former method explicitly (using the `deprecation` package, perhaps), or even simply remove it?

Review comment (Owner), on the `sorted(...)` call:

    You can do:

    annos_from_shortest = doc.annotations.sorted(by=("length",))
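Stripped of docdeid's `Document` type, the markup algorithm above can be sketched self-contained. `Anno` here is a hypothetical stand-in for docdeid's annotation class, assumed only to carry `start_char`, `end_char`, and `tag`:

```python
from collections import defaultdict
from dataclasses import dataclass


@dataclass(frozen=True)
class Anno:
    # Hypothetical stand-in for docdeid's annotation class.
    start_char: int
    end_char: int
    tag: str


def annotate_text(text: str, annotations) -> str:
    """Insert XML-like tags for (possibly nested) annotations into text."""
    # Sorting shortest-first means that at a shared boundary index, inner
    # (nested) mentions open last and close first, so the tags nest properly.
    annos_from_shortest = sorted(
        annotations, key=lambda a: a.end_char - a.start_char)
    idx_to_starts = defaultdict(list)
    idx_to_ends = defaultdict(list)
    for anno in annos_from_shortest:
        idx_to_starts[anno.start_char].append(anno)
        idx_to_ends[anno.end_char].append(anno)
    chunks, last_idx = [], 0
    # Walk every index where some tag opens or closes, emitting the plain
    # text since the previous boundary, then closing tags, then opening tags.
    for idx in sorted(set(idx_to_starts) | set(idx_to_ends)):
        chunks.append(text[last_idx:idx])
        for ending in idx_to_ends[idx]:
            chunks.append(f"</{ending.tag.upper()}>")
        for starting in reversed(idx_to_starts[idx]):
            chunks.append(f"<{starting.tag.upper()}>")
        last_idx = idx
    chunks.append(text[last_idx:])
    return "".join(chunks)


annos = [Anno(0, 10, "persoon"), Anno(0, 3, "voornaam")]
print(annotate_text("Jan Jansen was seen", annos))
# <PERSOON><VOORNAAM>Jan</VOORNAAM> Jansen</PERSOON> was seen
```

The example shows the nested case from the docstring: the `voornaam` mention sits inside the `persoon` mention, and the shortest-first ordering keeps its tags innermost.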