feat: Adds XML export method to DocumentBuilder (#544)

* 512: add export_as_xml * 512: add export_as_xml * #544 apply changes * #544 apply codefactor error * #544: apply changes again * #544 apply changes and add example * #544 minor fix in example * #544 minor fix in example * #544: apply changes * #544: apply changes * #544: remove Exception and change to TypeError Co-authored-by: felix <felix.dittrich@memoresa.de>
mindee · Nov 2, 2021 · 92c1eeb · 92c1eeb
1 parent b27c3a6
commit 92c1eeb
Show file tree

Hide file tree

Showing 3 changed files with 133 additions and 2 deletions.
diff --git a/docs/source/using_models.rst b/docs/source/using_models.rst
@@ -295,4 +295,35 @@ For reference, here is the JSON export for the same `Document` as above::
             ]
         }
     ]
-  }
+  }
+
+To export the output as XML (hOCR format), use the `export_as_xml` method::
+
+  xml_output = result.export_as_xml()
+  for output in xml_output:
+    xml_bytes_string = output[0]
+    xml_element = output[1]
+
+For reference, here is a sample XML byte string output::
+
+  <?xml version="1.0" encoding="UTF-8"?>
+  <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
+    <head>
+      <title>docTR - XML export (hOCR)</title>
+      <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+      <meta name="ocr-system" content="python-doctr 0.5.0" />
+      <meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
+    </head>
+    <body>
+      <div class="ocr_page" id="page_1" title="image; bbox 0 0 3456 3456; ppageno 0" />
+      <div class="ocr_carea" id="block_1" title="bbox 857 529 2504 2710">
+        <p class="ocr_par" id="par_1" title="bbox 857 529 2504 2710">
+          <span class="ocr_line" id="line_1" title="bbox 857 529 2504 2710; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0">
+            <span class="ocrx_word" id="word_1" title="bbox 1552 540 1778 580; x_wconf 99">Hello</span>
+            <span class="ocrx_word" id="word_2" title="bbox 1782 529 1900 583; x_wconf 99">XML</span>
+            <span class="ocrx_word" id="word_3" title="bbox 1420 597 1684 641; x_wconf 81">World</span>
+          </span>
+        </p>
+      </div>
+    </body>
+  </html>
diff --git a/doctr/io/elements.py b/doctr/io/elements.py
@@ -12,6 +12,10 @@
 from doctr.utils.common_types import BoundingBox, RotatedBbox
 from doctr.utils.repr import NestedObject
 
+import doctr
+from xml.etree import ElementTree as ET
+from xml.etree.ElementTree import Element as ETElement, SubElement
+
 __all__ = ['Element', 'Word', 'Artefact', 'Line', 'Block', 'Page', 'Document']
 
 
@@ -253,6 +257,83 @@ def synthesize(self, **kwargs) -> np.ndarray:
 
         return synthesize_page(self.export(), **kwargs)
 
+    def export_as_xml(self, file_title: str = 'docTR - XML export (hOCR)') -> Tuple[bytes, ET.ElementTree]:
+        """Export the page as XML (hOCR-format)
+        convention: https://github.com/kba/hocr-spec/blob/master/1.2/spec.md
+
+        Args:
+            file_title: the title of the XML file
+
+        Returns:
+            a tuple of the XML byte string, and its ElementTree
+        """
+        p_idx = self.page_idx
+        block_count: int = 1
+        line_count: int = 1
+        word_count: int = 1
+        width, height = self.dimensions
+        language = self.language if 'language' in self.language.keys() else 'en'
+        # Create the XML root element
+        page_hocr = ETElement('html', attrib={'xmlns': 'http://www.w3.org/1999/xhtml', 'xml:lang': str(language)})
+        # Create the header / SubElements of the root element
+        head = SubElement(page_hocr, 'head')
+        SubElement(head, 'title').text = file_title
+        SubElement(head, 'meta', attrib={'http-equiv': 'Content-Type', 'content': 'text/html; charset=utf-8'})
+        SubElement(head, 'meta', attrib={'name': 'ocr-system', 'content': f"python-doctr {doctr.__version__}"})
+        SubElement(head, 'meta', attrib={'name': 'ocr-capabilities',
+                                         'content': 'ocr_page ocr_carea ocr_par ocr_line ocrx_word'})
+        # Create the body
+        body = SubElement(page_hocr, 'body')
+        SubElement(body, 'div', attrib={
+            'class': 'ocr_page',
+            'id': f'page_{p_idx + 1}',
+            'title': f'image; bbox 0 0 {width} {height}; ppageno 0'
+        })
+        # iterate over the blocks / lines / words and create the XML elements in body line by line with the attributes
+        for block in self.blocks:
+            if len(block.geometry) != 2:
+                raise TypeError("XML export is only available for straight bounding boxes for now.")
+            (xmin, ymin), (xmax, ymax) = block.geometry  # type: ignore[misc]
+            block_div = SubElement(body, 'div', attrib={
+                'class': 'ocr_carea',
+                'id': f'block_{block_count}',
+                'title': f'bbox {int(round(xmin * width))} {int(round(ymin * height))} \
+                    {int(round(xmax * width))} {int(round(ymax * height))}'
+            })
+            paragraph = SubElement(block_div, 'p', attrib={
+                'class': 'ocr_par',
+                'id': f'par_{block_count}',
+                'title': f'bbox {int(round(xmin * width))} {int(round(ymin * height))} \
+                    {int(round(xmax * width))} {int(round(ymax * height))}'
+            })
+            block_count += 1
+            for line in block.lines:
+                (xmin, ymin), (xmax, ymax) = line.geometry  # type: ignore[misc]
+                # NOTE: baseline, x_size, x_descenders, x_ascenders is currently initalized to 0
+                line_span = SubElement(paragraph, 'span', attrib={
+                    'class': 'ocr_line',
+                    'id': f'line_{line_count}',
+                    'title': f'bbox {int(round(xmin * width))} {int(round(ymin * height))} \
+                        {int(round(xmax * width))} {int(round(ymax * height))}; \
+                        baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0'
+                })
+                line_count += 1
+                for word in line.words:
+                    (xmin, ymin), (xmax, ymax) = word.geometry  # type: ignore[misc]
+                    conf = word.confidence
+                    word_div = SubElement(line_span, 'span', attrib={
+                        'class': 'ocrx_word',
+                        'id': f'word_{word_count}',
+                        'title': f'bbox {int(round(xmin * width))} {int(round(ymin * height))} \
+                            {int(round(xmax * width))} {int(round(ymax * height))}; \
+                            x_wconf {int(round(conf * 100))}'
+                    })
+                    # set the text
+                    word_div.text = word.value
+                    word_count += 1
+
+        return (ET.tostring(page_hocr, encoding='utf-8', method='xml'), ET.ElementTree(page_hocr))
+
     @classmethod
     def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
         kwargs = {k: save_dict[k] for k in cls._exported_keys}
@@ -298,6 +379,17 @@ def synthesize(self, **kwargs) -> List[np.ndarray]:
 
         return [page.synthesize() for page in self.pages]
 
+    def export_as_xml(self, **kwargs) -> List[Tuple[bytes, ET.ElementTree]]:
+        """Export the document as XML (hOCR-format)
+
+        Args:
+            **kwargs: additional keyword arguments passed to the Page.export_as_xml method
+
+        Returns:
+            list of tuple of (bytes, ElementTree)
+        """
+        return [page.export_as_xml(**kwargs) for page in self.pages]
+
     @classmethod
     def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
         kwargs = {k: save_dict[k] for k in cls._exported_keys}

diff --git a/test/common/test_io_elements.py b/test/common/test_io_elements.py
@@ -1,6 +1,7 @@
-import pytest
 import numpy as np
+import pytest
 from doctr.io import elements
+from xml.etree.ElementTree import ElementTree
 
 
 def _mock_words(size=(1., 1.), offset=(0, 0), confidence=0.9):
@@ -190,6 +191,10 @@ def test_page():
     assert page.export() == {"blocks": [b.export() for b in blocks], "page_idx": page_idx, "dimensions": page_size,
                              "orientation": orientation, "language": language}
 
+    # Export XML
+    assert isinstance(page.export_as_xml(), tuple) and isinstance(
+        page.export_as_xml()[0], (bytes, bytearray)) and isinstance(page.export_as_xml()[1], ElementTree)
+
     # Repr
     assert '\n'.join(repr(page).split('\n')[:2]) == f'Page(\n  dimensions={repr(page_size)}'
 
@@ -217,6 +222,9 @@ def test_document():
     # Export
     assert doc.export() == {"pages": [p.export() for p in pages]}
 
+    # Export XML
+    assert isinstance(doc.export_as_xml(), list) and len(doc.export_as_xml()) == len(pages)
+
     # Show
     doc.show([np.zeros((256, 256, 3), dtype=np.uint8) for _ in range(len(pages))], block=False)