From 92c1eeb9dc5562fb4d7f6a743a5d9e5694cbe2c0 Mon Sep 17 00:00:00 2001 From: Felix Dittrich Date: Tue, 2 Nov 2021 15:38:29 +0100 Subject: [PATCH] feat: Adds XML export method to DocumentBuilder (#544) * 512: add export_as_xml * 512: add export_as_xml * #544 apply changes * #544 apply codefactor error * #544: apply changes again * #544 apply changes and add example * #544 minor fix in example * #544 minor fix in example * #544: apply changes * #544: apply changes * #544: remove Exception and change to TypeError Co-authored-by: felix --- docs/source/using_models.rst | 33 +++++++++++- doctr/io/elements.py | 92 +++++++++++++++++++++++++++++++++ test/common/test_io_elements.py | 10 +++- 3 files changed, 133 insertions(+), 2 deletions(-) diff --git a/docs/source/using_models.rst b/docs/source/using_models.rst index fc4a3b8979..c44627f466 100644 --- a/docs/source/using_models.rst +++ b/docs/source/using_models.rst @@ -295,4 +295,35 @@ For reference, here is the JSON export for the same `Document` as above:: ] } ] - } \ No newline at end of file + } + +To export the output as XML (hOCR-format) you can use the `export_as_xml` method:: + + xml_output = result.export_as_xml() + for output in xml_output: + xml_bytes_string = output[0] + xml_element = output[1] + +For reference, here is a sample XML byte string output:: + + + + + docTR - hOCR + + + + + +
+
+

+ + Hello + XML + World + +

+
+ + \ No newline at end of file diff --git a/doctr/io/elements.py b/doctr/io/elements.py index f477145543..3ad01ad63d 100644 --- a/doctr/io/elements.py +++ b/doctr/io/elements.py @@ -12,6 +12,10 @@ from doctr.utils.common_types import BoundingBox, RotatedBbox from doctr.utils.repr import NestedObject +import doctr +from xml.etree import ElementTree as ET +from xml.etree.ElementTree import Element as ETElement, SubElement + __all__ = ['Element', 'Word', 'Artefact', 'Line', 'Block', 'Page', 'Document'] @@ -253,6 +257,83 @@ def synthesize(self, **kwargs) -> np.ndarray: return synthesize_page(self.export(), **kwargs) + def export_as_xml(self, file_title: str = 'docTR - XML export (hOCR)') -> Tuple[bytes, ET.ElementTree]: + """Export the page as XML (hOCR-format) + convention: https://github.com/kba/hocr-spec/blob/master/1.2/spec.md + + Args: + file_title: the title of the XML file + + Returns: + a tuple of the XML byte string, and its ElementTree + """ + p_idx = self.page_idx + block_count: int = 1 + line_count: int = 1 + word_count: int = 1 + width, height = self.dimensions + language = self.language if 'language' in self.language.keys() else 'en' + # Create the XML root element + page_hocr = ETElement('html', attrib={'xmlns': 'http://www.w3.org/1999/xhtml', 'xml:lang': str(language)}) + # Create the header / SubElements of the root element + head = SubElement(page_hocr, 'head') + SubElement(head, 'title').text = file_title + SubElement(head, 'meta', attrib={'http-equiv': 'Content-Type', 'content': 'text/html; charset=utf-8'}) + SubElement(head, 'meta', attrib={'name': 'ocr-system', 'content': f"python-doctr {doctr.__version__}"}) + SubElement(head, 'meta', attrib={'name': 'ocr-capabilities', + 'content': 'ocr_page ocr_carea ocr_par ocr_line ocrx_word'}) + # Create the body + body = SubElement(page_hocr, 'body') + SubElement(body, 'div', attrib={ + 'class': 'ocr_page', + 'id': f'page_{p_idx + 1}', + 'title': f'image; bbox 0 0 {width} {height}; ppageno 0' + }) + # 
iterate over the blocks / lines / words and create the XML elements in body line by line with the attributes + for block in self.blocks: + if len(block.geometry) != 2: + raise TypeError("XML export is only available for straight bounding boxes for now.") + (xmin, ymin), (xmax, ymax) = block.geometry # type: ignore[misc] + block_div = SubElement(body, 'div', attrib={ + 'class': 'ocr_carea', + 'id': f'block_{block_count}', + 'title': f'bbox {int(round(xmin * width))} {int(round(ymin * height))} \ + {int(round(xmax * width))} {int(round(ymax * height))}' + }) + paragraph = SubElement(block_div, 'p', attrib={ + 'class': 'ocr_par', + 'id': f'par_{block_count}', + 'title': f'bbox {int(round(xmin * width))} {int(round(ymin * height))} \ + {int(round(xmax * width))} {int(round(ymax * height))}' + }) + block_count += 1 + for line in block.lines: + (xmin, ymin), (xmax, ymax) = line.geometry # type: ignore[misc] + # NOTE: baseline, x_size, x_descenders, x_ascenders are currently initialized to 0 + line_span = SubElement(paragraph, 'span', attrib={ + 'class': 'ocr_line', + 'id': f'line_{line_count}', + 'title': f'bbox {int(round(xmin * width))} {int(round(ymin * height))} \ + {int(round(xmax * width))} {int(round(ymax * height))}; \ + baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0' + }) + line_count += 1 + for word in line.words: + (xmin, ymin), (xmax, ymax) = word.geometry # type: ignore[misc] + conf = word.confidence + word_div = SubElement(line_span, 'span', attrib={ + 'class': 'ocrx_word', + 'id': f'word_{word_count}', + 'title': f'bbox {int(round(xmin * width))} {int(round(ymin * height))} \ + {int(round(xmax * width))} {int(round(ymax * height))}; \ + x_wconf {int(round(conf * 100))}' + }) + # set the text + word_div.text = word.value + word_count += 1 + + return (ET.tostring(page_hocr, encoding='utf-8', method='xml'), ET.ElementTree(page_hocr)) + @classmethod def from_dict(cls, save_dict: Dict[str, Any], **kwargs): kwargs = {k: save_dict[k] for k in 
cls._exported_keys} @@ -298,6 +379,17 @@ def synthesize(self, **kwargs) -> List[np.ndarray]: return [page.synthesize() for page in self.pages] + def export_as_xml(self, **kwargs) -> List[Tuple[bytes, ET.ElementTree]]: + """Export the document as XML (hOCR-format) + + Args: + **kwargs: additional keyword arguments passed to the Page.export_as_xml method + + Returns: + list of tuple of (bytes, ElementTree) + """ + return [page.export_as_xml(**kwargs) for page in self.pages] + @classmethod def from_dict(cls, save_dict: Dict[str, Any], **kwargs): kwargs = {k: save_dict[k] for k in cls._exported_keys} diff --git a/test/common/test_io_elements.py b/test/common/test_io_elements.py index fb87c37d44..53b62eb013 100644 --- a/test/common/test_io_elements.py +++ b/test/common/test_io_elements.py @@ -1,6 +1,7 @@ -import pytest import numpy as np +import pytest from doctr.io import elements +from xml.etree.ElementTree import ElementTree def _mock_words(size=(1., 1.), offset=(0, 0), confidence=0.9): @@ -190,6 +191,10 @@ def test_page(): assert page.export() == {"blocks": [b.export() for b in blocks], "page_idx": page_idx, "dimensions": page_size, "orientation": orientation, "language": language} + # Export XML + assert isinstance(page.export_as_xml(), tuple) and isinstance( + page.export_as_xml()[0], (bytes, bytearray)) and isinstance(page.export_as_xml()[1], ElementTree) + # Repr assert '\n'.join(repr(page).split('\n')[:2]) == f'Page(\n dimensions={repr(page_size)}' @@ -217,6 +222,9 @@ def test_document(): # Export assert doc.export() == {"pages": [p.export() for p in pages]} + # Export XML + assert isinstance(doc.export_as_xml(), list) and len(doc.export_as_xml()) == len(pages) + # Show doc.show([np.zeros((256, 256, 3), dtype=np.uint8) for _ in range(len(pages))], block=False)