docTR - hOCR

+ + Hello + XML + World + +

+
+
\ No newline at end of file
diff --git a/doctr/io/elements.py b/doctr/io/elements.py
index f477145543..3ad01ad63d 100644
--- a/doctr/io/elements.py
+++ b/doctr/io/elements.py
@@ -12,6 +12,10 @@
 from doctr.utils.common_types import BoundingBox, RotatedBbox
 from doctr.utils.repr import NestedObject
 
+import doctr
+from xml.etree import ElementTree as ET
+from xml.etree.ElementTree import Element as ETElement, SubElement
+
 __all__ = ['Element', 'Word', 'Artefact', 'Line', 'Block', 'Page', 'Document']
 
 
@@ -253,6 +257,81 @@ def synthesize(self, **kwargs) -> np.ndarray:
 
         return synthesize_page(self.export(), **kwargs)
 
+    def export_as_xml(self, file_title: str = 'docTR - XML export (hOCR)') -> Tuple[bytes, ET.ElementTree]:
+        """Export the page as XML (hOCR-format)
+        convention: https://github.com/kba/hocr-spec/blob/master/1.2/spec.md
+
+        Args:
+            file_title: the title of the XML file
+
+        Returns:
+            a tuple of the XML byte string, and its ElementTree
+
+        Raises:
+            TypeError: if a block, line or word geometry is not a straight bounding box
+        """
+        # NOTE(review): assumes self.dimensions is ordered (width, height) - confirm against Page
+        width, height = self.dimensions
+        # Use the 'language' entry rather than the string form of the whole mapping as xml:lang
+        language = (self.language or {}).get('language') or 'en'
+
+        def _bbox(geometry: Any) -> str:
+            # hOCR bbox property: box corners scaled by the page dimensions, on a single line
+            if len(geometry) != 2:
+                raise TypeError("XML export is only available for straight bounding boxes for now.")
+            (xmin, ymin), (xmax, ymax) = geometry
+            return (f'bbox {int(round(xmin * width))} {int(round(ymin * height))} '
+                    f'{int(round(xmax * width))} {int(round(ymax * height))}')
+
+        # Create the XML root element
+        page_hocr = ETElement('html', attrib={'xmlns': 'http://www.w3.org/1999/xhtml', 'xml:lang': str(language)})
+        # Create the header / SubElements of the root element
+        head = SubElement(page_hocr, 'head')
+        SubElement(head, 'title').text = file_title
+        SubElement(head, 'meta', attrib={'http-equiv': 'Content-Type', 'content': 'text/html; charset=utf-8'})
+        SubElement(head, 'meta', attrib={'name': 'ocr-system', 'content': f"python-doctr {doctr.__version__}"})
+        SubElement(head, 'meta', attrib={'name': 'ocr-capabilities',
+                                         'content': 'ocr_page ocr_carea ocr_par ocr_line ocrx_word'})
+        # Create the body
+        body = SubElement(page_hocr, 'body')
+        SubElement(body, 'div', attrib={
+            'class': 'ocr_page',
+            'id': f'page_{self.page_idx + 1}',
+            'title': f'image; bbox 0 0 {width} {height}; ppageno 0'
+        })
+        # Iterate over the blocks / lines / words and create the matching XML elements in the body
+        line_count, word_count = 1, 1
+        for block_count, block in enumerate(self.blocks, start=1):
+            block_title = _bbox(block.geometry)
+            block_div = SubElement(body, 'div', attrib={
+                'class': 'ocr_carea',
+                'id': f'block_{block_count}',
+                'title': block_title,
+            })
+            paragraph = SubElement(block_div, 'p', attrib={
+                'class': 'ocr_par',
+                'id': f'par_{block_count}',
+                'title': block_title,
+            })
+            for line in block.lines:
+                # NOTE: baseline, x_size, x_descenders, x_ascenders are currently initialized to 0
+                line_span = SubElement(paragraph, 'span', attrib={
+                    'class': 'ocr_line',
+                    'id': f'line_{line_count}',
+                    'title': f'{_bbox(line.geometry)}; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0',
+                })
+                line_count += 1
+                for word in line.words:
+                    word_span = SubElement(line_span, 'span', attrib={
+                        'class': 'ocrx_word',
+                        'id': f'word_{word_count}',
+                        'title': f'{_bbox(word.geometry)}; x_wconf {int(round(word.confidence * 100))}',
+                    })
+                    word_span.text = word.value
+                    word_count += 1
+
+        return ET.tostring(page_hocr, encoding='utf-8', method='xml'), ET.ElementTree(page_hocr)
+
     @classmethod
     def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
         kwargs = {k: save_dict[k] for k in cls._exported_keys}
@@ -298,6 +377,17 @@ def synthesize(self, **kwargs) -> List[np.ndarray]:
 
         return [page.synthesize() for page in self.pages]
 
+    def export_as_xml(self, **kwargs) -> List[Tuple[bytes, ET.ElementTree]]:
+        """Export the document as XML (hOCR-format)
+
+        Args:
+            **kwargs: additional keyword arguments passed to the Page.export_as_xml method
+
+        Returns:
+            list of tuple of (bytes, ElementTree)
+        """
+        return [page.export_as_xml(**kwargs) for page in self.pages]
+
     @classmethod
     def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
         kwargs = {k: save_dict[k] for k in cls._exported_keys}
diff --git a/test/common/test_io_elements.py b/test/common/test_io_elements.py
index fb87c37d44..53b62eb013 100644
--- a/test/common/test_io_elements.py
+++ b/test/common/test_io_elements.py
@@ -1,6 +1,7 @@
-import pytest
 import numpy as np
+import pytest
 from doctr.io import elements
+from xml.etree.ElementTree import ElementTree
 
 
 def _mock_words(size=(1., 1.), offset=(0, 0), confidence=0.9):
@@ -190,6 +191,11 @@ def test_page():
     assert page.export() == {"blocks": [b.export() for b in blocks], "page_idx": page_idx, "dimensions": page_size,
                              "orientation": orientation, "language": language}
 
+    # Export XML
+    xml_output = page.export_as_xml()
+    assert isinstance(xml_output, tuple) and len(xml_output) == 2
+    assert isinstance(xml_output[0], (bytes, bytearray)) and isinstance(xml_output[1], ElementTree)
+
     # Repr
     assert '\n'.join(repr(page).split('\n')[:2]) == f'Page(\n dimensions={repr(page_size)}'
 
@@ -217,6 +223,10 @@ def test_document():
     # Export
     assert doc.export() == {"pages": [p.export() for p in pages]}
 
+    # Export XML
+    xml_outputs = doc.export_as_xml()
+    assert isinstance(xml_outputs, list) and len(xml_outputs) == len(pages)
+
     # Show
     doc.show([np.zeros((256, 256, 3), dtype=np.uint8) for _ in range(len(pages))], block=False)
 