Skip to content

Commit

Permalink
feat: Adds XML export method to DocumentBuilder (#544)
Browse files Browse the repository at this point in the history
* 512: add export_as_xml

* 512: add export_as_xml

* #544 apply changes

* #544 apply codefactor error

* #544: apply changes again

* #544 apply changes and add example

* #544 minor fix in example

* #544 minor fix in example

* #544: apply changes

* #544: apply changes

* #544: remove Exception and change to TypeError

Co-authored-by: felix <felix.dittrich@memoresa.de>
  • Loading branch information
felixdittrich92 and felix authored Nov 2, 2021
1 parent b27c3a6 commit 92c1eeb
Show file tree
Hide file tree
Showing 3 changed files with 133 additions and 2 deletions.
33 changes: 32 additions & 1 deletion docs/source/using_models.rst
Original file line number Diff line number Diff line change
Expand Up @@ -295,4 +295,35 @@ For reference, here is the JSON export for the same `Document` as above::
]
}
]
}
}

To export the output as XML (hOCR format), you can use the `export_as_xml` method::

xml_output = result.export_as_xml()
for output in xml_output:
xml_bytes_string = output[0]
xml_element = output[1]

For reference, here is a sample XML byte string output::

<?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
<head>
<title>docTR - hOCR</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="ocr-system" content="python-doctr 0.5.0" />
<meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
</head>
<body>
<div class="ocr_page" id="page_1" title="image; bbox 0 0 3456 3456; ppageno 0" />
<div class="ocr_carea" id="block_1" title="bbox 857 529 2504 2710">
<p class="ocr_par" id="par_1" title="bbox 857 529 2504 2710">
<span class="ocr_line" id="line_1" title="bbox 857 529 2504 2710; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0">
<span class="ocrx_word" id="word_1" title="bbox 1552 540 1778 580; x_wconf 99">Hello</span>
<span class="ocrx_word" id="word_2" title="bbox 1782 529 1900 583; x_wconf 99">XML</span>
<span class="ocrx_word" id="word_3" title="bbox 1420 597 1684 641; x_wconf 81">World</span>
</span>
</p>
</div>
</body>
</html>
92 changes: 92 additions & 0 deletions doctr/io/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@
from doctr.utils.common_types import BoundingBox, RotatedBbox
from doctr.utils.repr import NestedObject

import doctr
from xml.etree import ElementTree as ET
from xml.etree.ElementTree import Element as ETElement, SubElement

__all__ = ['Element', 'Word', 'Artefact', 'Line', 'Block', 'Page', 'Document']


Expand Down Expand Up @@ -253,6 +257,83 @@ def synthesize(self, **kwargs) -> np.ndarray:

return synthesize_page(self.export(), **kwargs)

def export_as_xml(self, file_title: str = 'docTR - XML export (hOCR)') -> Tuple[bytes, ET.ElementTree]:
    """Export the page as XML (hOCR-format)
    convention: https://github.com/kba/hocr-spec/blob/master/1.2/spec.md

    Args:
        file_title: the title of the XML file

    Returns:
        a tuple of the XML byte string, and its ElementTree

    Raises:
        TypeError: if a block, line or word geometry is not a pair of corner points
            (only straight bounding boxes are supported)
    """
    p_idx = self.page_idx
    # NOTE(review): assumes `dimensions` is ordered (width, height) - TODO confirm against the Page constructor
    width, height = self.dimensions
    # NOTE(review): the whole `language` object is stringified into `xml:lang` when it has a
    # 'language' key, otherwise 'en' is used - confirm this matches the schema of `Page.language`
    language = self.language if 'language' in self.language else 'en'

    def _bbox(geometry) -> str:
        """Format a relative ((xmin, ymin), (xmax, ymax)) geometry as an absolute hOCR `bbox` property."""
        if len(geometry) != 2:
            raise TypeError("XML export is only available for straight bounding boxes for now.")
        (xmin, ymin), (xmax, ymax) = geometry
        # Adjacent literals instead of a backslash continuation inside the string: a continuation
        # would embed the source indentation of the next line into the attribute value.
        return (
            f'bbox {int(round(xmin * width))} {int(round(ymin * height))} '
            f'{int(round(xmax * width))} {int(round(ymax * height))}'
        )

    # Create the XML root element
    page_hocr = ETElement('html', attrib={'xmlns': 'http://www.w3.org/1999/xhtml', 'xml:lang': str(language)})
    # Create the header / SubElements of the root element
    head = SubElement(page_hocr, 'head')
    SubElement(head, 'title').text = file_title
    SubElement(head, 'meta', attrib={'http-equiv': 'Content-Type', 'content': 'text/html; charset=utf-8'})
    SubElement(head, 'meta', attrib={'name': 'ocr-system', 'content': f"python-doctr {doctr.__version__}"})
    SubElement(head, 'meta', attrib={
        'name': 'ocr-capabilities',
        'content': 'ocr_page ocr_carea ocr_par ocr_line ocrx_word',
    })
    # Create the body
    body = SubElement(page_hocr, 'body')
    SubElement(body, 'div', attrib={
        'class': 'ocr_page',
        'id': f'page_{p_idx + 1}',
        'title': f'image; bbox 0 0 {width} {height}; ppageno 0',
    })

    # Identifiers are 1-based; line and word counters run across the whole page, not per block
    block_count = 1
    line_count = 1
    word_count = 1
    for block in self.blocks:
        block_bbox = _bbox(block.geometry)
        block_div = SubElement(body, 'div', attrib={
            'class': 'ocr_carea',
            'id': f'block_{block_count}',
            'title': block_bbox,
        })
        # Each block is exported as a single paragraph sharing the block's bounding box
        paragraph = SubElement(block_div, 'p', attrib={
            'class': 'ocr_par',
            'id': f'par_{block_count}',
            'title': block_bbox,
        })
        block_count += 1
        for line in block.lines:
            # NOTE: baseline, x_size, x_descenders, x_ascenders are currently initialized to 0
            line_span = SubElement(paragraph, 'span', attrib={
                'class': 'ocr_line',
                'id': f'line_{line_count}',
                'title': f'{_bbox(line.geometry)}; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0',
            })
            line_count += 1
            for word in line.words:
                word_span = SubElement(line_span, 'span', attrib={
                    'class': 'ocrx_word',
                    'id': f'word_{word_count}',
                    # confidence is exported as an integer percentage
                    'title': f'{_bbox(word.geometry)}; x_wconf {int(round(word.confidence * 100))}',
                })
                # set the text
                word_span.text = word.value
                word_count += 1

    return (ET.tostring(page_hocr, encoding='utf-8', method='xml'), ET.ElementTree(page_hocr))

@classmethod
def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
kwargs = {k: save_dict[k] for k in cls._exported_keys}
Expand Down Expand Up @@ -298,6 +379,17 @@ def synthesize(self, **kwargs) -> List[np.ndarray]:

return [page.synthesize() for page in self.pages]

def export_as_xml(self, **kwargs) -> List[Tuple[bytes, ET.ElementTree]]:
    """Serialize every page of the document to XML (hOCR-format)

    Args:
        **kwargs: keyword arguments forwarded unchanged to each page's `export_as_xml` call

    Returns:
        one (bytes, ElementTree) tuple per page, in the order of `self.pages`
    """
    exported = []
    for page in self.pages:
        exported.append(page.export_as_xml(**kwargs))
    return exported

@classmethod
def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
kwargs = {k: save_dict[k] for k in cls._exported_keys}
Expand Down
10 changes: 9 additions & 1 deletion test/common/test_io_elements.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pytest
import numpy as np
import pytest
from doctr.io import elements
from xml.etree.ElementTree import ElementTree


def _mock_words(size=(1., 1.), offset=(0, 0), confidence=0.9):
Expand Down Expand Up @@ -190,6 +191,10 @@ def test_page():
assert page.export() == {"blocks": [b.export() for b in blocks], "page_idx": page_idx, "dimensions": page_size,
"orientation": orientation, "language": language}

# Export XML
assert isinstance(page.export_as_xml(), tuple) and isinstance(
page.export_as_xml()[0], (bytes, bytearray)) and isinstance(page.export_as_xml()[1], ElementTree)

# Repr
assert '\n'.join(repr(page).split('\n')[:2]) == f'Page(\n dimensions={repr(page_size)}'

Expand Down Expand Up @@ -217,6 +222,9 @@ def test_document():
# Export
assert doc.export() == {"pages": [p.export() for p in pages]}

# Export XML
assert isinstance(doc.export_as_xml(), list) and len(doc.export_as_xml()) == len(pages)

# Show
doc.show([np.zeros((256, 256, 3), dtype=np.uint8) for _ in range(len(pages))], block=False)

Expand Down

0 comments on commit 92c1eeb

Please sign in to comment.