diff --git a/docs/source/datasets.rst b/docs/source/datasets.rst
index 7d68e136d3..6a5a346d8a 100644
--- a/docs/source/datasets.rst
+++ b/docs/source/datasets.rst
@@ -22,6 +22,7 @@ Here are all datasets that are available through docTR:
 .. autoclass:: IIIT5K
 .. autoclass:: SVT
 .. autoclass:: SynthText
+.. autoclass:: IC03
 
 
 Data Loading
diff --git a/doctr/datasets/__init__.py b/doctr/datasets/__init__.py
index bef74405df..8ba266c0eb 100644
--- a/doctr/datasets/__init__.py
+++ b/doctr/datasets/__init__.py
@@ -5,6 +5,7 @@
 from .detection import *
 from .doc_artefacts import *
 from .funsd import *
+from .ic03 import *
 from .iiit5k import *
 from .ocr import *
 from .recognition import *
diff --git a/doctr/datasets/ic03.py b/doctr/datasets/ic03.py
new file mode 100644
index 0000000000..75cf387014
--- /dev/null
+++ b/doctr/datasets/ic03.py
@@ -0,0 +1,100 @@
+# Copyright (C) 2021, Mindee.
+
+# This program is licensed under the Apache License version 2.
+# See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
+
+import os
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+import defusedxml.ElementTree as ET
+import numpy as np
+
+from .datasets import VisionDataset
+
+__all__ = ['IC03']
+
+
+class IC03(VisionDataset):
+    """IC03 dataset from `"ICDAR 2003 Robust Reading Competitions: Entries, Results and Future Directions"
+    <http://www.iapr-tc11.org/mediawiki/index.php?title=ICDAR_2003_Robust_Reading_Competitions>`_.
+
+    Example::
+        >>> from doctr.datasets import IC03
+        >>> train_set = IC03(train=True, download=True)
+        >>> img, target = train_set[0]
+
+    Args:
+        train: whether the subset should be the training one
+        sample_transforms: composable transformations that will be applied to each image
+        rotated_bbox: whether polygons should be considered as rotated bounding box (instead of straight ones)
+        **kwargs: keyword arguments from `VisionDataset`.
+    """
+
+    TRAIN = ('http://www.iapr-tc11.org/dataset/ICDAR2003_RobustReading/TrialTrain/scene.zip',
+             '9d86df514eb09dd693fb0b8c671ef54a0cfe02e803b1bbef9fc676061502eb94',
+             'ic03_train.zip')
+    TEST = ('http://www.iapr-tc11.org/dataset/ICDAR2003_RobustReading/TrialTest/scene.zip',
+            'dbc4b5fd5d04616b8464a1b42ea22db351ee22c2546dd15ac35611857ea111f8',
+            'ic03_test.zip')
+
+    def __init__(
+        self,
+        train: bool = True,
+        sample_transforms: Optional[Callable[[Any], Any]] = None,
+        rotated_bbox: bool = False,
+        **kwargs: Any,
+    ) -> None:
+
+        url, sha256, file_name = self.TRAIN if train else self.TEST
+        super().__init__(url, file_name, sha256, True, **kwargs)
+        self.sample_transforms = sample_transforms
+        self.train = train
+        self.data: List[Tuple[str, Dict[str, Any]]] = []
+        np_dtype = np.float32
+
+        # Load xml data
+        tmp_root = os.path.join(self.root, 'SceneTrialTrain' if self.train else 'SceneTrialTest')
+        xml_tree = ET.parse(os.path.join(tmp_root, 'words.xml'))
+        xml_root = xml_tree.getroot()
+
+        for image in xml_root:
+            name, resolution, rectangles = image
+
+            # File existence check
+            if not os.path.exists(os.path.join(tmp_root, name.text)):
+                raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, name.text)}")
+
+            if rotated_bbox:
+                # x_center, y_center, width, height, 0
+                _boxes = [
+                    [float(rect.attrib['x']) + float(rect.attrib['width']) / 2,
+                     float(rect.attrib['y']) + float(rect.attrib['height']) / 2,
+                     float(rect.attrib['width']), float(rect.attrib['height']), float(rect.attrib['rotation'])]
+                    for rect in rectangles
+                ]
+            else:
+                # x_min, y_min, x_max, y_max
+                _boxes = [
+                    [float(rect.attrib['x']), float(rect.attrib['y']),
+                     float(rect.attrib['x']) + float(rect.attrib['width']),
+                     float(rect.attrib['y']) + float(rect.attrib['height'])]
+                    for rect in rectangles
+                ]
+
+            # filter images without boxes
+            if len(_boxes) > 0:
+                # Convert them to relative
+                w, h = int(resolution.attrib['x']), int(resolution.attrib['y'])
+                boxes = np.asarray(_boxes, dtype=np_dtype)
+                boxes[:, [0, 2]] /= w
+                boxes[:, [1, 3]] /= h
+
+                # Get the labels
+                labels = [lab.text for rect in rectangles for lab in rect if lab.text]
+
+                self.data.append((name.text, dict(boxes=boxes, labels=labels)))
+
+        self.root = tmp_root
+
+    def extra_repr(self) -> str:
+        return f"train={self.train}"
diff --git a/doctr/datasets/svt.py b/doctr/datasets/svt.py
index 61c3f13ba2..e2bb9727df 100644
--- a/doctr/datasets/svt.py
+++ b/doctr/datasets/svt.py
@@ -61,13 +61,15 @@ def __init__(
                 raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, name.text)}")
             if rotated_bbox:
+                # x_center, y_center, width, height, 0
                 _boxes = [
                     [float(rect.attrib['x']) + float(rect.attrib['width']) / 2,
                      float(rect.attrib['y']) + float(rect.attrib['height']) / 2,
-                     float(rect.attrib['width']), float(rect.attrib['height'])]
+                     float(rect.attrib['width']), float(rect.attrib['height']), 0.0]
                     for rect in rectangles
                 ]
             else:
+                # x_min, y_min, x_max, y_max
                 _boxes = [
                     [float(rect.attrib['x']), float(rect.attrib['y']),
                      float(rect.attrib['x']) + float(rect.attrib['width']),
                      float(rect.attrib['y']) + float(rect.attrib['height'])]
diff --git a/tests/pytorch/test_datasets_pt.py b/tests/pytorch/test_datasets_pt.py
index a1ca8a54f0..f9d3d39569 100644
--- a/tests/pytorch/test_datasets_pt.py
+++ b/tests/pytorch/test_datasets_pt.py
@@ -34,6 +34,8 @@ def test_visiondataset():
         ['SVT', False, [512, 512], 249, False],
         ['SynthText', True, [512, 512], 27, True],  # Actual set has 772875 samples
         ['SynthText', False, [512, 512], 3, False],  # Actual set has 85875 samples
+        ['IC03', True, [512, 512], 246, True],
+        ['IC03', False, [512, 512], 249, False],
     ],
 )
 def test_dataset(dataset_name, train, input_size, size, rotate):
diff --git a/tests/tensorflow/test_datasets_tf.py b/tests/tensorflow/test_datasets_tf.py
index 08c6067e9a..ee42a76d04 100644
--- a/tests/tensorflow/test_datasets_tf.py
+++ b/tests/tensorflow/test_datasets_tf.py
@@ -24,6 +24,8 @@
         ['SVT', False, [512, 512], 249, False],
         ['SynthText', True, [512, 512], 27, True],  # Actual set has 772875 samples
         ['SynthText', False, [512, 512], 3, False],  # Actual set has 85875 samples
+        ['IC03', True, [512, 512], 246, True],
+        ['IC03', False, [512, 512], 249, False],
     ],
 )
 def test_dataset(dataset_name, train, input_size, size, rotate):