[datasets] Add IIIT HWS dataset #1199

Merged 2 commits on May 17, 2023
1 change: 1 addition & 0 deletions docs/source/index.rst
@@ -64,6 +64,7 @@ Supported datasets
* IC13 from `ICDAR 2013 <http://dagdata.cvc.uab.es/icdar2013competition/>`_.
* IMGUR5K from `"TextStyleBrush: Transfer of Text Aesthetics from a Single Example" <https://github.com/facebookresearch/IMGUR5K-Handwriting-Dataset>`_.
* MJSynth from `"Synthetic Data and Artificial Neural Networks for Natural Scene Text Recognition" <https://www.robots.ox.ac.uk/~vgg/data/text/>`_.
* IIITHWS from `"Generating Synthetic Data for Text Recognition" <https://github.com/kris314/hwnet>`_.


.. toctree::
2 changes: 2 additions & 0 deletions docs/source/modules/datasets.rst
@@ -30,6 +30,8 @@ doctr.datasets

.. autoclass:: MJSynth

.. autoclass:: IIITHWS

.. autoclass:: DocArtefacts

Synthetic dataset generator
6 changes: 4 additions & 2 deletions docs/source/using_doctr/using_datasets.rst
@@ -58,7 +58,7 @@ Recognition
These datasets contain the information needed to train or validate a text recognition model.

+-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+
| **Dataset**                 | **Train Samples**               | **Test Samples**                | **Information**                             |
+=============================+=================================+=================================+=============================================+
| FUNSD | 21888 | 8707 | english |
+-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+
@@ -80,7 +80,9 @@ These datasets contain the information needed to train or validate a text recognition model
+-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+
| IMGUR5K | 207901 | 22672 | english / handwritten / external resources |
+-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+
| MJSynth                     | 7581382                         | 1337891                         | english / external resources                |
+-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+
| IIITHWS | 7141797 | 793533 | english / handwritten / external resources |
+-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+

.. code:: python3
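(The body of this code block is collapsed in the diff view. As a minimal sketch of the documented loading pattern applied to the new dataset, with illustrative paths rather than anything taken from this diff:)

.. code:: python3

    from doctr.datasets import IIITHWS

    # Train split: a shuffled 90% of the annotations (see iiithws.py below)
    train_set = IIITHWS(
        img_folder="/path/to/iiit-hws/Images_90K_Normalized",
        label_path="/path/to/IIIT-HWS-90K.txt",
        train=True,
    )
    img, target = train_set[0]  # target is the word transcription as a plain string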
1 change: 1 addition & 0 deletions doctr/datasets/__init__.py
@@ -8,6 +8,7 @@
from .ic03 import *
from .ic13 import *
from .iiit5k import *
from .iiithws import *
from .imgur5k import *
from .mjsynth import *
from .ocr import *
74 changes: 74 additions & 0 deletions doctr/datasets/iiithws.py
@@ -0,0 +1,74 @@
# Copyright (C) 2021-2023, Mindee.

# This program is licensed under the Apache License 2.0.
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.

import os
from random import sample
from typing import Any, List, Tuple

from tqdm import tqdm

from .datasets import AbstractDataset

__all__ = ["IIITHWS"]


class IIITHWS(AbstractDataset):
    """IIITHWS dataset from `"Generating Synthetic Data for Text Recognition"
    <https://arxiv.org/pdf/1608.04224.pdf>`_ | `"repository" <https://github.com/kris314/hwnet>`_ |
    `"website" <https://cvit.iiit.ac.in/research/projects/cvit-projects/matchdocimgs>`_.

    >>> # NOTE: This is a pure recognition dataset without bounding box labels.
    >>> # NOTE: You need to download the dataset.
    >>> from doctr.datasets import IIITHWS
    >>> train_set = IIITHWS(img_folder="/path/to/iiit-hws/Images_90K_Normalized",
    >>>                     label_path="/path/to/IIIT-HWS-90K.txt",
    >>>                     train=True)
    >>> img, target = train_set[0]
    >>> test_set = IIITHWS(img_folder="/path/to/iiit-hws/Images_90K_Normalized",
    >>>                    label_path="/path/to/IIIT-HWS-90K.txt",
    >>>                    train=False)
    >>> img, target = test_set[0]

    Args:
        img_folder: folder with all the images of the dataset
        label_path: path to the file with the labels
        train: whether the subset should be the training one
        **kwargs: keyword arguments from `AbstractDataset`.
    """

    def __init__(
        self,
        img_folder: str,
        label_path: str,
        train: bool = True,
        **kwargs: Any,
    ) -> None:
        super().__init__(img_folder, **kwargs)

        # File existence check
        if not os.path.exists(label_path) or not os.path.exists(img_folder):
            raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}")

        self.data: List[Tuple[str, str]] = []
        self.train = train

        with open(label_path) as f:
            annotations = f.readlines()

        # Shuffle the annotations before splitting; otherwise the deterministic 90/10 slice
        # would hand the test set long runs of the same label
        annotations = sample(annotations, len(annotations))
        train_samples = int(len(annotations) * 0.9)
        set_slice = slice(train_samples) if self.train else slice(train_samples, None)

        for annotation in tqdm(
            iterable=annotations[set_slice], desc="Unpacking IIITHWS", total=len(annotations[set_slice])
        ):
            img_path, label = annotation.split()[0:2]
            img_path = os.path.join(img_folder, img_path)

            self.data.append((img_path, label))

    def extra_repr(self) -> str:
        return f"train={self.train}"
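Beyond the docstring example, a quick usage sketch showing the dataset feeding a standard PyTorch DataLoader (assuming the PyTorch backend; the batch size and collate function are illustrative choices, not part of this PR):

from torch.utils.data import DataLoader

from doctr.datasets import IIITHWS
from doctr.transforms import Resize


def collate(batch):
    # Keep the word transcriptions as a list of strings next to the image batch
    images, labels = zip(*batch)
    return list(images), list(labels)


train_set = IIITHWS(
    img_folder="/path/to/iiit-hws/Images_90K_Normalized",
    label_path="/path/to/IIIT-HWS-90K.txt",
    train=True,
    img_transforms=Resize((32, 128), preserve_aspect_ratio=True),
)
loader = DataLoader(train_set, batch_size=64, collate_fn=collate)
images, labels = next(iter(loader))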
27 changes: 27 additions & 0 deletions tests/conftest.py
@@ -627,3 +627,30 @@ def mock_mjsynth_dataset(tmpdir_factory, mock_image_stream):
        with open(fn, "wb") as f:
            f.write(file.getbuffer())
    return str(root), str(label_file)


@pytest.fixture(scope="session")
def mock_iiithws_dataset(tmpdir_factory, mock_image_stream):
    root = tmpdir_factory.mktemp("datasets")
    iiithws_root = root.mkdir("iiit-hws")
    image_folder = iiithws_root.mkdir("Images_90K_Normalized")
    image_sub_folder = image_folder.mkdir("1")
    label_file = iiithws_root.join("IIIT-HWS-90K.txt")
    # Each annotation line carries trailing fields beyond the image path and label;
    # IIITHWS only reads the first two whitespace-separated tokens
    labels = [
        "./iiit-hws/Images_90K_Normalized/1/499_5_3_0_0.png I 1 0\n",
        "./iiit-hws/Images_90K_Normalized/1/117_1_3_0_0.png am 1 0\n",
        "./iiit-hws/Images_90K_Normalized/1/80_7_3_0_0.png a 1 0\n",
        "./iiit-hws/Images_90K_Normalized/1/585_3_2_0_0.png Jedi 1 0\n",
        "./iiit-hws/Images_90K_Normalized/1/222_5_3_0_0.png ! 1 0\n",
    ]

    with open(label_file, "w") as f:
        for label in labels:
            f.write(label)

    file = BytesIO(mock_image_stream)
    for label in labels:
        fn = image_sub_folder.join(label.split()[0].split("/")[-1])
        with open(fn, "wb") as f:
            f.write(file.getbuffer())
    return str(root), str(label_file)
12 changes: 12 additions & 0 deletions tests/pytorch/test_datasets_pt.py
@@ -574,3 +574,15 @@ def test_mjsynth_dataset(mock_mjsynth_dataset):
    assert len(ds) == 4  # Actual set has 7581382 train and 1337891 test samples
    assert repr(ds) == f"MJSynth(train={True})"
    _validate_dataset_recognition_part(ds, input_size)


def test_iiithws_dataset(mock_iiithws_dataset):
    input_size = (32, 128)
    ds = datasets.IIITHWS(
        *mock_iiithws_dataset,
        img_transforms=Resize(input_size, preserve_aspect_ratio=True),
    )

    # The fixture writes 5 annotations, so the 90% train slice keeps int(5 * 0.9) = 4 samples
    assert len(ds) == 4  # Actual set has 7141797 train and 793533 test samples
    assert repr(ds) == f"IIITHWS(train={True})"
    _validate_dataset_recognition_part(ds, input_size)
12 changes: 12 additions & 0 deletions tests/tensorflow/test_datasets_tf.py
@@ -548,3 +548,15 @@ def test_mjsynth_dataset(mock_mjsynth_dataset):
    assert len(ds) == 4  # Actual set has 7581382 train and 1337891 test samples
    assert repr(ds) == f"MJSynth(train={True})"
    _validate_dataset_recognition_part(ds, input_size)


def test_iiithws_dataset(mock_iiithws_dataset):
    input_size = (32, 128)
    ds = datasets.IIITHWS(
        *mock_iiithws_dataset,
        img_transforms=Resize(input_size, preserve_aspect_ratio=True),
    )

    # The fixture writes 5 annotations, so the 90% train slice keeps int(5 * 0.9) = 4 samples
    assert len(ds) == 4  # Actual set has 7141797 train and 793533 test samples
    assert repr(ds) == f"IIITHWS(train={True})"
    _validate_dataset_recognition_part(ds, input_size)