Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[datasets] Add MJSynth (Synth90K) #827

Merged
merged 33 commits into from
Apr 28, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
81c313e
backup
felixdittrich92 Jan 11, 2022
50574b5
Merge branch 'mindee:main' into main
felixdittrich92 Jan 11, 2022
5a6ed54
Merge branch 'mindee:main' into main
felixdittrich92 Jan 18, 2022
b9958a7
Merge branch 'mindee:main' into main
felixdittrich92 Jan 20, 2022
14c4651
Merge branch 'mindee:main' into main
felixdittrich92 Feb 16, 2022
779731f
Merge branch 'mindee:main' into main
felixdittrich92 Feb 18, 2022
ce2cdda
Merge branch 'mindee:main' into main
felixdittrich92 Feb 22, 2022
d13dc43
Merge branch 'mindee:main' into main
felixdittrich92 Feb 23, 2022
9a07d73
Merge branch 'mindee:main' into main
felixdittrich92 Feb 24, 2022
a002a70
Merge branch 'mindee:main' into main
felixdittrich92 Feb 24, 2022
6ad096e
Merge branch 'mindee:main' into main
felixdittrich92 Feb 25, 2022
1e77fd4
Merge branch 'mindee:main' into main
felixdittrich92 Mar 8, 2022
2be762c
Merge branch 'mindee:main' into main
felixdittrich92 Mar 10, 2022
e2f2055
Merge branch 'mindee:main' into main
felixdittrich92 Mar 11, 2022
bdc4e67
Merge branch 'mindee:main' into main
felixdittrich92 Mar 16, 2022
b525021
Merge branch 'mindee:main' into main
felixdittrich92 Mar 16, 2022
417a27b
Merge branch 'mindee:main' into main
felixdittrich92 Mar 16, 2022
9b3f5a1
Merge branch 'mindee:main' into main
felixdittrich92 Mar 18, 2022
93074a8
Merge branch 'mindee:main' into main
felixdittrich92 Mar 21, 2022
c64e209
Merge branch 'mindee:main' into main
felixdittrich92 Mar 22, 2022
fdc8381
Merge branch 'mindee:main' into main
felixdittrich92 Mar 25, 2022
bd68b07
Merge branch 'mindee:main' into main
felixdittrich92 Apr 5, 2022
7ac6ee2
Merge branch 'mindee:main' into main
felixdittrich92 Apr 5, 2022
1c79f32
Merge branch 'mindee:main' into main
felixdittrich92 Apr 7, 2022
45e43ac
Merge branch 'mindee:main' into main
felixdittrich92 Apr 13, 2022
53ba4b9
Merge branch 'mindee:main' into main
felixdittrich92 Apr 22, 2022
96b597c
Merge branch 'mindee:main' into main
felixdittrich92 Apr 27, 2022
5eebf06
add mjsynth loader
felixdittrich92 Feb 21, 2022
c1b714e
apply changes
felixdittrich92 Feb 21, 2022
d7bbe81
rename
felixdittrich92 Feb 21, 2022
4f45239
update
felixdittrich92 Mar 25, 2022
7877ac5
update
felixdittrich92 Apr 27, 2022
d42214e
fix tests
felixdittrich92 Apr 27, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
apply changes
  • Loading branch information
felixdittrich92 committed Apr 27, 2022
commit c1b714e7ee5398278ece95de90d15a64e029e9b9
24 changes: 12 additions & 12 deletions doctr/datasets/mjsynth.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,46 +22,46 @@ class MJSynth(AbstractDataset):
>>> # NOTE: You need to download the dataset.
>>> from doctr.datasets import MJSynth
>>> train_set = MJSynth(img_folder="/path/to/mjsynth/mnt/ramdisk/max/90kDICT32px",
>>> label_folder="/path/to/mjsynth/mnt/ramdisk/max/90kDICT32px/imlist.txt",
>>> train=True)
>>> label_path="/path/to/mjsynth/mnt/ramdisk/max/90kDICT32px/imlist.txt",
>>> train=True)
>>> img, target = train_set[0]
>>> test_set = MJSynth(img_folder="/path/to/mjsynth/mnt/ramdisk/max/90kDICT32px",
>>> labels_path="/path/to/mjsynth/mnt/ramdisk/max/90kDICT32px/imlist.txt")
>>> train=False)
>>> label_path="/path/to/mjsynth/mnt/ramdisk/max/90kDICT32px/imlist.txt",
>>> train=False)
>>> img, target = test_set[0]

Args:
img_folder: folder with all the images of the dataset
labels_path: folder with all annotation files for the images
label_path: path to the file with the labels
train: whether the subset should be the training one
**kwargs: keyword arguments from `AbstractDataset`.
"""

def __init__(
self,
img_folder: str,
labels_path: str,
label_path: str,
train: bool = True,
**kwargs: Any,
) -> None:
super().__init__(img_folder, **kwargs)

# File existence check
if not os.path.exists(labels_path) or not os.path.exists(img_folder):
if not os.path.exists(label_path) or not os.path.exists(img_folder):
raise FileNotFoundError(
f"unable to locate {labels_path if not os.path.exists(labels_path) else img_folder}")
f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}")

self.data: List[Tuple[str, Dict[str, str]]] = []
self.data: List[Tuple[str, Dict[str, Any]]] = []
self.train = train

with open(labels_path) as f:
with open(label_path) as f:
img_paths = f.readlines()

train_samples = int(len(img_paths) * 0.85)
train_samples = int(len(img_paths) * 0.9)
set_slice = slice(train_samples) if self.train else slice(train_samples, None)

for path in tqdm(iterable=img_paths[set_slice], desc='Unpacking MJSynth', total=len(img_paths[set_slice])):
label = path.split('_')[1]
label = [path.split('_')[1]]
img_path = os.path.join(img_folder, path[2:]).strip()

self.data.append((img_path, dict(labels=label)))
Expand Down
18 changes: 9 additions & 9 deletions tests/pytorch/test_datasets_pt.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,25 +10,26 @@
from doctr.transforms import Resize


def _validate_dataset(ds, input_size, batch_size=2, class_indices=False, is_polygons=False):
def _validate_dataset(ds, input_size, batch_size=2, class_indices=False, is_polygons=False, has_boxes=True):

# Fetch one sample
img, target = ds[0]
assert isinstance(img, torch.Tensor)
assert img.shape == (3, *input_size)
assert img.dtype == torch.float32
assert isinstance(target, dict)
assert isinstance(target['boxes'], np.ndarray) and target['boxes'].dtype == np.float32
if is_polygons:
assert target['boxes'].ndim == 3 and target['boxes'].shape[1:] == (4, 2)
else:
assert target['boxes'].ndim == 2 and target['boxes'].shape[1:] == (4,)
assert np.all(np.logical_and(target['boxes'] <= 1, target['boxes'] >= 0))
if has_boxes:
assert isinstance(target['boxes'], np.ndarray) and target['boxes'].dtype == np.float32
if is_polygons:
assert target['boxes'].ndim == 3 and target['boxes'].shape[1:] == (4, 2)
else:
assert target['boxes'].ndim == 2 and target['boxes'].shape[1:] == (4,)
assert np.all(np.logical_and(target['boxes'] <= 1, target['boxes'] >= 0))
assert len(target['labels']) == len(target['boxes'])
if class_indices:
assert isinstance(target['labels'], np.ndarray) and target['labels'].dtype == np.int64
else:
assert isinstance(target['labels'], list) and all(isinstance(s, str) for s in target['labels'])
assert len(target['labels']) == len(target['boxes'])

# Check batching
loader = DataLoader(
Expand Down Expand Up @@ -517,6 +518,5 @@ def test_mjsynth_dataset(mock_mjsynth_dataset):
)

assert len(ds) == 4 # Actual set has 8027345 train and 891928 test samples (90/10 split of 8919273 entries)
image, target = ds[0]
assert repr(ds) == f"MJSynth(train={True})"
_validate_dataset_recognition_part(ds, input_size)
18 changes: 9 additions & 9 deletions tests/tensorflow/test_datasets_tf.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,25 +10,26 @@
from doctr.transforms import Resize


def _validate_dataset(ds, input_size, batch_size=2, class_indices=False, is_polygons=False):
def _validate_dataset(ds, input_size, batch_size=2, class_indices=False, is_polygons=False, has_boxes=True):

# Fetch one sample
img, target = ds[0]
assert isinstance(img, tf.Tensor)
assert img.shape == (*input_size, 3)
assert img.dtype == tf.float32
assert isinstance(target, dict)
assert isinstance(target['boxes'], np.ndarray) and target['boxes'].dtype == np.float32
if is_polygons:
assert target['boxes'].ndim == 3 and target['boxes'].shape[1:] == (4, 2)
else:
assert target['boxes'].ndim == 2 and target['boxes'].shape[1:] == (4,)
assert np.all(np.logical_and(target['boxes'] <= 1, target['boxes'] >= 0))
if has_boxes:
assert isinstance(target['boxes'], np.ndarray) and target['boxes'].dtype == np.float32
if is_polygons:
assert target['boxes'].ndim == 3 and target['boxes'].shape[1:] == (4, 2)
else:
assert target['boxes'].ndim == 2 and target['boxes'].shape[1:] == (4,)
assert np.all(np.logical_and(target['boxes'] <= 1, target['boxes'] >= 0))
assert len(target['labels']) == len(target['boxes'])
if class_indices:
assert isinstance(target['labels'], np.ndarray) and target['labels'].dtype == np.int64
else:
assert isinstance(target['labels'], list) and all(isinstance(s, str) for s in target['labels'])
assert len(target['labels']) == len(target['boxes'])

# Check batching
loader = DataLoader(ds, batch_size=batch_size)
Expand Down Expand Up @@ -502,6 +503,5 @@ def test_mjsynth_dataset(mock_mjsynth_dataset):
)

assert len(ds) == 4 # Actual set has 8027345 train and 891928 test samples (90/10 split of 8919273 entries)
image, target = ds[0]
assert repr(ds) == f"MJSynth(train={True})"
_validate_dataset_recognition_part(ds, input_size)