Update directory structure + add preprocessing for BAIR dataset

wilson1yan · Jun 24, 2021 · 575b75e · 575b75e
1 parent a693f20
commit 575b75e
Show file tree

Hide file tree

Showing 6 changed files with 135 additions and 4 deletions.
diff --git a/README.md b/README.md
@@ -24,7 +24,11 @@ $ DS_BUILD_SPARSE_ATTN=1 pip install deepspeed
 After installing `deepspeed`, you can train a sparse transformer by setting the flag `--attn_type sparse` in `scripts/train_videogpt.py`. The default supported sparsity configuration is an N-d strided sparsity layout; however, you can write your own arbitrary layouts to use.
 
 ## Dataset
-The default code accepts data as an HDF5 file with the specified format in `videogpt/data.py`, OR a directory format with the follow structure:
+The default code accepts data as an HDF5 file with the specified format in `videogpt/data.py`. An example of such a dataset can be constructed from the BAIR Robot data by running the script:
+```bash
+sh scripts/preprocess/bair/create_bair_dataset.sh datasets/bair
+``` 
+Alternatively, the code supports a dataset with the following directory structure:
 ```
 video_dataset/
     train/
@@ -52,7 +56,7 @@ video_dataset/
 ```
 An example of such a dataset can be constructed from [UCF-101](https://www.crcv.ucf.edu/data/UCF101.php) data by running the script 
 ```bash
-sh scripts/preprocess/create_ucf_dataset.sh datasets/ucf101
+sh scripts/preprocess/ucf101/create_ucf_dataset.sh datasets/ucf101
 ``` 
 You may need to install `unrar` and `unzip` for the code to work correctly.
 

diff --git a/scripts/preprocess/bair/bair_extract_images.py b/scripts/preprocess/bair/bair_extract_images.py
@@ -0,0 +1,66 @@
+import os
+import io
+
+import numpy as np
+from PIL import Image
+import tensorflow as tf
+
+from tensorflow.python.platform import flags
+from tensorflow.python.platform import gfile
+
+import imageio
+
+import argparse
+# Command-line interface: a single optional --data_dir flag (empty string by default).
+parser = argparse.ArgumentParser()
+parser.add_argument('--data_dir', default='', help='base directory to save processed data')
+# Parsed at import time; get_seq() and convert_data() read opt.data_dir as a module-level global.
+opt = parser.parse_args()
+
+def get_seq(dname):
+    data_dir = '%s/softmotion30_44k/%s' % (opt.data_dir, dname)
+
+    filenames = gfile.Glob(os.path.join(data_dir, '*'))
+    if not filenames:
+        raise RuntimeError('No data files found.')
+
+    for f in filenames:
+        k=0
+        for serialized_example in tf.python_io.tf_record_iterator(f):
+            example = tf.train.Example()
+            example.ParseFromString(serialized_example)
+            image_seq, action_seq = [], []
+            for i in range(30):
+                image_name = str(i) + '/image_aux1/encoded'
+                byte_str = example.features.feature[image_name].bytes_list.value[0]
+                img = Image.frombytes('RGB', (64, 64), byte_str)
+                arr = np.array(img.getdata()).reshape(img.size[1], img.size[0], 3)
+                image_seq.append(arr.reshape(1, 64, 64, 3))
+
+                action_name = str(i) + '/action'
+                action = example.features.feature[action_name].float_list.value
+                action = np.array(action).astype('float32')
+                action_seq.append(action)
+            image_seq = np.concatenate(image_seq, axis=0)
+            action_seq = np.stack(action_seq, axis=0)
+            k=k+1
+            yield f, k, image_seq, action_seq
+
+def convert_data(dname):
+    seq_generator = get_seq(dname)
+    n = 0
+    while True:
+        n+=1
+        try:
+            f, k, seq, actions = next(seq_generator)
+            seq = seq.astype('uint8')
+        except StopIteration:
+            break
+        f = f.split('/')[-1]
+        os.makedirs('%s/processed_data/%s/%s/%d/' % (opt.data_dir, dname,  f[:-10], k), exist_ok=True)
+        for i in range(len(seq)):
+            imageio.imwrite('%s/processed_data/%s/%s/%d/%d.png' % (opt.data_dir, dname,  f[:-10], k, i), seq[i])
+        np.save('%s/processed_data/%s/%s/%d/actions.npy' % (opt.data_dir, dname, f[:-10], k), actions)
+
+        print('%s data: %s (%d)  (%d)' % (dname, f, k, n))
+
+convert_data('test')
+convert_data('train')
diff --git a/scripts/preprocess/bair/bair_image_to_hdf5.py b/scripts/preprocess/bair/bair_image_to_hdf5.py
@@ -0,0 +1,50 @@
+import glob
+import argparse
+import h5py
+import numpy as np
+from PIL import Image
+import os
+import os.path as osp
+from tqdm import tqdm
+import sys
+
+def convert_data(f, split):
+    root_dir = args.data_dir
+    path = osp.join(root_dir, 'processed_data', split)
+    traj_paths = glob.glob(osp.join(path, '*', '*'))
+    trajs, actions = [], []
+    for traj_path in tqdm(traj_paths):
+        image_paths = glob.glob(osp.join(traj_path, '*.png'))
+        image_paths.sort(key=lambda x: int(osp.splitext(osp.basename(x))[0]))
+        traj = []
+        for img_path in image_paths:
+            img = Image.open(img_path)
+            arr = np.array(img) # HWC
+            traj.append(arr)
+        traj = np.stack(traj, axis=0) # THWC
+        trajs.append(traj)
+
+        actions.append(np.load(osp.join(traj_path, 'actions.npy')))
+
+    idxs = np.arange(len(trajs)) * 30
+    trajs = np.concatenate(trajs, axis=0) # (NT)HWC
+    actions = np.concatenate(actions, axis=0) # (NT)(act_dim)
+
+    f.create_dataset(f'{split}_data', data=trajs)
+    f.create_dataset(f'{split}_actions', data=actions)
+    f.create_dataset(f'{split}_idx', data=idxs)
+
+    print(split)
+    print(f'\timages: {f[f"{split}_data"].shape}, {f[f"{split}_data"].dtype}')
+    print(f'\timages: {f[f"{split}_idx"].shape}, {f[f"{split}_idx"].dtype}')
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--data_dir', type=str, required=True)
+parser.add_argument('--output_dir', type=str, required=True)
+args = parser.parse_args()
+
+os.makedirs(args.output_dir, exist_ok=True)
+f = h5py.File(osp.join(args.output_dir, 'bair.hdf5'), 'a')
+convert_data(f, 'train')
+convert_data(f, 'test')
+f.close()
diff --git a/scripts/preprocess/bair/create_bair_dataset.sh b/scripts/preprocess/bair/create_bair_dataset.sh
@@ -0,0 +1,11 @@
+#! /bin/sh
+
+mkdir -p ${1}
+mkdir -p ~/.cache/bair
+wget -P ~/.cache/bair/ http://rail.eecs.berkeley.edu/datasets/bair_robot_pushing_dataset_v0.tar
+tar -xf ~/.cache/bair/bair_robot_pushing_dataset_v0.tar -C ~/.cache/bair/
+
+python scripts/preprocess/bair/bair_extract_images.py --data_dir ~/.cache/bair
+python scripts/preprocess/bair/bair_image_to_hdf5.py --data_dir ~/.cache/bair --output_dir ${1}
+
+rm -r ~/.cache/bair
diff --git a/scripts/preprocess/create_ucf_dataset.sh → ...s/preprocess/ucf101/create_ucf_dataset.sh b/scripts/preprocess/create_ucf_dataset.sh → ...s/preprocess/ucf101/create_ucf_dataset.sh
@@ -1,6 +1,6 @@
 #! /bin/sh
 
-mkdir -p datasets/ucf101
+mkdir -p ${1}
 
 # Download UCF-101 video files
 wget --no-check-certificate -P ${1} https://www.crcv.ucf.edu/data/UCF101/UCF101.rar
@@ -13,7 +13,7 @@ unzip ${1}/UCF101TrainTestSplits-RecognitionTask.zip -d ${1}
 rm ${1}/UCF101TrainTestSplits-RecognitionTask.zip
 
 # Move video files into train / test directories based on train/test split
-python scripts/preprocess/ucf_split_train_test.py ${1} 1 
+python scripts/preprocess/ucf101/ucf_split_train_test.py ${1} 1
 
 # Delete leftover files
 rm -r ${1}/UCF-101
diff --git a/scripts/preprocess/ucf_split_train_test.py → ...preprocess/ucf101/ucf_split_train_test.py b/scripts/preprocess/ucf_split_train_test.py → ...preprocess/ucf101/ucf_split_train_test.py