From 765a2454a0b8838fbbf76f9521584a9a563d5fcb Mon Sep 17 00:00:00 2001
From: Wilson Yan
Date: Sat, 26 Jun 2021 15:15:45 -1000
Subject: [PATCH] Updates for FVD + support more pre-trained model features

---
 README.md                   | 19 ++++++++++++++-----
 scripts/compute_fvd.py      |  7 ++++---
 scripts/sample_videogpt.py  |  2 +-
 videogpt/__init__.py        |  2 +-
 videogpt/download.py        | 12 ++++++++++++
 videogpt/fvd/fvd.py         | 17 -----------------
 videogpt/fvd/pytorch_i3d.py | 20 +-------------------
 7 files changed, 33 insertions(+), 46 deletions(-)

diff --git a/README.md b/README.md
index ae77e36..947dd06 100644
--- a/README.md
+++ b/README.md
@@ -11,15 +11,15 @@ We present VideoGPT: a conceptually simple architecture for scaling likelihood b
 
 ## Installation
 Change the `cudatoolkit` version compatible to your machine.
 ```bash
-$ conda install --yes -c pytorch pytorch=1.7.1 torchvision cudatoolkit=11.0
-$ pip install git+https://github.com/wilson1yan/VideoGPT.git
+conda install --yes -c pytorch pytorch=1.7.1 torchvision cudatoolkit=11.0
+pip install git+https://github.com/wilson1yan/VideoGPT.git
 ```
 
 ### Sparse Attention (Optional)
 For limited compute scenarios, it may be beneficial to use [sparse attention](https://arxiv.org/abs/1904.10509).
 ```bash
-$ sudo apt-get install llvm-9-dev
-$ DS_BUILD_SPARSE_ATTN=1 pip install deepspeed
+sudo apt-get install llvm-9-dev
+DS_BUILD_SPARSE_ATTN=1 pip install deepspeed
 ```
 After installng `deepspeed`, you can train a sparse transformer by setting the flag `--attn_type sparse` in `scripts/train_videogpt.py`. The default supported sparsity configuration is an N-d strided sparsity layout, however, you can write your own arbitrary layouts to use.
@@ -107,6 +107,12 @@ Use the `scripts/train_vqvae.py` script to train a VQ-VAE. Execute `python scrip
 * `--resolution 128`: spatial resolution to train on
 * `--sequence_length 16`: temporal resolution, or video clip length
 
+## Using Pretrained VideoGPTs
+There are two available pre-trained VideoGPT models:
+* `bair_gpt`: single frame-conditional BAIR model using discrete encodings from the `bair_stride4x2x2` VQ-VAE
+* `ucf101_uncond_gpt`: unconditional UCF101 model using discrete encodings from the `ucf101_stride4x4x4` VQ-VAE
+Note that both pre-trained models use sparse attention. You will need to install sparse attention in order to fine-tune them; however, sampling does not require it.
+
 ## Training VideoGPT
 You can download a pretrained VQ-VAE, or train your own. Afterwards, use the `scripts/train_videogpt.py` script to train an VideoGPT model for sampling. Execute `python scripts/train_videogpt.py -h` for information on all available training settings. A subset of more relevant settings are listed below, along with default values.
 ### VideoGPT Specific Settings
@@ -132,7 +138,10 @@ You can download a pretrained VQ-VAE, or train your own. Afterwards, use the `sc
 * `--resolution 128`: spatial resolution to train on
 * `--sequence_length 16`: temporal resolution, or video clip length
 ## Sampling VideoGPT
-After training, the VideoGPT model can be sampled using the `scripts/sample_videogpt.py`. You may need to install `ffmpeg`: `sudo apt-get install ffmpeg`
+VideoGPT models can be sampled using `scripts/sample_videogpt.py`. You can specify either a path to a checkpoint saved during training or the name of a pretrained model. You may need to install `ffmpeg`: `sudo apt-get install ffmpeg`
+
+## Evaluation
+Evaluation is done primarily using [Frechet Video Distance (FVD)](https://arxiv.org/abs/1812.01717) for BAIR and Kinetics, and [Inception Score](https://arxiv.org/abs/1606.03498) for UCF-101. Inception Score can be computed by generating samples and using the code from the [TGANv2 repo](https://github.com/pfnet-research/tgan2). FVD can be computed with `python scripts/compute_fvd.py`, which runs a PyTorch port of the [original codebase](https://github.com/google-research/google-research/tree/master/frechet_video_distance).
 
 ## Reproducing Paper Results
 Note that this repo is primarily designed for simplicity and extending off of our method. Reproducing the full paper results can be done using code found at a [separate repo](https://github.com/wilson1yan/VideoGPT-Paper). However, be aware that the code is not as clean.
diff --git a/scripts/compute_fvd.py b/scripts/compute_fvd.py
index 5bcfb7f..1aca155 100644
--- a/scripts/compute_fvd.py
+++ b/scripts/compute_fvd.py
@@ -1,6 +1,7 @@
 import os
 import functools
 import argparse
+from videogpt.download import load_i3d_pretrained
 
 from tqdm import tqdm
 import numpy as np
@@ -8,7 +9,7 @@
 import torch.multiprocessing as mp
 import torch.distributed as dist
 
-from videogpt.fvd.fvd import get_fvd_logits, frechet_distance, load_fvd_model
+from videogpt.fvd.fvd import get_fvd_logits, frechet_distance
 from videogpt import VideoData, VideoGPT, load_videogpt
 
 
@@ -47,7 +48,7 @@ def main_worker(rank, size, args_in):
     loader = VideoData(args).test_dataloader()
 
     #################### Load I3D ########################################
-    i3d = load_fvd_model(device)
+    i3d = load_i3d_pretrained(device)
 
     #################### Compute FVD ###############################
     fvds = []
@@ -122,7 +123,7 @@ def eval_fvd(i3d, videogpt, loader, device):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('--ckpt', type=str, required=True)
+    parser.add_argument('--ckpt', type=str, default='bair_gpt')
     parser.add_argument('--n_trials', type=int, default=1, help="Number of trials to compute mean/std")
     parser.add_argument('--port', type=int, default=23452)
     args = parser.parse_args()
diff --git a/scripts/sample_videogpt.py b/scripts/sample_videogpt.py
index 39ca48b..2ead14a 100644
--- a/scripts/sample_videogpt.py
+++ b/scripts/sample_videogpt.py
@@ -7,7 +7,7 @@
 
 
 parser = argparse.ArgumentParser()
-parser.add_argument('--ckpt', type=str, required=None)
+parser.add_argument('--ckpt', type=str, default='ucf101_uncond_gpt')
 parser.add_argument('--n', type=int, default=8)
 args = parser.parse_args()
 n = args.n
diff --git a/videogpt/__init__.py b/videogpt/__init__.py
index e6a7d30..d41e91c 100644
--- a/videogpt/__init__.py
+++ b/videogpt/__init__.py
@@ -2,5 +2,5 @@
 from .vqvae import VQVAE
 from .gpt import VideoGPT
 from .data import VideoData
-from .download import load_vqvae, load_videogpt, download
+from .download import load_vqvae, load_videogpt, load_i3d_pretrained, download
 
diff --git a/videogpt/download.py b/videogpt/download.py
index 3b7fa48..fbfa640 100644
--- a/videogpt/download.py
+++ b/videogpt/download.py
@@ -64,6 +64,7 @@ def load_vqvae(model_name, device=torch.device('cpu')):
 
 _VIDEOGPT = {
     'bair_gpt': '1fNTtJAgO6grEtPNrufkpbee1CfGztW-1', # 1-frame conditional, 16 frames of 64 x 64 images
+    'ucf101_uncond_gpt': '1QkF_Sb2XVRgSbFT_SxQ6aZUeDFoliPQq', # unconditional, 16 frames of 128 x 128 images
 }
 
 def load_videogpt(model_name, device=torch.device('cpu')):
@@ -73,3 +74,14 @@ def load_videogpt(model_name, device=torch.device('cpu')):
     gpt.eval()
 
     return gpt
+
+
+_I3D_PRETRAINED_ID = '1mQK8KD8G6UWRa5t87SRMm5PVXtlpneJT'
+
+def load_i3d_pretrained(device=torch.device('cpu')):
+    from .fvd.pytorch_i3d import InceptionI3d
+    i3d = InceptionI3d(400, in_channels=3).to(device)
+    filepath = download(_I3D_PRETRAINED_ID, 'i3d_pretrained_400.pt')
+    i3d.load_state_dict(torch.load(filepath, map_location=device))
+    i3d.eval()
+    return i3d
diff --git a/videogpt/fvd/fvd.py b/videogpt/fvd/fvd.py
index af6b861..7542f4c 100644
--- a/videogpt/fvd/fvd.py
+++ b/videogpt/fvd/fvd.py
@@ -1,13 +1,5 @@
-import argparse
-import numpy as np
-
 import torch
-import torch.nn.functional as F
-import torch.utils.data as data
-
 from ..data import preprocess as preprocess_single
-from .pytorch_i3d import InceptionI3d
-import os
 
 
 def preprocess(videos, target_resolution=224):
@@ -22,15 +14,6 @@ def get_fvd_logits(videos, i3d, device):
     embeddings = get_logits(i3d, videos, device)
     return embeddings
 
-def load_fvd_model(device):
-    i3d = InceptionI3d(400, in_channels=3).to(device)
-    current_dir = os.path.dirname(os.path.abspath(__file__))
-    i3d_path = os.path.join(current_dir, 'i3d_pretrained_400.pt')
-    i3d.load_state_dict(torch.load(i3d_path, map_location=device))
-    i3d.eval()
-    return i3d
-
-
 # https://github.com/tensorflow/gan/blob/de4b8da3853058ea380a6152bd3bd454013bf619/tensorflow_gan/python/eval/classifier_metrics.py#L161
 def _symmetric_matrix_square_root(mat, eps=1e-10):
     u, s, v = torch.svd(mat)
diff --git a/videogpt/fvd/pytorch_i3d.py b/videogpt/fvd/pytorch_i3d.py
index f12faaf..77d4830 100644
--- a/videogpt/fvd/pytorch_i3d.py
+++ b/videogpt/fvd/pytorch_i3d.py
@@ -1,16 +1,9 @@
-# https://github.com/piergiaj/pytorch-i3d
+# Original code from https://github.com/piergiaj/pytorch-i3d
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from torch.autograd import Variable
-
 import numpy as np
 
-import os
-import sys
-from collections import OrderedDict
-
-
 
 
 class MaxPool3dSamePadding(nn.MaxPool3d):
@@ -22,15 +15,12 @@ def compute_pad(self, dim, s):
     def forward(self, x):
         # compute 'same' padding
         (batch, channel, t, h, w) = x.size()
-        #print t,h,w
         out_t = np.ceil(float(t) / float(self.stride[0]))
         out_h = np.ceil(float(h) / float(self.stride[1]))
         out_w = np.ceil(float(w) / float(self.stride[2]))
-        #print out_t, out_h, out_w
         pad_t = self.compute_pad(0, t)
         pad_h = self.compute_pad(1, h)
         pad_w = self.compute_pad(2, w)
-        #print pad_t, pad_h, pad_w
 
         pad_t_f = pad_t // 2
         pad_t_b = pad_t - pad_t_f
@@ -40,8 +30,6 @@ def forward(self, x):
         pad_w_b = pad_w - pad_w_f
 
         pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b)
-        #print x.size()
-        #print pad
         x = F.pad(x, pad)
         return super(MaxPool3dSamePadding, self).forward(x)
 
@@ -90,15 +78,12 @@ def compute_pad(self, dim, s):
     def forward(self, x):
         # compute 'same' padding
         (batch, channel, t, h, w) = x.size()
-        #print t,h,w
         out_t = np.ceil(float(t) / float(self._stride[0]))
         out_h = np.ceil(float(h) / float(self._stride[1]))
         out_w = np.ceil(float(w) / float(self._stride[2]))
-        #print out_t, out_h, out_w
         pad_t = self.compute_pad(0, t)
         pad_h = self.compute_pad(1, h)
         pad_w = self.compute_pad(2, w)
-        #print pad_t, pad_h, pad_w
 
         pad_t_f = pad_t // 2
         pad_t_b = pad_t - pad_t_f
@@ -108,10 +93,7 @@ def forward(self, x):
         pad_w_b = pad_w - pad_w_f
 
         pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b)
-        #print x.size()
-        #print pad
         x = F.pad(x, pad)
-        #print x.size()
 
         x = self.conv3d(x)
         if self._use_batch_norm:
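
For reference, a minimal sketch (not part of the patch itself) of how the entry points introduced here would be used from Python; it assumes the package is installed per the README and picks the GPU if one is available. The model names are the ones registered in `videogpt/download.py` above.

```python
import torch

# `load_videogpt` and `load_i3d_pretrained` are exported from videogpt/__init__.py
# by this change; both download a checkpoint on first use and return an eval-mode model.
from videogpt import load_videogpt, load_i3d_pretrained

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pre-trained VideoGPT: 'bair_gpt' or 'ucf101_uncond_gpt'. Per the README note,
# sparse attention only needs to be installed for fine-tuning, not for sampling.
gpt = load_videogpt('ucf101_uncond_gpt', device=device)

# I3D network used by scripts/compute_fvd.py for FVD computation.
i3d = load_i3d_pretrained(device=device)
```

Sampling and evaluation are still driven by the scripts, e.g. `python scripts/sample_videogpt.py --ckpt ucf101_uncond_gpt --n 8` and `python scripts/compute_fvd.py --ckpt bair_gpt`, which now default to these pretrained checkpoints.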