Commit 4d63133

Compatible with imagebind models

luyao-cv committed Sep 21, 2023
1 parent 8e77192 commit 4d63133

Showing 7 changed files with 163 additions and 24 deletions.
32 changes: 16 additions & 16 deletions applications/Audio2Img/README.md
@@ -44,9 +44,9 @@ example: Use audio generate image across modalities (e.g. Image, Text and Audio)
cd applications/Audio2Img

python audio2img_imagebind.py \
- --model_name_or_path The dir name of imagebind checkpoint. \
- --stable_unclip_model_name_or_path The dir name of StableUnCLIPImg2ImgPipeline pretrained checkpoint. \
- --input_audio an audio file. \
+ --model_name_or_path imagebind-1.2b/ \
+ --stable_unclip_model_name_or_path stabilityai/stable-diffusion-2-1-unclip \
+ --input_audio https://paddlenlp.bj.bcebos.com/models/community/paddlemix/audio-files/bird_audio.wav \
```

----
@@ -60,14 +60,14 @@ python audio2img_imagebind.py \
cd applications/Audio2Img

python audio2img_imagebind.py \
- --model_name_or_path The dir name of imagebind checkpoint. \
- --stable_unclip_model_name_or_path The dir name of StableUnCLIPImg2ImgPipeline pretrained checkpoint. \
- --input_audio bird_audio.wav \
+ --model_name_or_path imagebind-1.2b/ \
+ --stable_unclip_model_name_or_path stabilityai/stable-diffusion-2-1-unclip \
+ --input_audio https://paddlenlp.bj.bcebos.com/models/community/paddlemix/audio-files/bird_audio.wav \
```
#### 3.1.2 Result
| Input Audio | Output Image |
| --- | --- |
- |[bird_audio.wav](https://github.com/luyao-cv/file_download/blob/main/assets/bird_audio.wav)| ![audio2img_output_bird](https://github.com/luyao-cv/file_download/blob/main/vis_audio2img/audio2img_output_bird.jpg) |
+ |[bird_audio.wav](https://github.com/luyao-cv/file_download/blob/main/assets/bird_audio.wav)| ![audio2img_output_bird](https://paddlenlp.bj.bcebos.com/models/community/paddlemix/audio-files/audio2img_output_bird.jpg) |


#### Audio+Text to Image
@@ -76,15 +76,15 @@ python audio2img_imagebind.py \
cd applications/Audio2Img

python audio2img_imagebind.py \
- --model_name_or_path The dir name of imagebind checkpoint. \
- --stable_unclip_model_name_or_path The dir name of StableUnCLIPImg2ImgPipeline pretrained checkpoint. \
- --input_audio bird_audio.wav \
+ --model_name_or_path imagebind-1.2b/ \
+ --stable_unclip_model_name_or_path stabilityai/stable-diffusion-2-1-unclip \
+ --input_audio https://paddlenlp.bj.bcebos.com/models/community/paddlemix/audio-files/bird_audio.wav \
--input_text 'A photo.' \
```
#### 3.2.2 Result
| Input Audio | Input Text | Output Image |
| --- | --- | --- |
- |[bird_audio.wav](https://github.com/luyao-cv/file_download/blob/main/assets/bird_audio.wav) | 'A photo.' | ![audio_text_to_img_output_bird_a_photo](https://github.com/luyao-cv/file_download/blob/main/vis_audio2img/audio_text_to_img_output_bird_a_photo.jpg)
+ |[bird_audio.wav](https://paddlenlp.bj.bcebos.com/models/community/paddlemix/audio-files/bird_audio.wav) | 'A photo.' | ![audio_text_to_img_output_bird_a_photo](https://paddlenlp.bj.bcebos.com/models/community/paddlemix/audio-files/audio_text_to_img_output_bird_a_photo.jpg)


#### Audio+Image to Image
@@ -93,14 +93,14 @@ python audio2img_imagebind.py \
cd applications/Audio2Img

python audio2img_imagebind.py \
- --model_name_or_path The dir name of imagebind checkpoint. \
- --stable_unclip_model_name_or_path The dir name of StableUnCLIPImg2ImgPipeline pretrained checkpoint. \
- --input_audio wave.wav \
- --input_image dog_image.jpg \
+ --model_name_or_path imagebind-1.2b/ \
+ --stable_unclip_model_name_or_path stabilityai/stable-diffusion-2-1-unclip \
+ --input_audio https://paddlenlp.bj.bcebos.com/models/community/paddlemix/audio-files/wave.wav \
+ --input_image https://paddlenlp.bj.bcebos.com/models/community/paddlemix/audio-files/dog_image.jpg \
```

#### 3.3.2 Result
| Input Audio | Input Image | Output Image |
| --- | --- | --- |
- |[wave.wav](https://github.com/luyao-cv/file_download/blob/main/assets/wave.wav) | ![input_dog_image](https://github.com/luyao-cv/file_download/blob/main/assets/dog_image.jpg) | ![audio_img_to_img_output_wave_dog](https://github.com/luyao-cv/file_download/blob/main/vis_audio2img/audio_img_to_img_output_wave_dog.jpg)
+ |[wave.wav](https://paddlenlp.bj.bcebos.com/models/community/paddlemix/audio-files/wave.wav) | ![input_dog_image](https://paddlenlp.bj.bcebos.com/models/community/paddlemix/audio-files/dog_image.jpg) | ![audio_img_to_img_output_wave_dog](https://paddlenlp.bj.bcebos.com/models/community/paddlemix/audio-files/audio_img_to_img_output_wave_dog.jpg)
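
All three invocations drive the same flow: ImageBind encodes the audio (optionally fused with text or image embeddings) into a single embedding, and StableUnCLIPImg2ImgPipeline decodes that embedding into an image. A minimal sketch of the decoding step, assuming `audio_embeds` is a paddle tensor produced by ImageBind (illustrative names, not the script's exact variables):

```python
# Hedged sketch: turn a precomputed ImageBind audio embedding into an image.
# `audio_embeds` is assumed to come from the ImageBind forward pass; the
# checkpoint name matches the invocations above.
from ppdiffusers import StableUnCLIPImg2ImgPipeline

pipe = StableUnCLIPImg2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-unclip")
image = pipe(prompt="A photo.", image_embeds=audio_embeds).images[0]
image.save("audio2img_output.jpg")
```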

29 changes: 25 additions & 4 deletions applications/Audio2Img/audio2img_imagebind.py
@@ -12,16 +12,28 @@
import numpy as np
import argparse
import requests
+ from ppdiffusers.utils import load_image
from PIL import Image
from dataclasses import dataclass, field
from paddlenlp.trainer import PdArgumentParser

from paddlemix.utils.log import logger
from paddlemix.models.imagebind.modeling import ImageBindModel
from paddlemix.models.imagebind.utils import *
+ from types import SimpleNamespace
+ # from paddlemix.models.imagebind.utils.resample import *
+ # from paddlemix.models.imagebind.utils.paddle_aux import *


+ ModalityType = SimpleNamespace(
+     VISION="vision",
+     TEXT="text",
+     AUDIO="audio",
+     THERMAL="thermal",
+     DEPTH="depth",
+     IMU="imu",
+ )

class Predictor:
    def __init__(self, model_args):
        self.processor = ImageBindProcessor.from_pretrained(model_args.model_name_or_path)
@@ -38,19 +38,28 @@ def main(model_args,data_args):

    # build model
    logger.info("imagebind_model: {}".format(model_args.model_name_or_path))

    url = (data_args.input_image)
    if os.path.isfile(url):
        # read image
        image_pil = Image.open(data_args.input_image).convert("RGB")
    elif url:
-         image_pil = Image.open(requests.get(url, stream=True).raw).convert("RGB")
+         image_pil = load_image(url)
    else:
        image_pil = None

+     url = (data_args.input_audio)
+     if os.path.isfile(url):
+         # local audio file
+         input_audio = data_args.input_audio
+     elif url:
+         # download the remote audio file next to the script
+         os.system("wget {}".format(url))
+         input_audio = os.path.basename(data_args.input_audio)
+     else:
+         input_audio = None

    predictor = Predictor(model_args)

-     encoding = predictor.processor(images=image_pil,text="", audios=data_args.input_audio, return_tensors='pd')
+     encoding = predictor.processor(images=image_pil,text="", audios=input_audio, return_tensors='pd')
    inputs = {}

    if image_pil:
@@ -66,7 +87,7 @@

    if image_pil:
        logger.info("Generate vision embedding: {}".format(embeddings[ModalityType.VISION]))
        image_proj_embeds += embeddings[ModalityType.VISION]

    if data_args.input_audio:
        logger.info("Generate audio embedding: {}".format(embeddings[ModalityType.AUDIO]))
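
The `load_image` helper used above accepts either a local file path or an http(s) URL and returns a decoded PIL image, which is why it can replace the explicit `requests.get(...)` branch for remote inputs. A small sketch, assuming the utility mirrors the diffusers helper of the same name:

```python
# Hedged sketch: load_image resolves both local paths and remote URLs.
from ppdiffusers.utils import load_image

image_pil = load_image(
    "https://paddlenlp.bj.bcebos.com/models/community/paddlemix/audio-files/dog_image.jpg"
)
print(image_pil.size)  # a PIL.Image.Image, converted to RGB (assumed)
```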
2 changes: 1 addition & 1 deletion applications/README.md
@@ -54,7 +54,7 @@ result = task(prompt=prompt)['result']
| [Image-to-Image Text-Guided Generation](./image2image/README.md/#文本引导的图像变换image-to-image-text-guided-generation) | `stable-diffusion-v1-5` | [fastdeploy](../ppdiffusers/deploy/README.md/#文本引导的图像变换image-to-image-text-guided-generation) |
| [Dual Text and Image Guided Generation](./image2image/README.md/#文本图像双引导图像生成dual-text-and-image-guided-generation) | `versatile-diffusion` ||
| [Text-to-Video Generation](./text2video/README.md/#文本条件的视频生成text-to-video-generation) | `text-to-video-ms-1.7b` ||
- | [Audio-to-Chat Generation](./Audio2Img/README.md/#audio-to-image) | `imagebind stable-diffusion-2-1-unclip` | |
+ | [Audio-to-Image Generation](./Audio2Img/README.md/#audio-to-image) | `imagebind stable-diffusion-2-1-unclip` | |
| [Audio-to-Caption Generation](./Audio2Caption/README.md/#音频描述audio-to-caption-generation) | `chatglm-6b whisper` | |
| [Audio-to-Chat Generation](./AudioChat/README.md/#音频对话audio-to-chat-generation) | `chatglm-6b whisper fastspeech2` | |
| [Music Generation](./MusicGeneration/README.md/#音乐生成music-generation) | `chatglm-6b minigpt4 audioldm` | |
2 changes: 2 additions & 0 deletions paddlemix/models/__init__.py
@@ -18,3 +18,5 @@
from .minigpt4.modeling import *
from .visualglm.configuration import *
from .visualglm.modeling import *
+ from .imagebind.modeling import *
+ from .imagebind.multimodal_modules import *
1 change: 1 addition & 0 deletions paddlemix/processors/__init__.py
@@ -23,3 +23,4 @@
from .tokenizer import SimpleTokenizer, tokenize
from .visualglm_image_processing import *
from .visualglm_processing import *
+ from .imagebind_processing import *
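
With `imagebind_processing` re-exported here, the processor that `audio2img_imagebind.py` constructs in `Predictor` can be imported straight from the package. A minimal sketch, assuming `ImageBindProcessor` is the class defined in `imagebind_processing.py` and reusing the checkpoint directory from the Audio2Img README (file name illustrative):

```python
# Hedged sketch: load the ImageBind processor via the new package-level export.
from paddlemix.processors import ImageBindProcessor

processor = ImageBindProcessor.from_pretrained("imagebind-1.2b/")
encoding = processor(text="", audios="bird_audio.wav", return_tensors="pd")
# encoding["audio_values"] holds the mel-spectrogram clips (see imagebind_processing.py)
```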
2 changes: 2 additions & 0 deletions paddlemix/processors/clip_processing.py
@@ -324,6 +324,7 @@ def __init__(
        scale: Optional[Union[List[float], Tuple[float]]] = (0.9, 1.0),
        do_collate: bool = False,
        mode: str = "train",
+         interpolation: str = "bilinear",
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
@@ -347,6 +348,7 @@ def __init__(
        self.do_rand_resize_crop = do_rand_resize_crop
        self.scale = scale
        self.do_collate = do_collate
+         self.interpolation = interpolation

    def resize(
        self,
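
The stored `interpolation` string is the resize mode for the processor's image transforms; a plausible way `resize()` consumes it (its body is not shown in this diff, so this is an assumption based on `paddle.vision.transforms`):

```python
# Hedged sketch: paddle.vision.transforms.Resize accepts an interpolation string
# such as "bilinear", "nearest", or "bicubic"; size 224 is illustrative.
import paddle.vision.transforms as T

resize_op = T.Resize((224, 224), interpolation="bilinear")
```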
119 changes: 116 additions & 3 deletions paddlemix/processors/imagebind_processing.py
@@ -22,8 +22,10 @@
import paddle
from paddle.vision.transforms import transforms as T
from paddlenlp.transformers.tokenizer_utils_base import BatchEncoding
- from paddlevideo.data.clip_sampling import ConstantClipsPerVideoSampler
+ # from paddlevideo.data.clip_sampling import ConstantClipsPerVideoSampler
from typing import Any, Dict, NamedTuple, Optional, Tuple, Union, List
+ from abc import ABC, abstractmethod
+ from fractions import Fraction
from .base_processing import ProcessorMixin
from .processing_utils import BaseAudioProcessor

@@ -62,7 +64,7 @@ def __call__(self, text=None, images=None, audios=None, return_tensors=None, **k
encoding["audio_values"] = self.audio_processor(audios, return_tensors=return_tensors, **kwargs)

if text is not None and images is not None:
encoding["pixel_values"] = image_features.pixel_values
encoding["pixel_values"] = image_features['image']
return encoding
elif text is not None:
return encoding
@@ -192,3 +194,114 @@ def waveform2melspec(self, waveform, sample_rate, num_mel_bins, target_length):

        fbank = fbank.unsqueeze(axis=0)
        return fbank



+ class ClipInfo(NamedTuple):
+     """
+     Named-tuple for clip information with:
+         clip_start_sec (Union[float, Fraction]): clip start time.
+         clip_end_sec (Union[float, Fraction]): clip end time.
+         clip_index (int): clip index in the video.
+         aug_index (int): augmentation index for the clip. Different augmentation methods
+             might generate multiple views for the same clip.
+         is_last_clip (bool): a bool specifying whether there are more clips to be
+             sampled from the video.
+     """
+
+     clip_start_sec: Union[float, Fraction]
+     clip_end_sec: Union[float, Fraction]
+     clip_index: int
+     aug_index: int
+     is_last_clip: bool


+ class ClipSampler(ABC):
+     """
+     Interface for clip samplers that take a video time and the previously sampled
+     clip time, and return a named-tuple ``ClipInfo``.
+     """
+
+     def __init__(self, clip_duration: Union[float, Fraction]) -> None:
+         self._clip_duration = Fraction(clip_duration)
+         self._current_clip_index = 0
+         self._current_aug_index = 0
+
+     @abstractmethod
+     def __call__(
+         self,
+         last_clip_time: Union[float, Fraction],
+         video_duration: Union[float, Fraction],
+         annotation: Dict[str, Any],
+     ) -> ClipInfo:
+         pass
+
+     def reset(self) -> None:
+         """Resets any video-specific attributes in preparation for the next video"""
+         pass


+ class ConstantClipsPerVideoSampler(ClipSampler):
+     """
+     Evenly splits the video into clips_per_video increments and samples clips of size
+     clip_duration at these increments.
+     """
+
+     def __init__(
+         self, clip_duration: float, clips_per_video: int, augs_per_clip: int = 1
+     ) -> None:
+         super().__init__(clip_duration)
+         self._clips_per_video = clips_per_video
+         self._augs_per_clip = augs_per_clip
+
+     def __call__(
+         self, last_clip_time: float, video_duration: float, annotation: Dict[str, Any]
+     ) -> ClipInfo:
+         """
+         Args:
+             last_clip_time (float): Not used for ConstantClipsPerVideoSampler.
+             video_duration (float): the duration (in seconds) of the video being sampled.
+             annotation (Dict): Not used by this sampler.
+         Returns:
+             a named-tuple `ClipInfo`: includes the clip information of (clip_start_time,
+             clip_end_time, clip_index, aug_index, is_last_clip). The times are in seconds.
+             is_last_clip is True after clips_per_video clips have been sampled or the end
+             of the video is reached.
+         """
+         max_possible_clip_start = Fraction(max(video_duration - self._clip_duration, 0))
+         uniform_clip = Fraction(max_possible_clip_start, self._clips_per_video)
+         clip_start_sec = uniform_clip * self._current_clip_index
+         clip_index = self._current_clip_index
+         aug_index = self._current_aug_index
+
+         self._current_aug_index += 1
+         if self._current_aug_index >= self._augs_per_clip:
+             self._current_clip_index += 1
+             self._current_aug_index = 0
+
+         # is_last_clip is True once self._clips_per_video clips have been sampled
+         # or the end of the video is reached.
+         is_last_clip = False
+         if (
+             self._current_clip_index >= self._clips_per_video
+             or uniform_clip * self._current_clip_index > max_possible_clip_start
+         ):
+             self._current_clip_index = 0
+             is_last_clip = True
+
+         if is_last_clip:
+             self.reset()
+
+         return ClipInfo(
+             clip_start_sec,
+             clip_start_sec + self._clip_duration,
+             clip_index,
+             aug_index,
+             is_last_clip,
+         )
+
+     def reset(self):
+         self._current_clip_index = 0
+         self._current_aug_index = 0
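
Since the sampler is now vendored here instead of imported from paddlevideo, a short usage sketch grounded in the code above: evenly sample three 2-second clips from a 10-second track (values illustrative).

```python
# Usage sketch for ConstantClipsPerVideoSampler as defined above.
sampler = ConstantClipsPerVideoSampler(clip_duration=2.0, clips_per_video=3)

is_last_clip = False
while not is_last_clip:
    clip = sampler(last_clip_time=0.0, video_duration=10.0, annotation={})
    print(float(clip.clip_start_sec), float(clip.clip_end_sec), clip.clip_index)
    is_last_clip = clip.is_last_clip
# Prints windows (0.0, 2.0), (~2.67, ~4.67), (~5.33, ~7.33) with clip indices 0, 1, 2.
```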
