Commit

Corrections to the transformers explanation, the ViTs explanation, and the start of the multimodal explanation
pabloggarc committed May 8, 2024
1 parent 7f58f5e commit c91be55
Showing 9 changed files with 156 additions and 21 deletions.
Binary file added Memoria/img/attention.pdf
Binary file not shown.
Binary file added Memoria/img/imagen-texto.pdf
Binary file not shown.
Binary file added Memoria/img/imagen_gato.jpg
Binary file added Memoria/img/imagen_texto_gato.png
Binary file added Memoria/img/variantes_crossed.png
Binary file added Memoria/img/vit.pdf
Binary file not shown.
54 changes: 54 additions & 0 deletions Memoria/library.bib
@@ -283,3 +283,57 @@ @article{normalization
url = {https://arxiv.org/abs/1607.06450v1},
year = {2016},
}
@article{vit,
abstract = {While the Transformer architecture has become the de-facto standard for natural language processing tasks, its applications to computer vision remain limited. In vision, attention is either applied in conjunction with convolutional networks, or used to replace certain components of convolutional networks while keeping their overall structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.), Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring substantially fewer computational resources to train.},
author = {Alexey Dosovitskiy and Lucas Beyer and Alexander Kolesnikov and Dirk Weissenborn and Xiaohua Zhai and Thomas Unterthiner and Mostafa Dehghani and Matthias Minderer and Georg Heigold and Sylvain Gelly and Jakob Uszkoreit and Neil Houlsby},
journal = {ICLR 2021 - 9th International Conference on Learning Representations},
month = {10},
publisher = {International Conference on Learning Representations, ICLR},
title = {An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale},
url = {https://arxiv.org/abs/2010.11929v2},
year = {2020},
}
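As a quick illustration of the idea in the ViT abstract above (treating an image as a sequence of flattened 16x16 patches), here is a minimal NumPy sketch of the patch-tokenisation step; the image size, function name and shapes are illustrative assumptions, not code from the cited paper:

```python
import numpy as np

def image_to_patch_tokens(image: np.ndarray, patch_size: int = 16) -> np.ndarray:
    """Split an (H, W, C) image into non-overlapping patches and flatten each
    patch into a vector; the resulting sequence is what a ViT encoder consumes
    (before the learned linear projection and position embeddings)."""
    h, w, c = image.shape
    assert h % patch_size == 0 and w % patch_size == 0, "sketch assumes sizes divide evenly"
    patches = (
        image.reshape(h // patch_size, patch_size, w // patch_size, patch_size, c)
        .transpose(0, 2, 1, 3, 4)  # bring the two patch-grid axes together
        .reshape(-1, patch_size * patch_size * c)
    )
    return patches  # shape: (num_patches, patch_size**2 * C)

tokens = image_to_patch_tokens(np.zeros((224, 224, 3)))
print(tokens.shape)  # (196, 768): a 14x14 grid of patches, each 16*16*3 values
```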
@article{multimodal_dl,
abstract = {Deep networks have been successfully applied to unsupervised feature learning for single modalities (e.g., text, images or audio). In this work, we propose a novel application of deep networks to learn features over multiple modalities. We present a series of tasks for multimodal learning and show how to train deep networks that learn features to address these tasks. In particular, we demonstrate cross modality feature learning, where better features for one modality (e.g., video) can be learned if multiple modalities (e.g., audio and video) are present at feature learning time. Furthermore, we show how to learn a shared representation between modalities and evaluate it on a unique task, where the classifier is trained with audio-only data but tested with video-only data and vice-versa. Our models are validated on the CUAVE and AVLetters datasets on audiovisual speech classification, demonstrating best published visual speech classification on AVLetters and effective shared representation learning.},
author = {Jiquan Ngiam and Aditya Khosla and Mingyu Kim and Juhan Nam and Honglak Lee and Andrew Y Ng},
title = {Multimodal Deep Learning},
journal = {Proceedings of the 28th International Conference on Machine Learning (ICML)},
year = {2011},
}
@article{multimodal_transformers,
abstract = {Transformer is a promising neural network learner, and has achieved great success in various machine learning tasks. Thanks to the recent prevalence of multimodal applications and Big Data, Transformer-based multimodal learning has become a hot topic in AI research. This paper presents a comprehensive survey of Transformer techniques oriented at multimodal data. The main contents of this survey include: (1) a background of multimodal learning, Transformer ecosystem, and the multimodal Big Data era, (2) a systematic review of Vanilla Transformer, Vision Transformer, and multimodal Transformers, from a geometrically topological perspective, (3) a review of multimodal Transformer applications, via two important paradigms, i.e., for multimodal pretraining and for specific multimodal tasks, (4) a summary of the common challenges and designs shared by the multimodal Transformer models and applications, and (5) a discussion of open problems and potential research directions for the community.},
author = {Peng Xu and Xiatian Zhu and David A. Clifton},
doi = {10.1109/TPAMI.2023.3275156},
issn = {19393539},
issue = {10},
journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
keywords = {Multimodal learning,deep learning,introductory,machine learning,taxonomy,transformer},
month = {10},
pages = {12113-12132},
pmid = {37167049},
publisher = {IEEE Computer Society},
title = {Multimodal Learning With Transformers: A Survey},
volume = {45},
year = {2023},
}
@article{coseno,
abstract = {Cosine similarity is a widely implemented metric in information retrieval and related studies. This metric models a text as a vector of terms and the similarity between two texts is derived from cosine value between two texts' term vectors. Cosine similarity however still can't handle the semantic meaning of the text perfectly. This paper proposes an enhancement of cosine similarity measurement by incorporating semantic checking between dimensions of two term vectors. This strategy aims to increase the similarity value between two term vectors which contain semantic relation between their dimensions with different syntax. Experimental result shows our proposal yields a promising result.},
author = {Faisal Rahutomo and Teruaki Kitasuka and Masayoshi Aritsugi},
keywords = {cosine similarity, WordNet, semantic},
title = {Semantic Cosine Similarity},
url = {https://www.researchgate.net/publication/262525676},
year = {2012},
}
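The cosine similarity this entry builds on is just a dot product of length-normalised term vectors; a small self-contained sketch with made-up toy vectors:

```python
import numpy as np

def cosine_similarity(u: np.ndarray, v: np.ndarray) -> float:
    """Cosine of the angle between two term vectors."""
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

# Toy term-frequency vectors over a hypothetical 4-word vocabulary.
doc_a = np.array([1.0, 2.0, 0.0, 1.0])
doc_b = np.array([0.0, 1.0, 1.0, 1.0])
print(cosine_similarity(doc_a, doc_b))  # ~0.71
```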
@article{clip,
abstract = {State-of-the-art computer vision systems are trained to predict a fixed set of predetermined object categories. This restricted form of supervision limits their generality and usability since additional labeled data is needed to specify any other visual concept. Learning directly from raw text about images is a promising alternative which leverages a much broader source of supervision. We demonstrate that the simple pre-training task of predicting which caption goes with which image is an efficient and scalable way to learn SOTA image representations from scratch on a dataset of 400 million (image, text) pairs collected from the internet. After pre-training, natural language is used to reference learned visual concepts (or describe new ones) enabling zero-shot transfer of the model to downstream tasks. We study the performance of this approach by benchmarking on over 30 different existing computer vision datasets, spanning tasks such as OCR, action recognition in videos, geo-localization, and many types of fine-grained object classification. The model transfers non-trivially to most tasks and is often competitive with a fully supervised baseline without the need for any dataset specific training. For instance, we match the accuracy of the original ResNet-50 on ImageNet zero-shot without needing to use any of the 1.28 million training examples it was trained on. We release our code and pre-trained model weights at https://github.com/OpenAI/CLIP.},
author = {Alec Radford and Jong Wook Kim and Chris Hallacy and Aditya Ramesh and Gabriel Goh and Sandhini Agarwal and Girish Sastry and Amanda Askell and Pamela Mishkin and Jack Clark and Gretchen Krueger and Ilya Sutskever},
isbn = {9781713845065},
issn = {26403498},
journal = {Proceedings of Machine Learning Research},
month = {2},
pages = {8748-8763},
publisher = {ML Research Press},
title = {Learning Transferable Visual Models From Natural Language Supervision},
volume = {139},
url = {https://arxiv.org/abs/2103.00020v1},
year = {2021},
}
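The zero-shot transfer described in the CLIP abstract reduces to comparing a normalised image embedding against the embeddings of candidate captions; the rough sketch below uses random placeholder vectors in place of the real encoders (the embedding dimension and caption set are assumptions):

```python
import numpy as np

rng = np.random.default_rng(0)

# Placeholder embeddings: in the real model these come from CLIP's image and
# text encoders; here they are random vectors of an assumed dimension (512).
image_emb = rng.normal(size=512)
text_embs = rng.normal(size=(3, 512))  # e.g. "a photo of a cat / dog / car"

def l2_normalize(x: np.ndarray) -> np.ndarray:
    return x / np.linalg.norm(x, axis=-1, keepdims=True)

# Zero-shot classification: cosine similarity between the image and each
# candidate caption, turned into a distribution with a softmax
# (the learned temperature is omitted in this sketch).
logits = l2_normalize(text_embs) @ l2_normalize(image_emb)
probs = np.exp(logits) / np.exp(logits).sum()
print(probs.argmax())  # index of the best-matching caption
```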