diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml index e9e17cd6c0d2ba..df99e7e9269f0c 100644 --- a/.github/workflows/self-push.yml +++ b/.github/workflows/self-push.yml @@ -4,10 +4,12 @@ on: push: branches: - master + - model-templates paths: - "src/**" - "tests/**" - ".github/**" + - "templates/**" # pull_request: repository_dispatch: @@ -55,6 +57,14 @@ jobs: python -c "import torch; print('Cuda available:', torch.cuda.is_available())" python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())" + - name: Create model files + run: | + source .env/bin/activate + transformers-cli add-new-model --testing --testing_file=templates/cookiecutter/tests/encoder-bert-tokenizer.json --path=templates/cookiecutter + transformers-cli add-new-model --testing --testing_file=templates/cookiecutter/tests/pt-encoder-bert-tokenizer.json --path=templates/cookiecutter + transformers-cli add-new-model --testing --testing_file=templates/cookiecutter/tests/standalone.json --path=templates/cookiecutter + transformers-cli add-new-model --testing --testing_file=templates/cookiecutter/tests/tf-encoder-bert-tokenizer.json --path=templates/cookiecutter + - name: Run all non-slow tests on GPU env: OMP_NUM_THREADS: 1 @@ -116,6 +126,14 @@ jobs: TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))" TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))" + - name: Create model files + run: | + source .env/bin/activate + transformers-cli add-new-model --testing --testing_file=templates/cookiecutter/tests/encoder-bert-tokenizer.json --path=templates/cookiecutter + transformers-cli add-new-model --testing --testing_file=templates/cookiecutter/tests/pt-encoder-bert-tokenizer.json --path=templates/cookiecutter + transformers-cli add-new-model --testing --testing_file=templates/cookiecutter/tests/standalone.json --path=templates/cookiecutter + transformers-cli add-new-model --testing --testing_file=templates/cookiecutter/tests/tf-encoder-bert-tokenizer.json --path=templates/cookiecutter + - name: Run all non-slow tests on GPU env: OMP_NUM_THREADS: 1 diff --git a/setup.py b/setup.py index 4eeffff6c055ac..f77766dd71bfd4 100644 --- a/setup.py +++ b/setup.py @@ -98,12 +98,13 @@ extras["tokenizers"] = ["tokenizers==0.9.2"] extras["onnxruntime"] = ["onnxruntime>=1.4.0", "onnxruntime-tools>=1.4.2"] +extras["modelcreation"] = ["cookiecutter==1.7.2"] extras["serving"] = ["pydantic", "uvicorn", "fastapi", "starlette"] extras["sentencepiece"] = ["sentencepiece==0.1.91"] extras["retrieval"] = ["faiss-cpu", "datasets"] -extras["testing"] = ["pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil"] + extras["retrieval"] +extras["testing"] = ["pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil"] + extras["retrieval"] + extras["modelcreation"] # sphinx-rtd-theme==0.5.0 introduced big changes in the style. 
extras["docs"] = ["recommonmark", "sphinx==3.2.1", "sphinx-markdown-tables", "sphinx-rtd-theme==0.4.3", "sphinx-copybutton"] extras["quality"] = ["black >= 20.8b1", "isort >= 5.5.4", "flake8 >= 3.8.3"] @@ -111,7 +112,7 @@ extras["all"] = extras["tf"] + extras["torch"] + extras["flax"] + extras["sentencepiece"] + extras["tokenizers"] -extras["dev"] = extras["all"] + extras["testing"] + extras["quality"] + extras["ja"] + extras["docs"] + extras["sklearn"] +extras["dev"] = extras["all"] + extras["testing"] + extras["quality"] + extras["ja"] + extras["docs"] + extras["sklearn"] + extras["modelcreation"] setup( diff --git a/src/transformers/commands/add_new_model.py b/src/transformers/commands/add_new_model.py new file mode 100644 index 00000000000000..6cab10de0e5ab6 --- /dev/null +++ b/src/transformers/commands/add_new_model.py @@ -0,0 +1,192 @@ +import json +import os +import shutil +from argparse import ArgumentParser, Namespace +from pathlib import Path +from typing import List + +from cookiecutter.main import cookiecutter +from transformers.commands import BaseTransformersCLICommand + +from ..utils import logging + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def add_new_model_command_factory(args: Namespace): + return AddNewModelCommand(args.testing, args.testing_file, path=args.path) + + +class AddNewModelCommand(BaseTransformersCLICommand): + @staticmethod + def register_subcommand(parser: ArgumentParser): + add_new_model_parser = parser.add_parser("add-new-model") + add_new_model_parser.add_argument("--testing", action="store_true", help="If in testing mode.") + add_new_model_parser.add_argument("--testing_file", type=str, help="Configuration file on which to run.") + add_new_model_parser.add_argument( + "--path", type=str, help="Path to cookiecutter. Should only be used for testing purposes." + ) + add_new_model_parser.set_defaults(func=add_new_model_command_factory) + + def __init__(self, testing: bool, testing_file: str, path=None, *args): + self._testing = testing + self._testing_file = testing_file + self._path = path + + def run(self): + # Ensure that there is no other `cookiecutter-template-xxx` directory in the current working directory + directories = [directory for directory in os.listdir() if "cookiecutter-template-" == directory[:22]] + if len(directories) > 0: + raise ValueError( + "Several directories starting with `cookiecutter-template-` in current working directory. " + "Please clean your directory by removing all folders startign with `cookiecutter-template-` or " + "change your working directory." 
+ ) + + path_to_transformer_root = ( + Path(__file__).parent.parent.parent.parent if self._path is None else Path(self._path).parent.parent + ) + path_to_cookiecutter = path_to_transformer_root / "templates" / "cookiecutter" + + # Execute cookiecutter + if not self._testing: + cookiecutter(str(path_to_cookiecutter)) + else: + with open(self._testing_file, "r") as configuration_file: + testing_configuration = json.load(configuration_file) + + cookiecutter( + str(path_to_cookiecutter if self._path is None else self._path), + no_input=True, + extra_context=testing_configuration, + ) + + directory = [directory for directory in os.listdir() if "cookiecutter-template-" in directory[:22]][0] + + # Retrieve configuration + with open(directory + "/configuration.json", "r") as configuration_file: + configuration = json.load(configuration_file) + + lowercase_model_name = configuration["lowercase_modelname"] + pytorch_or_tensorflow = configuration["generate_tensorflow_and_pytorch"] + os.remove(f"{directory}/configuration.json") + + output_pytorch = "PyTorch" in pytorch_or_tensorflow + output_tensorflow = "TensorFlow" in pytorch_or_tensorflow + + shutil.move( + f"{directory}/configuration_{lowercase_model_name}.py", + f"{path_to_transformer_root}/src/transformers/configuration_{lowercase_model_name}.py", + ) + + def remove_copy_lines(path): + with open(path, "r") as f: + lines = f.readlines() + with open(path, "w") as f: + for line in lines: + if "# Copied from transformers." not in line: + f.write(line) + + if output_pytorch: + if not self._testing: + remove_copy_lines(f"{directory}/modeling_{lowercase_model_name}.py") + + shutil.move( + f"{directory}/modeling_{lowercase_model_name}.py", + f"{path_to_transformer_root}/src/transformers/modeling_{lowercase_model_name}.py", + ) + + shutil.move( + f"{directory}/test_modeling_{lowercase_model_name}.py", + f"{path_to_transformer_root}/tests/test_modeling_{lowercase_model_name}.py", + ) + else: + os.remove(f"{directory}/modeling_{lowercase_model_name}.py") + os.remove(f"{directory}/test_modeling_{lowercase_model_name}.py") + + if output_tensorflow: + if not self._testing: + remove_copy_lines(f"{directory}/modeling_tf_{lowercase_model_name}.py") + + shutil.move( + f"{directory}/modeling_tf_{lowercase_model_name}.py", + f"{path_to_transformer_root}/src/transformers/modeling_tf_{lowercase_model_name}.py", + ) + + shutil.move( + f"{directory}/test_modeling_tf_{lowercase_model_name}.py", + f"{path_to_transformer_root}/tests/test_modeling_tf_{lowercase_model_name}.py", + ) + else: + os.remove(f"{directory}/modeling_tf_{lowercase_model_name}.py") + os.remove(f"{directory}/test_modeling_tf_{lowercase_model_name}.py") + + shutil.move( + f"{directory}/{lowercase_model_name}.rst", + f"{path_to_transformer_root}/docs/source/model_doc/{lowercase_model_name}.rst", + ) + + shutil.move( + f"{directory}/tokenization_{lowercase_model_name}.py", + f"{path_to_transformer_root}/src/transformers/tokenization_{lowercase_model_name}.py", + ) + + from os import fdopen, remove + from shutil import copymode, move + from tempfile import mkstemp + + def replace(original_file: str, line_to_copy_below: str, lines_to_copy: List[str]): + # Create temp file + fh, abs_path = mkstemp() + line_found = False + with fdopen(fh, "w") as new_file: + with open(original_file) as old_file: + for line in old_file: + new_file.write(line) + if line_to_copy_below in line: + line_found = True + for line_to_copy in lines_to_copy: + new_file.write(line_to_copy) + + if not line_found: + raise ValueError(f"Line 
{line_to_copy_below} was not found in file.") + + # Copy the file permissions from the old file to the new file + copymode(original_file, abs_path) + # Remove original file + remove(original_file) + # Move new file + move(abs_path, original_file) + + def skip_units(line): + return ("generating PyTorch" in line and not output_pytorch) or ( + "generating TensorFlow" in line and not output_tensorflow + ) + + def replace_in_files(path_to_datafile): + with open(path_to_datafile) as datafile: + lines_to_copy = [] + skip_file = False + skip_snippet = False + for line in datafile: + if "# To replace in: " in line and "##" not in line: + file_to_replace_in = line.split('"')[1] + skip_file = skip_units(line) + elif "# Below: " in line and "##" not in line: + line_to_copy_below = line.split('"')[1] + skip_snippet = skip_units(line) + elif "# End." in line and "##" not in line: + if not skip_file and not skip_snippet: + replace(file_to_replace_in, line_to_copy_below, lines_to_copy) + + lines_to_copy = [] + elif "# Replace with" in line and "##" not in line: + lines_to_copy = [] + elif "##" not in line: + lines_to_copy.append(line) + + remove(path_to_datafile) + + replace_in_files(f"{directory}/to_replace_{lowercase_model_name}.py") + os.rmdir(directory) diff --git a/src/transformers/commands/transformers_cli.py b/src/transformers/commands/transformers_cli.py index ecc2ce96d97587..eaa2bcaa229ed6 100644 --- a/src/transformers/commands/transformers_cli.py +++ b/src/transformers/commands/transformers_cli.py @@ -1,6 +1,7 @@ #!/usr/bin/env python from argparse import ArgumentParser +from transformers.commands.add_new_model import AddNewModelCommand from transformers.commands.convert import ConvertCommand from transformers.commands.download import DownloadCommand from transformers.commands.env import EnvironmentCommand @@ -20,6 +21,7 @@ def main(): RunCommand.register_subcommand(commands_parser) ServeCommand.register_subcommand(commands_parser) UserCommands.register_subcommand(commands_parser) + AddNewModelCommand.register_subcommand(commands_parser) # Let's go args = parser.parse_args() diff --git a/src/transformers/configuration_auto.py b/src/transformers/configuration_auto.py index 4689f38e72c432..41f9a1db82c0c9 100644 --- a/src/transformers/configuration_auto.py +++ b/src/transformers/configuration_auto.py @@ -59,6 +59,7 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict( (key, value) for pretrained_map in [ + # Add archive maps here BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BART_PRETRAINED_CONFIG_ARCHIVE_MAP, BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -95,6 +96,7 @@ CONFIG_MAPPING = OrderedDict( [ + # Add configs here ("retribert", RetriBertConfig), ("t5", T5Config), ("mobilebert", MobileBertConfig), @@ -136,6 +138,7 @@ MODEL_NAMES_MAPPING = OrderedDict( [ + # Add full (and cased) model names here ("retribert", "RetriBERT"), ("t5", "T5"), ("mobilebert", "MobileBERT"), diff --git a/src/transformers/modeling_auto.py b/src/transformers/modeling_auto.py index 57f403ac6a2ad8..18ec83ff2a91f4 100644 --- a/src/transformers/modeling_auto.py +++ b/src/transformers/modeling_auto.py @@ -226,11 +226,14 @@ from .utils import logging +# Add modeling imports here + logger = logging.get_logger(__name__) MODEL_MAPPING = OrderedDict( [ + # Base model mapping (RetriBertConfig, RetriBertModel), (T5Config, T5Model), (DistilBertConfig, DistilBertModel), @@ -266,6 +269,7 @@ MODEL_FOR_PRETRAINING_MAPPING = OrderedDict( [ + # Model for pre-training mapping (LayoutLMConfig, LayoutLMForMaskedLM), (RetriBertConfig, RetriBertModel), (T5Config, 
T5ForConditionalGeneration), @@ -295,6 +299,7 @@ MODEL_WITH_LM_HEAD_MAPPING = OrderedDict( [ + # Model with LM heads mapping (LayoutLMConfig, LayoutLMForMaskedLM), (T5Config, T5ForConditionalGeneration), (DistilBertConfig, DistilBertForMaskedLM), @@ -325,6 +330,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING = OrderedDict( [ + # Model for Causal LM mapping (CamembertConfig, CamembertForCausalLM), (XLMRobertaConfig, XLMRobertaForCausalLM), (RobertaConfig, RobertaForCausalLM), @@ -347,6 +353,7 @@ MODEL_FOR_MASKED_LM_MAPPING = OrderedDict( [ + # Model for Masked LM mapping (LayoutLMConfig, LayoutLMForMaskedLM), (DistilBertConfig, DistilBertForMaskedLM), (AlbertConfig, AlbertForMaskedLM), @@ -368,6 +375,7 @@ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = OrderedDict( [ + # Model for Seq2Seq Causal LM mapping (T5Config, T5ForConditionalGeneration), (PegasusConfig, PegasusForConditionalGeneration), (MarianConfig, MarianMTModel), @@ -383,6 +391,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict( [ + # Model for Sequence Classification mapping (DistilBertConfig, DistilBertForSequenceClassification), (AlbertConfig, AlbertForSequenceClassification), (CamembertConfig, CamembertForSequenceClassification), @@ -407,6 +416,7 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict( [ + # Model for Question Answering mapping (DistilBertConfig, DistilBertForQuestionAnswering), (AlbertConfig, AlbertForQuestionAnswering), (CamembertConfig, CamembertForQuestionAnswering), @@ -429,6 +439,7 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict( [ + # Model for Token Classification mapping (LayoutLMConfig, LayoutLMForTokenClassification), (DistilBertConfig, DistilBertForTokenClassification), (CamembertConfig, CamembertForTokenClassification), @@ -450,6 +461,7 @@ MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict( [ + # Model for Multiple Choice mapping (CamembertConfig, CamembertForMultipleChoice), (ElectraConfig, ElectraForMultipleChoice), (XLMRobertaConfig, XLMRobertaForMultipleChoice), diff --git a/src/transformers/modeling_tf_auto.py b/src/transformers/modeling_tf_auto.py index 5d16423a9e7416..12898daf0bf3f8 100644 --- a/src/transformers/modeling_tf_auto.py +++ b/src/transformers/modeling_tf_auto.py @@ -169,11 +169,14 @@ from .utils import logging +# Add modeling imports here + logger = logging.get_logger(__name__) TF_MODEL_MAPPING = OrderedDict( [ + # Base model mapping (LxmertConfig, TFLxmertModel), (T5Config, TFT5Model), (DistilBertConfig, TFDistilBertModel), @@ -200,6 +203,7 @@ TF_MODEL_FOR_PRETRAINING_MAPPING = OrderedDict( [ + # Model for pre-training mapping (LxmertConfig, TFLxmertForPreTraining), (T5Config, TFT5ForConditionalGeneration), (DistilBertConfig, TFDistilBertForMaskedLM), @@ -224,6 +228,7 @@ TF_MODEL_WITH_LM_HEAD_MAPPING = OrderedDict( [ + # Model with LM heads mapping (T5Config, TFT5ForConditionalGeneration), (DistilBertConfig, TFDistilBertForMaskedLM), (AlbertConfig, TFAlbertForMaskedLM), @@ -249,6 +254,7 @@ TF_MODEL_FOR_CAUSAL_LM_MAPPING = OrderedDict( [ + # Model for Causal LM mapping (BertConfig, TFBertLMHeadModel), (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel), (GPT2Config, TFGPT2LMHeadModel), @@ -264,6 +270,7 @@ TF_MODEL_FOR_MASKED_LM_MAPPING = OrderedDict( [ + # Model for Masked LM mapping (DistilBertConfig, TFDistilBertForMaskedLM), (AlbertConfig, TFAlbertForMaskedLM), (CamembertConfig, TFCamembertForMaskedLM), @@ -282,6 +289,7 @@ TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = OrderedDict( [ + # Model for Seq2Seq Causal LM mapping (T5Config, TFT5ForConditionalGeneration), (MarianConfig, 
TFMarianMTModel), (MBartConfig, TFMBartForConditionalGeneration), @@ -293,6 +301,7 @@ TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict( [ + # Model for Sequence Classification mapping (DistilBertConfig, TFDistilBertForSequenceClassification), (AlbertConfig, TFAlbertForSequenceClassification), (CamembertConfig, TFCamembertForSequenceClassification), @@ -310,6 +319,7 @@ TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict( [ + # Model for Question Answering mapping (DistilBertConfig, TFDistilBertForQuestionAnswering), (AlbertConfig, TFAlbertForQuestionAnswering), (CamembertConfig, TFCamembertForQuestionAnswering), @@ -328,6 +338,7 @@ TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict( [ + # Model for Token Classification mapping (DistilBertConfig, TFDistilBertForTokenClassification), (AlbertConfig, TFAlbertForTokenClassification), (CamembertConfig, TFCamembertForTokenClassification), @@ -345,6 +356,7 @@ TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict( [ + # Model for Multiple Choice mapping (CamembertConfig, TFCamembertForMultipleChoice), (XLMConfig, TFXLMForMultipleChoice), (XLMRobertaConfig, TFXLMRobertaForMultipleChoice), diff --git a/templates/adding_a_new_model/README.md b/templates/adding_a_new_model/README.md index 862c7f42ce48ac..93c5950e35668a 100644 --- a/templates/adding_a_new_model/README.md +++ b/templates/adding_a_new_model/README.md @@ -1,89 +1,104 @@ -# How to add a new model in 🤗 Transformers - -This folder describes the process to add a new model in 🤗 Transformers and provide templates for the required files. - -The library is designed to incorporate a variety of models and code bases. As such the process for adding a new model -usually mostly consists in copy-pasting to relevant original code in the various sections of the templates included in -the present repository. - -One important point though is that the library has the following goals impacting the way models are incorporated: - -- One specific feature of the API is the capability to run the model and tokenizer inline. The tokenization code thus - often have to be slightly adapted to allow for running in the python interpreter. -- the package is also designed to be as self-consistent and with a small and reliable set of packages dependencies. In - consequence, additional dependencies are usually not allowed when adding a model but can be allowed for the - inclusion of a new tokenizer (recent examples of dependencies added for tokenizer specificities include - `sentencepiece` and `sacremoses`). Please make sure to check the existing dependencies when possible before adding a - new one. - -For a quick overview of the general philosphy of the library and its organization, please check the -[QuickStart section of the documentation](https://huggingface.co/transformers/philosophy.html). - -# Typical workflow for including a model - -Here an overview of the general workflow: - -- [ ] Add model/configuration/tokenization classes. -- [ ] Add conversion scripts. -- [ ] Add tests and a @slow integration test. -- [ ] Document your model. -- [ ] Finalize. - -Let's detail what should be done at each step. - -## Adding model/configuration/tokenization classes - -Here is the workflow for adding model/configuration/tokenization classes: - -- [ ] Copy the python files from the present folder to the main folder and rename them, replacing `xxx` with your model - name. -- [ ] Edit the files to replace `XXX` (with various casing) with your model name. 
-- [ ] Copy-paste or create a simple configuration class for your model in the `configuration_...` file. -- [ ] Copy-paste or create the code for your model in the `modeling_...` files (PyTorch and TF 2.0). -- [ ] Copy-paste or create a tokenizer class for your model in the `tokenization_...` file. - -## Adding conversion scripts - -Here is the workflow for the conversion scripts: - -- [ ] Copy the conversion script (`convert_...`) from the present folder to the main folder. -- [ ] Edit this script to convert your original checkpoint weights to the current pytorch ones. - -## Adding tests: - -Here is the workflow for the adding tests: - -- [ ] Copy the python files from the `tests` sub-folder of the present folder to the `tests` subfolder of the main - folder and rename them, replacing `xxx` with your model name. -- [ ] Edit the tests files to replace `XXX` (with various casing) with your model name. -- [ ] Edit the tests code as needed. - -## Documenting your model: - -Here is the workflow for documentation: - -- [ ] Make sure all your arguments are properly documented in your configuration and tokenizer. -- [ ] Most of the documentation of the models is automatically generated, you just have to make sure that - `XXX_START_DOCSTRING` contains an introduction to the model you're adding and a link to the original - article and that `XXX_INPUTS_DOCSTRING` contains all the inputs of your model. -- [ ] Create a new page `xxx.rst` in the folder `docs/source/model_doc` and add this file in `docs/source/index.rst`. - -Make sure to check you have no sphinx warnings when building the documentation locally and follow our -[documentation guide](https://github.com/huggingface/transformers/tree/master/docs#writing-documentation---specification). - -## Final steps - -You can then finish the addition step by adding imports for your classes in the common files: - -- [ ] Add import for all the relevant classes in `__init__.py`. -- [ ] Add your configuration in `configuration_auto.py`. -- [ ] Add your PyTorch and TF 2.0 model respectively in `modeling_auto.py` and `modeling_tf_auto.py`. -- [ ] Add your tokenizer in `tokenization_auto.py`. -- [ ] Add a link to your conversion script in the main conversion utility (in `commands/convert.py`) -- [ ] Edit the PyTorch to TF 2.0 conversion script to add your model in the `convert_pytorch_checkpoint_to_tf2.py` - file. -- [ ] Add a mention of your model in the doc: `README.md` and the documentation itself - in `docs/source/pretrained_models.rst`. Rune `make fix-copies` to update `docs/source/index.rst` with your changes. -- [ ] Upload the pretrained weights, configurations and vocabulary files. -- [ ] Create model card(s) for your models on huggingface.co. For those last two steps, check the - [model sharing documentation](https://huggingface.co/transformers/model_sharing.html). +# Using `cookiecutter` to generate models + +This folder contains templates to generate new models that fit the current API and pass all tests. It generates +models in both PyTorch and TensorFlow, completes the `__init__.py` and auto-modeling files, and creates the +documentation. + +## Usage + +Using the `cookiecutter` utility requires having all the `dev` dependencies installed.
Let's first clone the +repository and install it in our environment: + +```shell script +git clone https://github.com/huggingface/transformers +cd transformers +pip install -e ".[dev]" +``` + +Once the installation is done, you can use the CLI command `add-new-model` to generate your models: + +```shell script +transformers-cli add-new-model +``` + +This should launch the `cookiecutter` package, which will prompt you to fill in the configuration. + +The `modelname` should be cased according to the plain text casing, i.e., BERT, RoBERTa, DeBERTa. +``` +modelname [<ModelNAME>]: +uppercase_modelname [<MODEL_NAME>]: +lowercase_modelname [<model_name>]: +camelcase_modelname [<ModelName>]: +``` + +Fill in the `authors` with your team members: +``` +authors [The HuggingFace Team]: +``` + +The checkpoint identifier is the checkpoint that will be used in the examples across the files. Put the name you wish, +as it will appear on the modelhub. Do not forget to include the organisation. +``` +checkpoint_identifier [organisation/<model_name>-base-cased]: +``` + +The tokenizer should either be based on BERT if it behaves exactly like the BERT tokenizer, or a standalone otherwise. +``` +Select tokenizer_type: +1 - Based on BERT +2 - Standalone +Choose from 1, 2 [1]: +``` + + +Once the command has finished, you should have a total of 7 new files spread across the repository: +``` +docs/source/model_doc/<model_name>.rst +src/transformers/configuration_<model_name>.py +src/transformers/modeling_<model_name>.py +src/transformers/modeling_tf_<model_name>.py +src/transformers/tokenization_<model_name>.py +tests/test_modeling_<model_name>.py +tests/test_modeling_tf_<model_name>.py +``` + +You can run the tests to ensure that they all pass: + +``` +python -m pytest ./tests/test_*<model_name>*.py +``` + +Feel free to modify each file to mimic the behavior of your model. + +⚠️ You should be careful about the classes preceded by the following line: + +```python +# Copied from transformers.[...] +``` + +This line ensures that the copy does not diverge from the source. If it *should* diverge, because the implementation +is different, this line needs to be deleted. If you don't delete this line and run `make fix-copies`, +your changes will be overwritten. + +Once you have edited the files to fit your architecture, re-run the tests (and edit them if a change +is needed!) to make sure everything works as expected. + +Once the files are generated and you are happy with your changes, here's a checklist to ensure that your contribution +will be merged quickly: + +- You should run the `make fixup` utility to fix the style of the files and to ensure the code quality meets the + library's standards. +- You should complete the documentation file (`docs/source/model_doc/<model_name>.rst`) so that your model may be + usable. \ No newline at end of file diff --git a/templates/adding_a_new_model/configuration_xxx.py b/templates/adding_a_new_model/configuration_xxx.py deleted file mode 100644 index e11c638700acd0..00000000000000 --- a/templates/adding_a_new_model/configuration_xxx.py +++ /dev/null @@ -1,108 +0,0 @@ -# coding=utf-8 -# Copyright 2010, XXX authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. -""" XXX model configuration """ - - -import logging -from typing import Callable, Union - -from .configuration_utils import PretrainedConfig - - -logger = logging.getLogger(__name__) - -XXX_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "xxx-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-config.json", - "xxx-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-config.json", -} - - -class XxxConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a :class:`~transformers.XxxModel` or a - :class:`~transformers.TFXxxModel`. It is used to instantiate a XXX model according to the specified - arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar - configuration to that of the XXX `xxx-base-uncased `__ architecture. - - Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used - to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` - for more information. - - - Args: - vocab_size (:obj:`int`, `optional`, defaults to 30522): - Vocabulary size of the XXX model. Defines the number of different tokens that can be represented by the - :obj:`inputs_ids` passed when calling :class:`~transformers.XxxModel` or - :class:`~transformers.TFXxxModel`. - hidden_size (:obj:`int`, `optional`, defaults to 768): - Dimensionality of the encoder layers and the pooler layer. - num_hidden_layers (:obj:`int`, `optional`, defaults to 12): - Number of hidden layers in the Transformer encoder. - num_attention_heads (:obj:`int`, `optional`, defaults to 12): - Number of attention heads for each attention layer in the Transformer encoder. - hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`): - The non-linear activation function (function or string) in the encoder and pooler. - - If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported. - hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): - The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): - The dropout ratio for the attention probabilities. - max_position_embeddings (:obj:`int`, `optional`, defaults to 512): - The maximum sequence length that this model might ever be used with. - Typically set this to something large just in case (e.g., 512 or 1024 or 2048). - type_vocab_size (:obj:`int`, `optional`, defaults to 2): - The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.XxxModel` or - :class:`~transformers.TFXxxModel`. - initializer_range (:obj:`float`, `optional`, defaults to 0.02): - The standard deviation of the :obj:`truncated_normal_initializer` for initializing all weight matrices. - layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-5): - The epsilon used by the layer normalization layers. - gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`): - If :obj:`True`, use gradient checkpointing to save memory at the expense of slower backward pass. - kwargs: - Additional arguments for common configurations, passed to :class:`~transformers.PretrainedConfig`. 
- """ - model_type = "xxx" - - def __init__( - self, - vocab_size: int = 50257, - hidden_size: int = 1024, - num_hidden_layers: int = 12, - num_attention_heads: int = 12, - hidden_act: Union[str, Callable] = "gelu", - hidden_dropout_prob: float = 0.1, - attention_probs_dropout_prob: float = 0.1, - max_position_embeddings: int = 512, - type_vocab_size: int = 2, - initializer_range: float = 0.02, - layer_norm_epsilon: float = 1e-5, - gradient_checkpointing: bool = False, - **kwargs - ): - super().__init__(**kwargs) - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_epsilon = layer_norm_epsilon - self.gradient_checkpointing = gradient_checkpointing diff --git a/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py b/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py deleted file mode 100755 index b57d3bbdcaeacc..00000000000000 --- a/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py +++ /dev/null @@ -1,61 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert XXX checkpoint.""" - - -import argparse -import logging - -import torch - -from transformers import XxxConfig, XxxForPreTraining, load_tf_weights_in_xxx - - -logging.basicConfig(level=logging.INFO) - - -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): - # Initialise PyTorch model - config = XxxConfig.from_json_file(config_file) - print("Building PyTorch model from configuration: {}".format(str(config))) - model = XxxForPreTraining(config) - - # Load weights from tf checkpoint - load_tf_weights_in_xxx(model, config, tf_checkpoint_path) - - # Save pytorch-model - print("Save PyTorch model to {}".format(pytorch_dump_path)) - torch.save(model.state_dict(), pytorch_dump_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Required parameters - parser.add_argument( - "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." - ) - parser.add_argument( - "--config_file", - default=None, - type=str, - required=True, - help="The config json file corresponding to the pre-trained model. \n" - "This specifies the model architecture.", - ) - parser.add_argument( - "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 
- ) - args = parser.parse_args() - convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration.json b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration.json new file mode 100644 index 00000000000000..71c31a09c9da17 --- /dev/null +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration.json @@ -0,0 +1,10 @@ +{ + "modelname": "{{cookiecutter.modelname}}", + "uppercase_modelname": "{{cookiecutter.uppercase_modelname}}", + "lowercase_modelname": "{{cookiecutter.lowercase_modelname}}", + "camelcase_modelname": "{{cookiecutter.camelcase_modelname}}", + "authors": "{{cookiecutter.authors}}", + "checkpoint_identifier": "{{cookiecutter.checkpoint_identifier}}", + "tokenizer_type": "{{cookiecutter.tokenizer_type}}", + "generate_tensorflow_and_pytorch": "{{cookiecutter.generate_tensorflow_and_pytorch}}" +} diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration_{{cookiecutter.lowercase_modelname}}.py new file mode 100644 index 00000000000000..2e064271113533 --- /dev/null +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration_{{cookiecutter.lowercase_modelname}}.py @@ -0,0 +1,129 @@ +# coding=utf-8 +# Copyright {{cookiecutter.authors}} and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" {{cookiecutter.modelname}} model configuration """ + +from .configuration_utils import PretrainedConfig +from .utils import logging + + +logger = logging.get_logger(__name__) + +{{cookiecutter.uppercase_modelname}}_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/config.json", + # See all {{cookiecutter.modelname}} models at https://huggingface.co/models?filter={{cookiecutter.lowercase_modelname}} +} + + +class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a :class:`~transformers.{{cookiecutter.camelcase_modelname}}Model`. + It is used to instantiate an {{cookiecutter.modelname}} model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the {{cookiecutter.modelname}} `{{cookiecutter.checkpoint_identifier}} <https://huggingface.co/{{cookiecutter.checkpoint_identifier}}>`__ architecture. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information.
+ + + Args: + vocab_size (:obj:`int`, `optional`, defaults to 30522): + Vocabulary size of the {{cookiecutter.modelname}} model. Defines the number of different tokens that can be represented by the + :obj:`inputs_ids` passed when calling :class:`~transformers.{{cookiecutter.camelcase_modelname}}Model` or + :class:`~transformers.TF{{cookiecutter.camelcase_modelname}}Model`. + hidden_size (:obj:`int`, `optional`, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (:obj:`int`, `optional`, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (:obj:`int`, `optional`, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (:obj:`int`, `optional`, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. + If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported. + hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (:obj:`int`, `optional`, defaults to 512): + The maximum sequence length that this model might ever be used with. + Typically set this to something large just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (:obj:`int`, `optional`, defaults to 2): + The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.{{cookiecutter.camelcase_modelname}}Model` or + :class:`~transformers.TF{{cookiecutter.camelcase_modelname}}Model`. + initializer_range (:obj:`float`, `optional`, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12): + The epsilon used by the layer normalization layers.
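+ is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether the model is used as an encoder/decoder or not. + pad_token_id (:obj:`int`, `optional`, defaults to 1): + The id of the padding token. + bos_token_id (:obj:`int`, `optional`, defaults to 0): + The id of the beginning-of-sequence token. + eos_token_id (:obj:`int`, `optional`, defaults to 2): + The id of the end-of-sequence token.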
+ + Example:: + + >>> from transformers import {{cookiecutter.camelcase_modelname}}Model, {{cookiecutter.camelcase_modelname}}Config + + >>> # Initializing a {{cookiecutter.modelname}} {{cookiecutter.checkpoint_identifier}} style configuration + >>> configuration = {{cookiecutter.camelcase_modelname}}Config() + + >>> # Initializing a model from the {{cookiecutter.checkpoint_identifier}} style configuration + >>> model = {{cookiecutter.camelcase_modelname}}Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + """ + model_type = "{{cookiecutter.lowercase_modelname}}" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + is_encoder_decoder=False, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + **kwargs + ): + super().__init__( + pad_token_id=pad_token_id, + is_encoder_decoder=is_encoder_decoder, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs + ) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + diff --git a/templates/adding_a_new_model/modeling_tf_xxx.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py similarity index 51% rename from templates/adding_a_new_model/modeling_tf_xxx.py rename to templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py index e512dbd7ad4236..d32a1279a91ba7 100644 --- a/templates/adding_a_new_model/modeling_tf_xxx.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py @@ -1,6 +1,5 @@ # coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright 2018 {{cookiecutter.authors}} and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,15 +12,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 XXX model. """ +""" TF 2.0 {{cookiecutter.modelname}} model. 
""" -#################################################### -# In this template, replace all the XXX (various casings) with your model name -#################################################### import tensorflow as tf -from .configuration_xxx import XxxConfig +from .activations_tf import get_tf_activation +from .configuration_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}Config from .file_utils import ( MULTIPLE_CHOICE_DUMMY_INPUTS, add_code_sample_docstrings, @@ -29,6 +26,7 @@ add_start_docstrings_to_model_forward, ) from .modeling_tf_outputs import ( + TFBaseModelOutput, TFBaseModelOutputWithPooling, TFMaskedLMOutput, TFMultipleChoiceModelOutput, @@ -43,6 +41,7 @@ TFQuestionAnsweringLoss, TFSequenceClassificationLoss, TFTokenClassificationLoss, + TFSequenceSummary, get_initializer, keras_serializable, shape_list, @@ -53,72 +52,437 @@ logger = logging.get_logger(__name__) -_CONFIG_FOR_DOC = "XXXConfig" -_TOKENIZER_FOR_DOC = "XxxTokenizer" +_CONFIG_FOR_DOC = "{{cookiecutter.camelcase_modelname}}Config" +_TOKENIZER_FOR_DOC = "{{cookiecutter.camelcase_modelname}}Tokenizer" -#################################################### -# This list contrains shortcut names for some of -# the pretrained weights provided with the models -#################################################### -TF_XXX_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "xxx-base-uncased", - "xxx-large-uncased", +TF_{{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "{{cookiecutter.checkpoint_identifier}}", + # See all {{cookiecutter.modelname}} models at https://huggingface.co/models?filter={{cookiecutter.lowercase_modelname}} ] -#################################################### -# TF 2.0 Models are constructed using Keras imperative API by sub-classing -# - tf.keras.layers.Layer for the layers and -# - TFPreTrainedModel for the models (itself a sub-class of tf.keras.Model) -#################################################### +# Copied from transformers.modeling_tf_bert.TFBertEmbeddings with Bert->{{cookiecutter.camelcase_modelname}} +class TF{{cookiecutter.camelcase_modelname}}Embeddings(tf.keras.layers.Layer): + """Construct the embeddings from word, position and token_type embeddings.""" -#################################################### -# Here is an example of typical layer in a TF 2.0 model of the library -# The classes are usually identical to the PyTorch ones and prefixed with 'TF'. -# -# Note that class __init__ parameters includes **kwargs (send to 'super'). -# This let us have a control on class scope and variable names: -# More precisely, we set the names of the class attributes (lower level layers) to -# to the equivalent attributes names in the PyTorch model so we can have equivalent -# class and scope structure between PyTorch and TF 2.0 models and easily load one in the other. 
-# -# See the conversion methods in modeling_tf_pytorch_utils.py for more details -#################################################### + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size + self.initializer_range = config.initializer_range + self.position_embeddings = tf.keras.layers.Embedding( + config.max_position_embeddings, + config.hidden_size, + embeddings_initializer=get_initializer(self.initializer_range), + name="position_embeddings", + ) + self.token_type_embeddings = tf.keras.layers.Embedding( + config.type_vocab_size, + config.hidden_size, + embeddings_initializer=get_initializer(self.initializer_range), + name="token_type_embeddings", + ) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + + def build(self, input_shape): + """Build shared word embedding layer """ + with tf.name_scope("word_embeddings"): + # Create and initialize weights. The random normal initializer was chosen + # arbitrarily, and works well. + self.word_embeddings = self.add_weight( + "weight", + shape=[self.vocab_size, self.hidden_size], + initializer=get_initializer(self.initializer_range), + ) + + super().build(input_shape) + + def call( + self, + input_ids=None, + position_ids=None, + token_type_ids=None, + inputs_embeds=None, + mode="embedding", + training=False, + ): + """Get token embeddings of inputs. + Args: + inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) + mode: string, a valid value is one of "embedding" and "linear". + Returns: + outputs: (1) If mode == "embedding", output embedding tensor, float32 with + shape [batch_size, length, embedding_size]; (2) mode == "linear", output + linear tensor, float32 with shape [batch_size, length, vocab_size]. + Raises: + ValueError: if mode is not valid. 
+ + Shared weights logic adapted from + https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 + """ + if mode == "embedding": + return self._embedding(input_ids, position_ids, token_type_ids, inputs_embeds, training=training) + elif mode == "linear": + return self._linear(input_ids) + else: + raise ValueError("mode {} is not valid.".format(mode)) + + def _embedding(self, input_ids, position_ids, token_type_ids, inputs_embeds, training=False): + """Applies embedding based on inputs tensor.""" + assert not (input_ids is None and inputs_embeds is None) + + if input_ids is not None: + input_shape = shape_list(input_ids) + else: + input_shape = shape_list(inputs_embeds)[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] + + if token_type_ids is None: + token_type_ids = tf.fill(input_shape, 0) + + if inputs_embeds is None: + inputs_embeds = tf.gather(self.word_embeddings, input_ids) + + position_embeddings = tf.cast(self.position_embeddings(position_ids), inputs_embeds.dtype) + token_type_embeddings = tf.cast(self.token_type_embeddings(token_type_ids), inputs_embeds.dtype) + embeddings = inputs_embeds + position_embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings, training=training) + + return embeddings + + def _linear(self, inputs): + """Computes logits by running inputs through a linear layer. + Args: + inputs: A float32 tensor with shape [batch_size, length, hidden_size] + Returns: + float32 tensor with shape [batch_size, length, vocab_size]. + """ + batch_size = shape_list(inputs)[0] + length = shape_list(inputs)[1] + x = tf.reshape(inputs, [-1, self.hidden_size]) + logits = tf.matmul(x, self.word_embeddings, transpose_b=True) + + return tf.reshape(logits, [batch_size, length, self.vocab_size]) + + +# Copied from transformers.modeling_tf_bert.TFBertSelfAttention with Bert->{{cookiecutter.camelcase_modelname}} +class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + assert config.hidden_size % config.num_attention_heads == 0 + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.query = tf.keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" + ) + self.key = tf.keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" + ) + self.value = tf.keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" + ) + self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x, batch_size): + x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size)) + + return tf.transpose(x, perm=[0, 2, 1, 3]) + + def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False): + batch_size = shape_list(hidden_states)[0] + mixed_query_layer 
= self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) + key_layer = self.transpose_for_scores(mixed_key_layer, batch_size) + value_layer = self.transpose_for_scores(mixed_value_layer, batch_size) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = tf.matmul( + query_layer, key_layer, transpose_b=True + ) # (batch size, num_heads, seq_len_q, seq_len_k) + dk = tf.cast(shape_list(key_layer)[-1], attention_scores.dtype) # scale attention_scores + attention_scores = attention_scores / tf.math.sqrt(dk) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in TF{{cookiecutter.camelcase_modelname}}Model call() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = tf.nn.softmax(attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs, training=training) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = tf.matmul(attention_probs, value_layer) + context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) + context_layer = tf.reshape( + context_layer, (batch_size, -1, self.all_head_size) + ) # (batch_size, seq_len_q, all_head_size) + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) -TFXxxAttention = tf.keras.layers.Layer + return outputs + + +# Copied from transformers.modeling_tf_bert.TFBertSelfOutput with Bert->{{cookiecutter.camelcase_modelname}} +class TF{{cookiecutter.camelcase_modelname}}SelfOutput(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.dense = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + + def call(self, hidden_states, input_tensor, training=False): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + + return hidden_states + + +# Copied from transformers.modeling_tf_bert.TFBertAttention with Bert->{{cookiecutter.camelcase_modelname}} +class TF{{cookiecutter.camelcase_modelname}}Attention(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.self_attention = TF{{cookiecutter.camelcase_modelname}}SelfAttention(config, name="self") + self.dense_output = TF{{cookiecutter.camelcase_modelname}}SelfOutput(config, name="output") + + def prune_heads(self, heads): + raise NotImplementedError + + def call(self, input_tensor, attention_mask, head_mask, output_attentions, training=False): + self_outputs = self.self_attention( + input_tensor, attention_mask, head_mask, output_attentions, training=training + ) + attention_output = self.dense_output(self_outputs[0], input_tensor, training=training) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + + return outputs + + +# Copied from 
transformers.modeling_tf_bert.TFBertIntermediate with Bert->{{cookiecutter.camelcase_modelname}} +class TF{{cookiecutter.camelcase_modelname}}Intermediate(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.dense = tf.keras.layers.Dense( + config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = get_tf_activation(config.hidden_act) + else: + self.intermediate_act_fn = config.hidden_act + + def call(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) -TFXxxIntermediate = tf.keras.layers.Layer + return hidden_states -TFXxxOutput = tf.keras.layers.Layer + +# Copied from transformers.modeling_tf_bert.TFBertOutput with Bert->{{cookiecutter.camelcase_modelname}} +class TF{{cookiecutter.camelcase_modelname}}Output(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.dense = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + + def call(self, hidden_states, input_tensor, training=False): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + + return hidden_states -class TFXxxLayer(tf.keras.layers.Layer): +# Copied from transformers.modeling_tf_bert.TFBertLayer with Bert->{{cookiecutter.camelcase_modelname}} +class TF{{cookiecutter.camelcase_modelname}}Layer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.attention = TFXxxAttention(config, name="attention") - self.intermediate = TFXxxIntermediate(config, name="intermediate") - self.transformer_output = TFXxxOutput(config, name="output") - def call(self, inputs, training=False): - hidden_states, attention_mask, head_mask = inputs + self.attention = TF{{cookiecutter.camelcase_modelname}}Attention(config, name="attention") + self.intermediate = TF{{cookiecutter.camelcase_modelname}}Intermediate(config, name="intermediate") + self.{{cookiecutter.lowercase_modelname}}_output = TF{{cookiecutter.camelcase_modelname}}Output(config, name="output") - attention_outputs = self.attention([hidden_states, attention_mask, head_mask], training=training) + def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False): + attention_outputs = self.attention( + hidden_states, attention_mask, head_mask, output_attentions, training=training + ) attention_output = attention_outputs[0] intermediate_output = self.intermediate(attention_output) - layer_output = self.transformer_output([intermediate_output, attention_output], training=training) + layer_output = self.{{cookiecutter.lowercase_modelname}}_output(intermediate_output, attention_output, training=training) outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them + return outputs -#################################################### -# The full model without a specific pretrained or finetuning head is -# provided as a tf.keras.layers.Layer usually called "TFXxxMainLayer" -#################################################### +class 
TF{{cookiecutter.camelcase_modelname}}Encoder(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.layer = [TF{{cookiecutter.camelcase_modelname}}Layer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)] + + def call( + self, + hidden_states, + attention_mask, + head_mask, + output_attentions, + output_hidden_states, + return_dict, + training=False, + ): + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = layer_module( + hidden_states, attention_mask, head_mask[i], output_attentions, training=training + ) + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) + + return TFBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + +# Copied from transformers.modeling_tf_bert.TFBertPredictionHead with Bert->{{cookiecutter.camelcase_modelname}} +class TF{{cookiecutter.camelcase_modelname}}PredictionHeadTransform(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.dense = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + + if isinstance(config.hidden_act, str): + self.transform_act_fn = get_tf_activation(config.hidden_act) + else: + self.transform_act_fn = config.hidden_act + + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + + def call(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + + return hidden_states + + +# Copied from transformers.modeling_tf_bert.TFBertLMPredictionHead with Bert->{{cookiecutter.camelcase_modelname}} +class TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(tf.keras.layers.Layer): + def __init__(self, config, input_embeddings, **kwargs): + super().__init__(**kwargs) + + self.vocab_size = config.vocab_size + self.transform = TF{{cookiecutter.camelcase_modelname}}PredictionHeadTransform(config, name="transform") + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
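Note: the two comment lines above describe the weight tying used by the LM heads in this diff: the output projection reuses the input embedding matrix, and only a per-token output bias is learned separately. A minimal sketch of the same idea in PyTorch (sizes and variable names are illustrative, not part of the template):

    import torch
    from torch import nn

    hidden_size, vocab_size = 768, 30522               # illustrative sizes
    word_embeddings = nn.Embedding(vocab_size, hidden_size)
    decoder = nn.Linear(hidden_size, vocab_size, bias=False)
    decoder.weight = word_embeddings.weight            # one shared Parameter for input and output
    bias = nn.Parameter(torch.zeros(vocab_size))       # the output-only per-token bias

    hidden_states = torch.randn(2, 5, hidden_size)
    logits = decoder(hidden_states) + bias             # (2, 5, vocab_size)

Sharing a single Parameter keeps the two matrices in sync, which is also why the PyTorch file below links the bias to the decoder so it is resized together with the embeddings.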
+ self.input_embeddings = input_embeddings + + def build(self, input_shape): + self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") + + super().build(input_shape) + + def call(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.input_embeddings(hidden_states, mode="linear") + hidden_states = hidden_states + self.bias + + return hidden_states + + +# Copied from transformers.modeling_tf_bert.TFBertMLMHead with Bert->{{cookiecutter.camelcase_modelname}} +class TF{{cookiecutter.camelcase_modelname}}MLMHead(tf.keras.layers.Layer): + def __init__(self, config, input_embeddings, **kwargs): + super().__init__(**kwargs) + + self.predictions = TF{{cookiecutter.camelcase_modelname}}LMPredictionHead(config, input_embeddings, name="predictions") + + def call(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + + return prediction_scores + + +class TF{{cookiecutter.camelcase_modelname}}NSPHead(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.seq_relationship = tf.keras.layers.Dense( + 2, kernel_initializer=get_initializer(config.initializer_range), name="seq_relationship" + ) + + def call(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + + return seq_relationship_score + + @keras_serializable -class TFXxxMainLayer(tf.keras.layers.Layer): +class TF{{cookiecutter.camelcase_modelname}}MainLayer(tf.keras.layers.Layer): + config_class = {{cookiecutter.camelcase_modelname}}Config + def __init__(self, config, **kwargs): super().__init__(**kwargs) + self.num_hidden_layers = config.num_hidden_layers + self.initializer_range = config.initializer_range + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + self.return_dict = config.use_return_dict + self.embeddings = TF{{cookiecutter.camelcase_modelname}}Embeddings(config, name="embeddings") + self.encoder = TF{{cookiecutter.camelcase_modelname}}Encoder(config, name="encoder") + self.config = config + def get_input_embeddings(self): return self.embeddings @@ -127,7 +491,11 @@ def set_input_embeddings(self, value): self.embeddings.vocab_size = value.shape[0] def _prune_heads(self, heads_to_prune): - raise NotImplementedError # Not implemented yet in the library for TF 2.0 models + """Prunes heads of the model. + heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + See base class PreTrainedModel + """ + raise NotImplementedError def call( self, @@ -182,9 +550,12 @@ def call( if attention_mask is None: attention_mask = tf.fill(input_shape, 1) + if token_type_ids is None: token_type_ids = tf.fill(input_shape, 0) + embedding_output = self.embeddings(input_ids, position_ids, token_type_ids, inputs_embeds, training=training) + # We create a 3D attention mask from a 2D tensor mask. # Sizes are [batch_size, 1, 1, to_seq_length] # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] @@ -197,8 +568,7 @@ def call( # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. 
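The comment block above describes the additive mask trick used by both the TF and PyTorch files: the 0/1 padding mask is broadcast to (batch_size, 1, 1, seq_len) and converted so that kept positions contribute 0.0 and masked positions -10000.0 to the raw scores, which after the softmax is effectively the same as removing them. A minimal sketch with illustrative shapes:

    import tensorflow as tf

    attention_mask = tf.constant([[1.0, 1.0, 1.0, 0.0, 0.0]])     # 1 = real token, 0 = padding
    extended_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)
    additive_mask = (1.0 - extended_mask) * -10000.0              # 0.0 where attended, -10000.0 where masked

    scores = tf.random.normal((1, 12, 5, 5))                      # raw attention scores for 12 heads
    probs = tf.nn.softmax(scores + additive_mask, axis=-1)        # masked columns get ~0 probability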
- - extended_attention_mask = tf.cast(extended_attention_mask, tf.float32) + extended_attention_mask = tf.cast(extended_attention_mask, embedding_output.dtype) extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 # Prepare head mask if needed @@ -212,7 +582,6 @@ def call( head_mask = [None] * self.num_hidden_layers # head_mask = tf.constant([0] * self.num_hidden_layers) - embedding_output = self.embeddings(input_ids, position_ids, token_type_ids, inputs_embeds, training=training) encoder_outputs = self.encoder( embedding_output, extended_attention_mask, @@ -224,43 +593,31 @@ ) sequence_output = encoder_outputs[0] - pooled_output = self.pooler(sequence_output) if not return_dict: return ( sequence_output, - pooled_output, ) + encoder_outputs[1:] - return TFBaseModelOutputWithPooling( + return TFBaseModelOutput( last_hidden_state=sequence_output, - pooler_output=pooled_output, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) -#################################################### -# TFXxxPreTrainedModel is a sub-class of tf.keras.Model -# which take care of loading and saving pretrained weights -# and various common utilities. -# Here you just need to specify a few (self-explanatory) -# pointers for your model. -#################################################### -class TFXxxPreTrainedModel(TFPreTrainedModel): +# Copied from transformers.modeling_tf_bert.TFBertPreTrainedModel with Bert->{{cookiecutter.camelcase_modelname}} +class TF{{cookiecutter.camelcase_modelname}}PreTrainedModel(TFPreTrainedModel): """An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ - config_class = XxxConfig - base_model_prefix = "transformer" + config_class = {{cookiecutter.camelcase_modelname}}Config + base_model_prefix = "{{cookiecutter.lowercase_modelname}}" -XXX_START_DOCSTRING = r""" - The XXX model was proposed in - `XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding - `__ by.... +{{cookiecutter.uppercase_modelname}}_START_DOCSTRING = r""" This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the generic methods the library implements for all its models (such as downloading or saving, resizing the input @@ -289,18 +646,18 @@ class TFXxxPreTrainedModel(TFPreTrainedModel): - a dictionary with one or several input Tensors associated to the input names given in the docstring: :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})` - Parameters: - config (:class:`~transformers.XxxConfig`): Model configuration class with all the parameters of the model. + Args: + config (:class:`~transformers.{{cookiecutter.camelcase_modelname}}Config`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ -XXX_INPUTS_DOCSTRING = r""" +{{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING = r""" Args: input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`): Indices of input sequence tokens in the vocabulary. - Indices can be obtained using :class:`~transformers.BertTokenizer`. + Indices can be obtained using :class:`~transformers.{{cookiecutter.camelcase_modelname}}Tokenizer`. 
See :func:`transformers.PreTrainedTokenizer.__call__` and :func:`transformers.PreTrainedTokenizer.encode` for details. @@ -310,7 +667,7 @@ class TFXxxPreTrainedModel(TFPreTrainedModel): Mask values selected in ``[0, 1]``: - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. + - 0 for tokens that are **masked**. `What are attention masks? <../glossary.html#attention-mask>`__ token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): @@ -352,41 +709,50 @@ class TFXxxPreTrainedModel(TFPreTrainedModel): @add_start_docstrings( - "The bare XXX Model transformer outputting raw hidden-states without any specific head on top.", - XXX_START_DOCSTRING, + "The bare {{cookiecutter.modelname}} Model transformer outputting raw hidden-states without any specific head on top.", + {{cookiecutter.uppercase_modelname}}_START_DOCSTRING, ) -class TFXxxModel(TFXxxPreTrainedModel): +class TF{{cookiecutter.camelcase_modelname}}Model(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) - self.transformer = TFXxxMainLayer(config, name="transformer") - @add_start_docstrings_to_model_forward(XXX_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") + + @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="xxx-base-cased", + checkpoint="{{cookiecutter.checkpoint_identifier}}", output_type=TFBaseModelOutput, config_class=_CONFIG_FOR_DOC, ) def call(self, inputs, **kwargs): - outputs = self.transformer(inputs, **kwargs) - return outputs + outputs = self.{{cookiecutter.lowercase_modelname}}(inputs, **kwargs) + return outputs -TFXxxMLMHead = tf.keras.layers.Layer +@add_start_docstrings("""{{cookiecutter.modelname}} Model with a `language modeling` head on top. """, {{cookiecutter.uppercase_modelname}}_START_DOCSTRING) +class TF{{cookiecutter.camelcase_modelname}}ForMaskedLM(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, TFMaskedLanguageModelingLoss): -@add_start_docstrings("""Xxx Model with a `language modeling` head on top. """, XXX_START_DOCSTRING) -class TFXxxForMaskedLM(TFXxxPreTrainedModel, TFMaskedLanguageModelingLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) - self.transformer = TFXxxMainLayer(config, name="transformer") - self.mlm = TFXxxMLMHead(config, self.transformer.embeddings, name="mlm") + if config.is_decoder: + logger.warning( + "If you want to use `TF{{cookiecutter.camelcase_modelname}}ForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." 
+ ) + + self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") + self.mlm = TF{{cookiecutter.camelcase_modelname}}MLMHead(config, self.{{cookiecutter.lowercase_modelname}}.embeddings, name="mlm___cls") - @add_start_docstrings_to_model_forward(XXX_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def get_output_embeddings(self): + return self.{{cookiecutter.lowercase_modelname}}.embeddings + + @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="xxx-base-cased", + checkpoint="{{cookiecutter.checkpoint_identifier}}", output_type=TFMaskedLMOutput, config_class=_CONFIG_FOR_DOC, ) @@ -411,7 +777,8 @@ def call( Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` """ - return_dict = return_dict if return_dict is not None else self.transformer.return_dict + return_dict = return_dict if return_dict is not None else self.{{cookiecutter.lowercase_modelname}}.return_dict + + if isinstance(inputs, (tuple, list)): labels = inputs[9] if len(inputs) > 9 else labels if len(inputs) > 9: @@ -419,7 +786,7 @@ def call( inputs = inputs[:9] elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) - outputs = self.transformer( + outputs = self.{{cookiecutter.lowercase_modelname}}( inputs, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -434,11 +801,10 @@ def call( sequence_output = outputs[0] prediction_scores = self.mlm(sequence_output, training=training) - loss = None if labels is None else self.compute_loss(labels, prediction_scores) if not return_dict: - output = (prediction_scores,) + outputs[2:] + output = (prediction_scores,) + outputs[1:] return ((loss,) + output) if loss is not None else output return TFMaskedLMOutput( @@ -449,42 +815,65 @@ def call( ) +class TF{{cookiecutter.camelcase_modelname}}ClassificationHead(tf.keras.layers.Layer): + """Head for sentence-level classification tasks.""" + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.dense = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.out_proj = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" + ) + + self.config = config + + def call(self, inputs, **kwargs): + x = inputs[:, 0, :] # take <s> token (equiv. to [CLS]) + x = self.dropout(x) + x = self.dense(x) + x = get_tf_activation(self.config.hidden_act)(x) + x = self.dropout(x) + x = self.out_proj(x) + + return x + + @add_start_docstrings( - """XXX Model transformer with a sequence classification/regression head on top (a linear layer on top of - the pooled output) e.g. for GLUE tasks. """, - XXX_START_DOCSTRING, + """{{cookiecutter.modelname}} Model transformer with a sequence classification/regression head on top + e.g., for GLUE tasks. 
""", + {{cookiecutter.uppercase_modelname}}_START_DOCSTRING, ) -class TFXxxForSequenceClassification(TFXxxPreTrainedModel, TFSequenceClassificationLoss): +class TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, TFSequenceClassificationLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels + self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") + self.classifier = TF{{cookiecutter.camelcase_modelname}}ClassificationHead(config, name="classifier") - self.transformer = TFXxxMainLayer(config, name="transformer") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense( - config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" - ) - - @add_start_docstrings_to_model_forward(XXX_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="xxx-base-cased", + checkpoint="{{cookiecutter.checkpoint_identifier}}", output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC, ) def call( - self, - inputs=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - labels=None, - training=False, + self, + inputs, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + labels=None, + training=False, ): r""" labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`): @@ -493,15 +882,17 @@ def call( If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_dict = return_dict if return_dict is not None else self.transformer.return_dict + return_dict = return_dict if return_dict is not None else self.{{cookiecutter.lowercase_modelname}}.config.return_dict + if isinstance(inputs, (tuple, list)): labels = inputs[9] if len(inputs) > 9 else labels + if len(inputs) > 9: inputs = inputs[:9] elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) - outputs = self.transformer( + outputs = self.{{cookiecutter.lowercase_modelname}}( inputs, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -513,16 +904,12 @@ def call( return_dict=return_dict, training=training, ) - - pooled_output = outputs[1] - - pooled_output = self.dropout(pooled_output, training=training) - logits = self.classifier(pooled_output) - + logits = self.classifier(outputs[0]) loss = None if labels is None else self.compute_loss(labels, logits) if not return_dict: - output = (logits,) + outputs[2:] + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output return TFSequenceClassifierOutput( @@ -534,33 +921,36 @@ def call( @add_start_docstrings( - """XXX Model with a multiple choice classification head on top (a linear layer on top of + """{{cookiecutter.modelname}} Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, - XXX_START_DOCSTRING, + {{cookiecutter.uppercase_modelname}}_START_DOCSTRING, ) -class TFXxxForMultipleChoice(TFXxxPreTrainedModel, TFMultipleChoiceLoss): +class TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, TFMultipleChoiceLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) - self.transformer = TFXxxMainLayer(config, name="transformer") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") + self.sequence_summary = TFSequenceSummary( + config, initializer_range=config.initializer_range, name="sequence_summary" + ) self.classifier = tf.keras.layers.Dense( 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) @property def dummy_inputs(self): - """Dummy inputs to build the network. + """ + Dummy inputs to build the network. Returns: tf.Tensor with dummy inputs """ return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} - @add_start_docstrings_to_model_forward(XXX_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="xxx-base-cased", + checkpoint="{{cookiecutter.checkpoint_identifier}}", output_type=TFMultipleChoiceModelOutput, config_class=_CONFIG_FOR_DOC, ) @@ -583,7 +973,6 @@ def call( Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See :obj:`input_ids` above) - heads. """ if isinstance(inputs, (tuple, list)): input_ids = inputs[0] @@ -611,7 +1000,8 @@ def call( assert len(inputs) <= 10, "Too many inputs." 
else: input_ids = inputs - return_dict = return_dict if return_dict is not None else self.transformer.return_dict + + return_dict = return_dict if return_dict is not None else self.{{cookiecutter.lowercase_modelname}}.config.return_dict if input_ids is not None: num_choices = shape_list(input_ids)[1] @@ -629,8 +1019,7 @@ def call( if inputs_embeds is not None else None ) - - flat_inputs = [ + outputs = self.{{cookiecutter.lowercase_modelname}}( flat_input_ids, flat_attention_mask, flat_token_type_ids, @@ -639,21 +1028,17 @@ def call( flat_inputs_embeds, output_attentions, output_hidden_states, - return_dict, - ] - - outputs = self.transformer(flat_inputs, training=training) - - pooled_output = outputs[1] - - pooled_output = self.dropout(pooled_output, training=training) - logits = self.classifier(pooled_output) + return_dict=return_dict, + training=training, + ) + logits = self.sequence_summary(outputs[0]) + logits = self.classifier(logits) reshaped_logits = tf.reshape(logits, (-1, num_choices)) - loss = None if labels is None else self.compute_loss(labels, reshaped_logits) if not return_dict: - output = (reshaped_logits,) + outputs[2:] + output = (reshaped_logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output return TFMultipleChoiceModelOutput( @@ -663,27 +1048,27 @@ def call( attentions=outputs.attentions, ) - @add_start_docstrings( - """XXX Model with a token classification head on top (a linear layer on top of + """{{cookiecutter.modelname}} Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, - XXX_START_DOCSTRING, + {{cookiecutter.uppercase_modelname}}_START_DOCSTRING, ) -class TFXxxForTokenClassification(TFXxxPreTrainedModel, TFTokenClassificationLoss): +class TF{{cookiecutter.camelcase_modelname}}ForTokenClassification(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, TFTokenClassificationLoss): + def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) - self.num_labels = config.num_labels - self.transformer = TFXxxMainLayer(config, name="transformer") + self.num_labels = config.num_labels + self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) self.classifier = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" ) - @add_start_docstrings_to_model_forward(XXX_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="xxx-base-cased", + checkpoint="{{cookiecutter.checkpoint_identifier}}", output_type=TFTokenClassifierOutput, config_class=_CONFIG_FOR_DOC, ) @@ -706,7 +1091,8 @@ def call( Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. 
""" - return_dict = return_dict if return_dict is not None else self.transformer.return_dict + return_dict = return_dict if return_dict is not None else self.{{cookiecutter.lowercase_modelname}}.return_dict + if isinstance(inputs, (tuple, list)): labels = inputs[9] if len(inputs) > 9 else labels if len(inputs) > 9: @@ -714,7 +1100,7 @@ def call( elif isinstance(inputs, (dict, BatchEncoding)): labels = inputs.pop("labels", labels) - outputs = self.transformer( + outputs = self.{{cookiecutter.lowercase_modelname}}( inputs, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -726,16 +1112,13 @@ def call( return_dict=return_dict, training=training, ) - sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output, training=training) logits = self.classifier(sequence_output) - loss = None if labels is None else self.compute_loss(labels, logits) if not return_dict: - output = (logits,) + outputs[2:] + output = (logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output return TFTokenClassifierOutput( @@ -747,24 +1130,25 @@ def call( @add_start_docstrings( - """XXX Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + """{{cookiecutter.modelname}} Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`). """, - XXX_START_DOCSTRING, + {{cookiecutter.uppercase_modelname}}_START_DOCSTRING, ) -class TFXxxForQuestionAnswering(TFXxxPreTrainedModel, TFQuestionAnsweringLoss): +class TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering(TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, TFQuestionAnsweringLoss): + def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) - self.num_labels = config.num_labels - self.transformer = TFXxxMainLayer(config, name="transformer") + self.num_labels = config.num_labels + self.{{cookiecutter.lowercase_modelname}} = TF{{cookiecutter.camelcase_modelname}}MainLayer(config, name="{{cookiecutter.lowercase_modelname}}") self.qa_outputs = tf.keras.layers.Dense( config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" ) - @add_start_docstrings_to_model_forward(XXX_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="xxx-base-cased", + checkpoint="{{cookiecutter.checkpoint_identifier}}", output_type=TFQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, ) @@ -793,7 +1177,8 @@ def call( Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
""" - return_dict = return_dict if return_dict is not None else self.transformer.return_dict + return_dict = return_dict if return_dict is not None else self.{{cookiecutter.lowercase_modelname}}.return_dict + if isinstance(inputs, (tuple, list)): start_positions = inputs[9] if len(inputs) > 9 else start_positions end_positions = inputs[10] if len(inputs) > 10 else end_positions @@ -803,7 +1188,7 @@ def call( start_positions = inputs.pop("start_positions", start_positions) end_positions = inputs.pop("end_positions", start_positions) - outputs = self.transformer( + outputs = self.{{cookiecutter.lowercase_modelname}}( inputs, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -815,22 +1200,20 @@ def call( return_dict=return_dict, training=training, ) - sequence_output = outputs[0] - logits = self.qa_outputs(sequence_output) start_logits, end_logits = tf.split(logits, 2, axis=-1) start_logits = tf.squeeze(start_logits, axis=-1) end_logits = tf.squeeze(end_logits, axis=-1) - loss = None + if start_positions is not None and end_positions is not None: labels = {"start_position": start_positions} labels["end_position"] = end_positions loss = self.compute_loss(labels, (start_logits, end_logits)) if not return_dict: - output = (start_logits, end_logits) + outputs[2:] + output = (start_logits, end_logits) + outputs[1:] return ((loss,) + output) if loss is not None else output return TFQuestionAnsweringModelOutput( diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py new file mode 100755 index 00000000000000..cb65d5246d665f --- /dev/null +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py @@ -0,0 +1,1234 @@ +# coding=utf-8 +# Copyright 2020 {{cookiecutter.authors}} and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch {{cookiecutter.modelname}} model. 
""" + + + +import math +import os +import warnings + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss, MSELoss + +from .configuration_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}Config +from .file_utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, +) +from .modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPooling, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from .modeling_utils import ( + PreTrainedModel, + SequenceSummary, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) +from .utils import logging +from .activations import ACT2FN + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "{{cookiecutter.camelcase_modelname}}Config" +_TOKENIZER_FOR_DOC = "{{cookiecutter.camelcase_modelname}}Tokenizer" + +{{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "{{cookiecutter.checkpoint_identifier}}", + # See all {{cookiecutter.modelname}} models at https://huggingface.co/models?filter={{cookiecutter.lowercase_modelname}} +] + + +def load_tf_weights_in_{{cookiecutter.lowercase_modelname}}(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info("Loading TF weight {} with shape {}".format(name, shape)) + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split("/") + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any( + n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] + for n in name + ): + logger.info("Skipping {}".format("/".join(name))) + continue + pointer = model + for m_name in name: + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + scope_names = re.split(r"_(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] == "kernel" or scope_names[0] == "gamma": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "output_weights": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "squad": + pointer = getattr(pointer, "classifier") + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info("Skipping {}".format("/".join(name))) + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if m_name[-11:] == "_embeddings": + pointer = getattr(pointer, "weight") + elif m_name == "kernel": + array = np.transpose(array) + try: + assert ( + pointer.shape == array.shape + ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched" + except 
AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array) + return model + + +def mish(x): + return x * torch.tanh(nn.functional.softplus(x)) + + +# Copied from transformers.modeling_bert.BertEmbeddings with Bert->{{cookiecutter.camelcase_modelname}} +class {{cookiecutter.camelcase_modelname}}Embeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + + def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + position_embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +# Copied from transformers.modeling_bert.BertSelfAttention with Bert->{{cookiecutter.camelcase_modelname}} +class {{cookiecutter.camelcase_modelname}}SelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this 
is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + if encoder_hidden_states is not None: + mixed_key_layer = self.key(encoder_hidden_states) + mixed_value_layer = self.value(encoder_hidden_states) + attention_mask = encoder_attention_mask + else: + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask (precomputed for all layers in {{cookiecutter.camelcase_modelname}}Model forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + return outputs + + +# Copied from transformers.modeling_bert.BertSelfOutput with Bert->{{cookiecutter.camelcase_modelname}} +class {{cookiecutter.camelcase_modelname}}SelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.modeling_bert.BertAttention with Bert->{{cookiecutter.camelcase_modelname}} +class {{cookiecutter.camelcase_modelname}}Attention(nn.Module): + def __init__(self, config): + super().__init__() + self.self = {{cookiecutter.camelcase_modelname}}SelfAttention(config) + self.output = {{cookiecutter.camelcase_modelname}}SelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = 
self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.modeling_bert.BertIntermediate with Bert->{{cookiecutter.camelcase_modelname}} +class {{cookiecutter.camelcase_modelname}}Intermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.modeling_bert.BertOutput with Bert->{{cookiecutter.camelcase_modelname}} +class {{cookiecutter.camelcase_modelname}}Output(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.modeling_bert.BertLayer with Bert->{{cookiecutter.camelcase_modelname}} +class {{cookiecutter.camelcase_modelname}}Layer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = {{cookiecutter.camelcase_modelname}}Attention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added" + self.crossattention = {{cookiecutter.camelcase_modelname}}Attention(config) + self.intermediate = {{cookiecutter.camelcase_modelname}}Intermediate(config) + self.output = {{cookiecutter.camelcase_modelname}}Output(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + output_attentions=False, + ): + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + if self.is_decoder and encoder_hidden_states is not None: + assert hasattr( + self, "crossattention" + ), f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`" + cross_attention_outputs = self.crossattention( + attention_output, + 
attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +# Copied from transformers.modeling_bert.BertEncoder with Bert->{{cookiecutter.camelcase_modelname}} +class {{cookiecutter.camelcase_modelname}}Encoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([{{cookiecutter.camelcase_modelname}}Layer(config) for _ in range(config.num_hidden_layers)]) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False, + ): + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + if getattr(self.config, "gradient_checkpointing", False): + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions, + ) + hidden_states = layer_outputs[0] + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + +# Copied from transformers.modeling_bert.BertPredictionHead with Bert->{{cookiecutter.camelcase_modelname}} +class {{cookiecutter.camelcase_modelname}}PredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +# Copied from transformers.modeling_bert.BertLMPredictionHead with Bert->{{cookiecutter.camelcase_modelname}} +class {{cookiecutter.camelcase_modelname}}LMPredictionHead(nn.Module): + def __init__(self, config): + 
super().__init__() + self.transform = {{cookiecutter.camelcase_modelname}}PredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +# Copied from transformers.modeling_bert.BertOnlyMLMHead with Bert->{{cookiecutter.camelcase_modelname}} +class {{cookiecutter.camelcase_modelname}}OnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = {{cookiecutter.camelcase_modelname}}LMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class {{cookiecutter.camelcase_modelname}}PreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and + a simple interface for downloading and loading pretrained models. + """ + + config_class = {{cookiecutter.camelcase_modelname}}Config + load_tf_weights = load_tf_weights_in_{{cookiecutter.lowercase_modelname}} + base_model_prefix = "{{cookiecutter.lowercase_modelname}}" + authorized_missing_keys = [r"position_ids"] + + def _init_weights(self, module): + """ Initialize the weights """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +{{cookiecutter.uppercase_modelname}}_START_DOCSTRING = r""" + This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general + usage and behavior. + + Parameters: + config (:class:`~transformers.{{cookiecutter.camelcase_modelname}}Config`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +{{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`transformers.{{cookiecutter.camelcase_modelname}}Tokenizer`. + See :func:`transformers.PreTrainedTokenizer.encode` and + :func:`transformers.PreTrainedTokenizer.__call__` for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? 
<../glossary.html#attention-mask>`__ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + position_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range ``[0, config.max_position_embeddings - 1]``. + + `What are position IDs? <../glossary.html#position-ids>`_ + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare {{cookiecutter.modelname}} Model transformer outputting raw hidden-states without any specific head on top.", + {{cookiecutter.uppercase_modelname}}_START_DOCSTRING, +) +class {{cookiecutter.camelcase_modelname}}Model({{cookiecutter.camelcase_modelname}}PreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well + as a decoder, in which case a layer of cross-attention is added between + the self-attention layers, following the architecture described in `Attention is + all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, + Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as a decoder the model needs to be initialized with the + :obj:`is_decoder` argument of the configuration set to :obj:`True`. + To be used in a Seq2Seq model, the model needs to be initialized with both :obj:`is_decoder` + argument and :obj:`add_cross_attention` set to :obj:`True`; an + :obj:`encoder_hidden_states` is then expected as an input to the forward pass. + """ + + def __init__(self, config): + super().__init__(config) + self.config = config + + self.embeddings = {{cookiecutter.camelcase_modelname}}Embeddings(config) + self.encoder = {{cookiecutter.camelcase_modelname}}Encoder(config) + + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """Prunes heads of the model. 
+ heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + See base class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="{{cookiecutter.checkpoint_identifier}}", + output_type=BaseModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + if the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask + is used in the cross-attention if the model is configured as a decoder. + Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if attention_mask is None: + attention_mask = torch.ones(input_shape, device=device) + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
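The comment above notes that callers may supply a full 3D self-attention mask of shape [batch_size, from_seq_length, to_seq_length]; it only has to be made broadcastable over the head dimension before being added to the raw scores. A sketch using a causal mask as the 3D example (shapes are illustrative):

    import torch

    batch_size, seq_len = 2, 4
    mask_3d = torch.tril(torch.ones(batch_size, seq_len, seq_len))  # (batch, from_seq, to_seq) causal mask
    extended = mask_3d[:, None, :, :]                # (batch, 1, from_seq, to_seq), broadcasts over heads
    additive = (1.0 - extended) * -10000.0           # added to the raw scores before the softmax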
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+
+        # If a 2D or 3D attention mask is provided for the cross-attention,
+        # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if self.config.is_decoder and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+            if encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+        else:
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicates we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
+        )
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+
+        if not return_dict:
+            return (sequence_output,) + encoder_outputs[1:]
+
+        return BaseModelOutput(
+            last_hidden_state=sequence_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+@add_start_docstrings("""{{cookiecutter.modelname}} Model with a `language modeling` head on top. """, {{cookiecutter.uppercase_modelname}}_START_DOCSTRING)
+class {{cookiecutter.camelcase_modelname}}ForMaskedLM({{cookiecutter.camelcase_modelname}}PreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        if config.is_decoder:
+            logger.warning(
+                "If you want to use `{{cookiecutter.camelcase_modelname}}ForMaskedLM` make sure `config.is_decoder=False` for "
+                "bi-directional self-attention."
+            )
+
+        self.{{cookiecutter.lowercase_modelname}} = {{cookiecutter.camelcase_modelname}}Model(config)
+        self.cls = {{cookiecutter.camelcase_modelname}}OnlyMLMHead(config)
+
+        self.init_weights()
+
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(
+        tokenizer_class=_TOKENIZER_FOR_DOC,
+        checkpoint="{{cookiecutter.checkpoint_identifier}}",
+        output_type=MaskedLMOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Labels for computing the masked language modeling loss.
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring).
+            Tokens with indices set to ``-100`` are ignored (masked); the loss is only computed for the tokens with labels
+            in ``[0, ..., config.vocab_size]``.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.{{cookiecutter.lowercase_modelname}}(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        prediction_scores = self.cls(sequence_output)
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()  # -100 index = padding token
+            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            output = (prediction_scores,) + outputs[1:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return MaskedLMOutput(
+            loss=masked_lm_loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
+        input_shape = input_ids.shape
+        effective_batch_size = input_shape[0]
+
+        # add a dummy token
+        assert self.config.pad_token_id is not None, "The PAD token should be defined for generation"
+        attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
+        dummy_token = torch.full(
+            (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
+        )
+        input_ids = torch.cat([input_ids, dummy_token], dim=1)
+
+        return {"input_ids": input_ids, "attention_mask": attention_mask}
+
+
+class {{cookiecutter.camelcase_modelname}}ClassificationHead(nn.Module):
+    """Head for sentence-level classification tasks."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+
+        self.config = config
+
+    def forward(self, features, **kwargs):
+        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
+        x = self.dropout(x)
+        x = self.dense(x)
+        x = ACT2FN[self.config.hidden_act](x)
+        x = self.dropout(x)
+        x = self.out_proj(x)
+        return x
+
+
+@add_start_docstrings(
+    """{{cookiecutter.modelname}} Model transformer with a sequence classification/regression head on top (a linear layer on top of
+    the pooled output) e.g. for GLUE tasks.
""", + {{cookiecutter.uppercase_modelname}}_START_DOCSTRING, +) +class {{cookiecutter.camelcase_modelname}}ForSequenceClassification({{cookiecutter.camelcase_modelname}}PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.{{cookiecutter.lowercase_modelname}} = {{cookiecutter.camelcase_modelname}}Model(config) + self.classifier = {{cookiecutter.camelcase_modelname}}ClassificationHead(config) + + self.init_weights() + + @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="{{cookiecutter.checkpoint_identifier}}", + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. + Indices should be in :obj:`[0, ..., config.num_labels - 1]`. + If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.{{cookiecutter.lowercase_modelname}}( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + if self.num_labels == 1: + # We are doing regression + loss_fct = MSELoss() + loss = loss_fct(logits.view(-1), labels.view(-1)) + else: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + +@add_start_docstrings( + """{{cookiecutter.modelname}} Model with a multiple choice classification head on top (a linear layer on top of + the pooled output and a softmax) e.g. for RocStories/SWAG tasks. 
""", + {{cookiecutter.uppercase_modelname}}_START_DOCSTRING, +) +class {{cookiecutter.camelcase_modelname}}ForMultipleChoice({{cookiecutter.camelcase_modelname}}PreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.{{cookiecutter.lowercase_modelname}} = {{cookiecutter.camelcase_modelname}}Model(config) + self.sequence_summary = SequenceSummary(config) + self.classifier = nn.Linear(config.hidden_size, 1) + + self.init_weights() + + @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="{{cookiecutter.checkpoint_identifier}}", + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the multiple choice classification loss. + Indices should be in ``[0, ..., num_choices-1]`` where :obj:`num_choices` is the size of the second dimension + of the input tensors. (See :obj:`input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.{{cookiecutter.lowercase_modelname}}( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + pooled_output = self.sequence_summary(sequence_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """{{cookiecutter.modelname}} Model with a token classification head on top (a linear layer on top of + the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
""", + {{cookiecutter.uppercase_modelname}}_START_DOCSTRING, +) +class {{cookiecutter.camelcase_modelname}}ForTokenClassification({{cookiecutter.camelcase_modelname}}PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.{{cookiecutter.lowercase_modelname}} = {{cookiecutter.camelcase_modelname}}Model(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="{{cookiecutter.checkpoint_identifier}}", + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the token classification loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.{{cookiecutter.lowercase_modelname}}( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) + ) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """{{cookiecutter.modelname}} Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", + {{cookiecutter.uppercase_modelname}}_START_DOCSTRING, +) +class {{cookiecutter.camelcase_modelname}}ForQuestionAnswering({{cookiecutter.camelcase_modelname}}PreTrainedModel): + def __init__(self, config): + super().__init__(config) + + config.num_labels = 2 + self.num_labels = config.num_labels + + self.{{cookiecutter.lowercase_modelname}} = {{cookiecutter.camelcase_modelname}}Model(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + @add_start_docstrings_to_model_forward({{cookiecutter.uppercase_modelname}}_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="{{cookiecutter.checkpoint_identifier}}", + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.{{cookiecutter.lowercase_modelname}}( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[1:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_tf_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_tf_{{cookiecutter.lowercase_modelname}}.py new file mode 100644 index 00000000000000..9846fce625c7a8 --- /dev/null +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_tf_{{cookiecutter.lowercase_modelname}}.py @@ -0,0 +1,271 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+
+import unittest
+
+from transformers import {{cookiecutter.camelcase_modelname}}Config, is_tf_available
+from transformers.testing_utils import require_tf, slow
+
+from .test_configuration_common import ConfigTester
+from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
+
+
+if is_tf_available():
+    import tensorflow as tf
+
+    from transformers.modeling_tf_{{cookiecutter.lowercase_modelname}} import (
+        TF{{cookiecutter.camelcase_modelname}}ForMaskedLM,
+        TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice,
+        TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
+        TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification,
+        TF{{cookiecutter.camelcase_modelname}}ForTokenClassification,
+        TF{{cookiecutter.camelcase_modelname}}Model,
+    )
+
+
+class TF{{cookiecutter.camelcase_modelname}}ModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=13,
+        seq_length=7,
+        is_training=True,
+        use_input_mask=True,
+        use_token_type_ids=True,
+        use_labels=True,
+        vocab_size=99,
+        hidden_size=32,
+        num_hidden_layers=5,
+        num_attention_heads=4,
+        intermediate_size=37,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=16,
+        type_sequence_label_size=2,
+        initializer_range=0.02,
+        num_labels=3,
+        num_choices=4,
+        scope=None,
+    ):
+        self.parent = parent
+        # Use the constructor arguments rather than re-hardcoding the defaults, so the
+        # tester can actually be parameterized (this mirrors the PyTorch tester below).
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.is_training = is_training
+        self.use_input_mask = use_input_mask
+        self.use_token_type_ids = use_token_type_ids
+        self.use_labels = use_labels
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.type_sequence_label_size = type_sequence_label_size
+        self.initializer_range = initializer_range
+        self.num_labels = num_labels
+        self.num_choices = num_choices
+        self.scope = scope
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+        input_mask = None
+        if self.use_input_mask:
+            input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
+
+        token_type_ids = None
+        if self.use_token_type_ids:
+            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
+
+        sequence_labels = None
+        token_labels = None
+        choice_labels = None
+        if self.use_labels:
+            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+            choice_labels = ids_tensor([self.batch_size], self.num_choices)
+
+        config = {{cookiecutter.camelcase_modelname}}Config(
+            vocab_size=self.vocab_size,
+            hidden_size=self.hidden_size,
+            num_hidden_layers=self.num_hidden_layers,
+            num_attention_heads=self.num_attention_heads,
+            intermediate_size=self.intermediate_size,
+            hidden_act=self.hidden_act,
+            hidden_dropout_prob=self.hidden_dropout_prob,
+            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+            max_position_embeddings=self.max_position_embeddings,
+            type_vocab_size=self.type_vocab_size,
+            initializer_range=self.initializer_range,
+            return_dict=True,
+        )
+
+        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+
+    def create_and_check_model(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        model = TF{{cookiecutter.camelcase_modelname}}Model(config=config)
inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + + inputs = [input_ids, input_mask] + result = model(inputs) + + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TF{{cookiecutter.camelcase_modelname}}ForMaskedLM(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice(config=config) + multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) + multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) + multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) + inputs = { + "input_ids": multiple_choice_inputs_ids, + "attention_mask": multiple_choice_input_mask, + "token_type_ids": multiple_choice_token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TF{{cookiecutter.camelcase_modelname}}ForTokenClassification(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + + result = model(inputs) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + 
+@require_tf +class TF{{cookiecutter.camelcase_modelname}}ModelTest(TFModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + TF{{cookiecutter.camelcase_modelname}}Model, + TF{{cookiecutter.camelcase_modelname}}ForMaskedLM, + TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering, + TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification, + TF{{cookiecutter.camelcase_modelname}}ForTokenClassification, + TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice, + ) + if is_tf_available() + else () + ) + + def setUp(self): + self.model_tester = TF{{cookiecutter.camelcase_modelname}}ModelTester(self) + self.config_tester = ConfigTester(self, config_class={{cookiecutter.camelcase_modelname}}Config, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model = TF{{cookiecutter.camelcase_modelname}}Model.from_pretrained("{{cookiecutter.checkpoint_identifier}}") + self.assertIsNotNone(model) diff --git a/templates/adding_a_new_model/tests/test_modeling_xxx.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_{{cookiecutter.lowercase_modelname}}.py similarity index 62% rename from templates/adding_a_new_model/tests/test_modeling_xxx.py rename to templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_{{cookiecutter.lowercase_modelname}}.py index b2474ce9a0ac53..15263c360580b1 100644 --- a/templates/adding_a_new_model/tests/test_modeling_xxx.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_{{cookiecutter.lowercase_modelname}}.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2018 XXX Authors. +# Copyright 2018 The Google AI Language Team Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,61 +12,56 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" Testing suite for the PyTorch {{cookiecutter.modelname}} model. 
""" import unittest from transformers import is_torch_available -from transformers.testing_utils import require_torch, require_torch_gpu, slow, torch_device - +from transformers.testing_utils import require_torch, slow, torch_device from .test_configuration_common import ConfigTester -from .test_modeling_common import ModelTesterMixin, ids_tensor + +from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask if is_torch_available(): from transformers import ( - AutoModelForMaskedLM, - AutoTokenizer, - XxxConfig, - XxxForMaskedLM, - XxxForMultipleChoice, - XxxForQuestionAnswering, - XxxForSequenceClassification, - XxxForTokenClassification, - XxxModel, + {{cookiecutter.camelcase_modelname}}Config, + {{cookiecutter.camelcase_modelname}}ForMaskedLM, + {{cookiecutter.camelcase_modelname}}ForMultipleChoice, + {{cookiecutter.camelcase_modelname}}ForQuestionAnswering, + {{cookiecutter.camelcase_modelname}}ForSequenceClassification, + {{cookiecutter.camelcase_modelname}}ForTokenClassification, + {{cookiecutter.camelcase_modelname}}Model, ) - from transformers.file_utils import cached_property - - # - + from transformers.modeling_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST -class XxxModelTester: - """You can also import this e.g from .test_modeling_bart import BartModelTester """ +class {{cookiecutter.camelcase_modelname}}ModelTester: def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, ): self.parent = parent self.batch_size = batch_size @@ -96,7 +91,7 @@ def prepare_config_and_inputs(self): input_mask = None if self.use_input_mask: - input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + input_mask = random_attention_mask([self.batch_size, self.seq_length]) token_type_ids = None if self.use_token_type_ids: @@ -110,7 +105,7 @@ def prepare_config_and_inputs(self): token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) choice_labels = ids_tensor([self.batch_size], self.num_choices) - config = XxxConfig( + config = {{cookiecutter.camelcase_modelname}}Config( vocab_size=self.vocab_size, hidden_size=self.hidden_size, num_hidden_layers=self.num_hidden_layers, @@ -121,6 +116,7 @@ def prepare_config_and_inputs(self): attention_probs_dropout_prob=self.attention_probs_dropout_prob, max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, + is_decoder=False, initializer_range=self.initializer_range, return_dict=True, ) @@ -128,30 +124,29 @@ def prepare_config_and_inputs(self): return config, input_ids, token_type_ids, 
input_mask, sequence_labels, token_labels, choice_labels def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): - model = XxxModel(config=config) + model = {{cookiecutter.camelcase_modelname}}Model(config=config) model.to(torch_device) model.eval() result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) result = model(input_ids, token_type_ids=token_type_ids) result = model(input_ids) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) def create_and_check_for_masked_lm( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): - model = XxxForMaskedLM(config=config) + model = {{cookiecutter.camelcase_modelname}}ForMaskedLM(config=config) model.to(torch_device) model.eval() result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) def create_and_check_for_question_answering( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): - model = XxxForQuestionAnswering(config=config) + model = {{cookiecutter.camelcase_modelname}}ForQuestionAnswering(config=config) model.to(torch_device) model.eval() result = model( @@ -165,30 +160,30 @@ def create_and_check_for_question_answering( self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) def create_and_check_for_sequence_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): config.num_labels = self.num_labels - model = XxxForSequenceClassification(config) + model = {{cookiecutter.camelcase_modelname}}ForSequenceClassification(config) model.to(torch_device) model.eval() result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) def create_and_check_for_token_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): config.num_labels = self.num_labels - model = XxxForTokenClassification(config=config) + model = {{cookiecutter.camelcase_modelname}}ForTokenClassification(config=config) model.to(torch_device) model.eval() result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) def create_and_check_for_multiple_choice( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): config.num_choices = self.num_choices - 
model = XxxForMultipleChoice(config=config) + model = {{cookiecutter.camelcase_modelname}}ForMultipleChoice(config=config) model.to(torch_device) model.eval() multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() @@ -218,17 +213,24 @@ def prepare_config_and_inputs_for_common(self): @require_torch -class XxxModelTest(ModelTesterMixin, unittest.TestCase): +class {{cookiecutter.camelcase_modelname}}ModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = ( - (XxxModel, XxxForMaskedLM, XxxForQuestionAnswering, XxxForSequenceClassification, XxxForTokenClassification) + ( + {{cookiecutter.camelcase_modelname}}Model, + {{cookiecutter.camelcase_modelname}}ForMaskedLM, + {{cookiecutter.camelcase_modelname}}ForMultipleChoice, + {{cookiecutter.camelcase_modelname}}ForQuestionAnswering, + {{cookiecutter.camelcase_modelname}}ForSequenceClassification, + {{cookiecutter.camelcase_modelname}}ForTokenClassification, + ) if is_torch_available() else () ) def setUp(self): - self.model_tester = XxxModelTester(self) - self.config_tester = ConfigTester(self, config_class=XxxConfig, hidden_size=37) + self.model_tester = {{cookiecutter.camelcase_modelname}}ModelTester(self) + self.config_tester = ConfigTester(self, config_class={{cookiecutter.camelcase_modelname}}Config, hidden_size=37) def test_config(self): self.config_tester.run_common_tests() @@ -241,6 +243,10 @@ def test_for_masked_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + def test_for_question_answering(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_question_answering(*config_and_inputs) @@ -253,55 +259,10 @@ def test_for_token_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_electra_for_multiple_choice(*config_and_inputs) - @slow - def test_lm_outputs_same_as_reference_model(self): - """Write something that could help someone fixing this here.""" - checkpoint_path = "XXX/bart-large" - model = self.big_model - tokenizer = AutoTokenizer.from_pretrained( - checkpoint_path - ) # same with AutoTokenizer (see tokenization_auto.py). This is not mandatory - # MODIFY THIS DEPENDING ON YOUR MODELS RELEVANT TASK. 
-        batch = tokenizer(["I went to the <mask> yesterday"]).to(torch_device)
-        desired_mask_result = tokenizer.decode("store")  # update this
-        logits = model(**batch).logits
-        masked_index = (batch.input_ids == self.tokenizer.mask_token_id).nonzero()
-        assert model.num_parameters() == 175e9  # a joke
-        mask_entry_logits = logits[0, masked_index.item(), :]
-        probs = mask_entry_logits.softmax(dim=0)
-        _, predictions = probs.topk(1)
-        self.assertEqual(tokenizer.decode(predictions), desired_mask_result)
-
-    @cached_property
-    def big_model(self):
-        """Cached property means this code will only be executed once."""
-        checkpoint_path = "XXX/bart-large"
-        model = AutoModelForMaskedLM.from_pretrained(checkpoint_path).to(
-            torch_device
-        )  # test whether AutoModel can determine your model_class from checkpoint name
-        if torch_device == "cuda":
-            model.half()
-
-    # optional: do more testing! This will save you time later!
-    @slow
-    def test_that_XXX_can_be_used_in_a_pipeline(self):
-        """We can use self.big_model here without calling __init__ again."""
-        pass
-
-    def test_XXX_loss_doesnt_change_if_you_add_padding(self):
-        pass
-
-    def test_XXX_bad_args(self):
-        pass
+    def test_model_from_pretrained(self):
+        for model_name in {{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
+            model = {{cookiecutter.camelcase_modelname}}Model.from_pretrained(model_name)
+            self.assertIsNotNone(model)
-    def test_XXX_backward_pass_reduces_loss(self):
-        """Test loss/gradients same as reference implementation, for example."""
-        pass
-
-    @require_torch_gpu
-    def test_large_inputs_in_fp16_dont_cause_overflow(self):
-        pass
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/to_replace_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/to_replace_{{cookiecutter.lowercase_modelname}}.py
new file mode 100644
index 00000000000000..1de0b01a06876a
--- /dev/null
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/to_replace_{{cookiecutter.lowercase_modelname}}.py
@@ -0,0 +1,184 @@
+## This file is made so that specific statements may be copied inside existing files. This is useful to copy
+## import statements in __init__.py, or to complete model lists in the AUTO files.
+##
+## It is to be used as such:
+## Put '# To replace in: "FILE_PATH"' in order to indicate that the contents will be copied in the file at path FILE_PATH
+## Put '# Below: "STATEMENT"' in order to copy the contents below **the first occurrence** of that line in the file at FILE_PATH
+## Put '# Replace with:' followed by the lines containing the content to be copied
+## End a statement with '# End.'. If starting a new statement without redefining the FILE_PATH, it will continue pasting
+## content in that file.
+##
+## Put '## COMMENT' to comment on the file.
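+##
+## A rough sketch (hypothetical; not the actual `transformers-cli` implementation) of how
+## one statement is applied: find the first occurrence of the 'Below' anchor in the
+## target file and splice the 'Replace with' lines in right after it:
+##
+##     def apply_statement(path, anchor, replacement_lines):
+##         with open(path) as f:
+##             lines = f.readlines()
+##         idx = next(i for i, line in enumerate(lines) if line.strip() == anchor.strip())
+##         lines[idx + 1 : idx + 1] = [line + "\n" for line in replacement_lines]
+##         with open(path, "w") as f:
+##             f.writelines(lines)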
+ + +# To replace in: "src/transformers/__init__.py" +# Below: "if is_torch_available():" if generating PyTorch +# Replace with: + from .modeling_{{cookiecutter.lowercase_modelname}} import ( + {{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST, + {{cookiecutter.camelcase_modelname}}ForMaskedLM, + {{cookiecutter.camelcase_modelname}}ForMultipleChoice, + {{cookiecutter.camelcase_modelname}}ForQuestionAnswering, + {{cookiecutter.camelcase_modelname}}ForSequenceClassification, + {{cookiecutter.camelcase_modelname}}ForTokenClassification, + {{cookiecutter.camelcase_modelname}}Layer, + {{cookiecutter.camelcase_modelname}}Model, + {{cookiecutter.camelcase_modelname}}PreTrainedModel, + load_tf_weights_in_{{cookiecutter.lowercase_modelname}}, + ) +# End. + +# Below: "if is_tf_available():" if generating TensorFlow +# Replace with: + from .modeling_tf_{{cookiecutter.lowercase_modelname}} import ( + TF_{{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST, + TF{{cookiecutter.camelcase_modelname}}ForMaskedLM, + TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice, + TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering, + TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification, + TF{{cookiecutter.camelcase_modelname}}ForTokenClassification, + TF{{cookiecutter.camelcase_modelname}}Layer, + TF{{cookiecutter.camelcase_modelname}}Model, + TF{{cookiecutter.camelcase_modelname}}PreTrainedModel, + ) +# End. + + +# Below: "from .configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig" +# Replace with: +from .configuration_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.uppercase_modelname}}_PRETRAINED_CONFIG_ARCHIVE_MAP, {{cookiecutter.camelcase_modelname}}Config +# End. + + + +# To replace in: "src/transformers/configuration_auto.py" +# Below: "# Add configs here" +# Replace with: + ("{{cookiecutter.lowercase_modelname}}", {{cookiecutter.camelcase_modelname}}Config), +# End. + +# Below: "# Add archive maps here" +# Replace with: + {{cookiecutter.uppercase_modelname}}_PRETRAINED_CONFIG_ARCHIVE_MAP, +# End. + +# Below: "from .configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig", +# Replace with: +from .configuration_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.uppercase_modelname}}_PRETRAINED_CONFIG_ARCHIVE_MAP, {{cookiecutter.camelcase_modelname}}Config +# End. + +# Below: "# Add full (and cased) model names here" +# Replace with: + ("{{cookiecutter.lowercase_modelname}}", "{{cookiecutter.camelcase_modelname}}"), +# End. + + + +# To replace in: "src/transformers/modeling_auto.py" if generating PyTorch +# Below: "from .configuration_auto import (" +# Replace with: + {{cookiecutter.camelcase_modelname}}Config, +# End. + +# Below: "# Add modeling imports here" +# Replace with: + +from .modeling_{{cookiecutter.lowercase_modelname}} import ( + {{cookiecutter.camelcase_modelname}}ForMaskedLM, + {{cookiecutter.camelcase_modelname}}ForMultipleChoice, + {{cookiecutter.camelcase_modelname}}ForQuestionAnswering, + {{cookiecutter.camelcase_modelname}}ForSequenceClassification, + {{cookiecutter.camelcase_modelname}}ForTokenClassification, + {{cookiecutter.camelcase_modelname}}Model, +) +# End. + +# Below: "# Base model mapping" +# Replace with: + ({{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}Model), +# End. 
+ +# Below: "# Model with LM heads mapping" +# Replace with: + ({{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}ForMaskedLM), +# End. + +# Below: "# Model for Masked LM mapping" +# Replace with: + ({{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}ForMaskedLM), +# End. + +# Below: "# Model for Sequence Classification mapping" +# Replace with: + ({{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}ForSequenceClassification), +# End. + +# Below: "# Model for Question Answering mapping" +# Replace with: + ({{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}ForQuestionAnswering), +# End. + +# Below: "# Model for Token Classification mapping" +# Replace with: + ({{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}ForTokenClassification), +# End. + +# Below: "# Model for Multiple Choice mapping" +# Replace with: + ({{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}ForMultipleChoice), +# End. + + +# To replace in: "src/transformers/modeling_tf_auto.py" if generating TensorFlow +# Below: "from .configuration_auto import (" +# Replace with: + {{cookiecutter.camelcase_modelname}}Config, +# End. + +# Below: "# Add modeling imports here" +# Replace with: + +from .modeling_tf_{{cookiecutter.lowercase_modelname}} import ( + TF{{cookiecutter.camelcase_modelname}}ForMaskedLM, + TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice, + TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering, + TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification, + TF{{cookiecutter.camelcase_modelname}}ForTokenClassification, + TF{{cookiecutter.camelcase_modelname}}Model, +) +# End. + +# Below: "# Base model mapping" +# Replace with: + ({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}Model), +# End. + +# Below: "# Model with LM heads mapping" +# Replace with: + ({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForMaskedLM), +# End. + +# Below: "# Model for Masked LM mapping" +# Replace with: + ({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForMaskedLM), +# End. + +# Below: "# Model for Sequence Classification mapping" +# Replace with: + ({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification), +# End. + +# Below: "# Model for Question Answering mapping" +# Replace with: + ({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering), +# End. + +# Below: "# Model for Token Classification mapping" +# Replace with: + ({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForTokenClassification), +# End. + +# Below: "# Model for Multiple Choice mapping" +# Replace with: + ({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice), +# End. 
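Cookiecutter renders the statements above with Jinja2, substituting every `{{cookiecutter.*}}` placeholder with the values collected when `transformers-cli add-new-model` prompts the user. A minimal sketch of that substitution (not the CLI itself; `BrandNewBert` is a hypothetical model name):

from jinja2 import Template

# Hypothetical context; the real values come from the cookiecutter prompts.
context = {
    "cookiecutter": {
        "lowercase_modelname": "brand_new_bert",
        "camelcase_modelname": "BrandNewBert",
    }
}

line = "    ({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice),"
print(Template(line).render(**context))
# prints:     (BrandNewBertConfig, TFBrandNewBertForMultipleChoice),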
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py new file mode 100644 index 00000000000000..29a762961e8035 --- /dev/null +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py @@ -0,0 +1,342 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for {{cookiecutter.modelname}}.""" + +{%- if cookiecutter.tokenizer_type == "Based on BERT" %} +from .tokenization_bert import BertTokenizer, BertTokenizerFast +from .utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/vocab.txt", + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "{{cookiecutter.checkpoint_identifier}}": 512, +} + + +PRETRAINED_INIT_CONFIGURATION = { + "{{cookiecutter.checkpoint_identifier}}": {"do_lower_case": False}, +} + + +class {{cookiecutter.camelcase_modelname}}Tokenizer(BertTokenizer): + r""" + Construct a {{cookiecutter.modelname}} tokenizer. + + :class:`~transformers.{{cookiecutter.camelcase_modelname}}Tokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end + tokenization: punctuation splitting and wordpiece. + + Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning + parameters. + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + + +class {{cookiecutter.camelcase_modelname}}TokenizerFast(BertTokenizerFast): + r""" + Construct a "fast" {{cookiecutter.modelname}} tokenizer (backed by HuggingFace's `tokenizers` library). + + :class:`~transformers.{{cookiecutter.camelcase_modelname}}TokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs + end-to-end tokenization: punctuation splitting and wordpiece. + + Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning + parameters. 
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION +{%- elif cookiecutter.tokenizer_type == "Standalone" %} +import warnings + +from tokenizers import ByteLevelBPETokenizer + +from .tokenization_utils import AddedToken, PreTrainedTokenizer +from .tokenization_utils_base import BatchEncoding +from .tokenization_utils_fast import PreTrainedTokenizerFast +from typing import List, Optional +from .utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {} + +PRETRAINED_VOCAB_FILES_MAP = {} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "{{cookiecutter.checkpoint_identifier}}": 1024, +} + +class {{cookiecutter.camelcase_modelname}}Tokenizer(PreTrainedTokenizer): + """ + Construct a {{cookiecutter.modelname}} tokenizer. Based on byte-level Byte-Pair-Encoding. + + Args: + vocab_file (:obj:`str`): + Path to the vocabulary file. + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["attention_mask"] + + def __init__( + self, + vocab_file, + unk_token="<|endoftext|>", + bos_token="<|endoftext|>", + eos_token="<|endoftext|>", + **kwargs + ): + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + super().__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs) + + "Initialisation" + + @property + def vocab_size(self): + "Returns vocab size" + + def get_vocab(self): + "Returns vocab as a dict" + + def _tokenize(self, text): + """ Returns a tokenized string. """ + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + + def convert_tokens_to_string(self, tokens): + """ Converts a sequence of tokens (string) in a single string. """ + + def save_vocabulary(self, save_directory): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (:obj:`str`): + The directory in which to save the vocabulary. + + Returns: + :obj:`Tuple(str)`: Paths to the files saved. + """ + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks + by concatenating and adding special tokens. + A {{cookiecutter.modelname}} sequence has the following format: + + - single sequence: `` X `` + - pair of sequences: `` A B `` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. 
+ """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` method. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + {{cookiecutter.modelname}} does not make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of zeros. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): + if "is_pretokenized" in kwargs: + warnings.warn( + "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.", + FutureWarning, + ) + is_split_into_words = kwargs.pop("is_pretokenized") + + add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space) + if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()): + text = " " + text + return (text, kwargs) + +class {{cookiecutter.camelcase_modelname}}TokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" {{cookiecutter.modelname}} tokenizer (backed by HuggingFace's `tokenizers` library). + + Args: + vocab_file (:obj:`str`): + Path to the vocabulary file. 
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["attention_mask"] + + def __init__( + self, + vocab_file, + merges_file, + unk_token="<|endoftext|>", + bos_token="<|endoftext|>", + eos_token="<|endoftext|>", + add_prefix_space=False, + trim_offsets=True, + **kwargs + ): + super().__init__( + ByteLevelBPETokenizer( + vocab_file=vocab_file, + merges_file=merges_file, + add_prefix_space=add_prefix_space, + trim_offsets=trim_offsets, + ), + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + **kwargs, + ) + self.add_prefix_space = add_prefix_space + + def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding: + is_split_into_words = None + if "is_pretokenized" in kwargs: + warnings.warn( + "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.", + FutureWarning, + ) + is_split_into_words = kwargs.pop("is_pretokenized") + + is_split_into_words = kwargs.get("is_split_into_words", False) if is_split_into_words is None else is_split_into_words + return super()._batch_encode_plus(*args, **kwargs) + + def _encode_plus(self, *args, **kwargs) -> BatchEncoding: + is_split_into_words = None + if "is_pretokenized" in kwargs: + warnings.warn( + "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.", + FutureWarning, + ) + is_split_into_words = kwargs.get("is_split_into_words", False) if is_split_into_words is None else is_split_into_words + return super()._encode_plus(*args, **kwargs) + + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id] + if token_ids_1 is None: + return output + + return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id] + + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + {{cookiecutter.modelname}} does not make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of zeros. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + +{% endif %} diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.rst b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.rst new file mode 100644 index 00000000000000..b87d57eb3bb08f --- /dev/null +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.rst @@ -0,0 +1,128 @@ +{{cookiecutter.uppercase_modelname}} +----------------------------------------------------------------------------------------------------------------------- + +Overview +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The {{cookiecutter.modelname}} model was proposed in ` +<>`__ by . 
+
+The abstract from the paper is the following:
+
+**
+
+Tips:
+
+
+
+{{cookiecutter.camelcase_modelname}}Config
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}Config
+    :members:
+
+
+{{cookiecutter.camelcase_modelname}}Tokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}Tokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+{{cookiecutter.camelcase_modelname}}TokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}TokenizerFast
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+{% if "PyTorch" in cookiecutter.generate_tensorflow_and_pytorch -%}
+{{cookiecutter.camelcase_modelname}}Model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}Model
+    :members: forward
+
+
+{{cookiecutter.camelcase_modelname}}ForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}ForMaskedLM
+    :members: forward
+
+
+{{cookiecutter.camelcase_modelname}}ForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}ForSequenceClassification
+    :members: forward
+
+
+{{cookiecutter.camelcase_modelname}}ForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}ForMultipleChoice
+    :members: forward
+
+
+{{cookiecutter.camelcase_modelname}}ForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}ForTokenClassification
+    :members: forward
+
+
+{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
+    :members: forward
+
+{% endif -%}
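Reviewer note: the PyTorch block above and the TensorFlow block below are gated on the `generate_tensorflow_and_pytorch` field of the cookiecutter context. A rough illustration of how such Jinja guards resolve, simplified and without the `{%- -%}` whitespace control used in the real template:

from jinja2 import Template

template = Template(
    "{% if 'PyTorch' in fw %}[PyTorch docs]{% endif %}"
    "{% if 'TensorFlow' in fw %}[TensorFlow docs]{% endif %}"
)
# The three choices offered for this field by cookiecutter.json below:
print(template.render(fw="PyTorch & TensorFlow"))  # [PyTorch docs][TensorFlow docs]
print(template.render(fw="PyTorch"))               # [PyTorch docs]
print(template.render(fw="TensorFlow"))            # [TensorFlow docs]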
{% if "TensorFlow" in cookiecutter.generate_tensorflow_and_pytorch -%}
+
+TF{{cookiecutter.camelcase_modelname}}Model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TF{{cookiecutter.camelcase_modelname}}Model
+    :members: call
+
+
+TF{{cookiecutter.camelcase_modelname}}ForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TF{{cookiecutter.camelcase_modelname}}ForMaskedLM
+    :members: call
+
+
+TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification
+    :members: call
+
+
+TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice
+    :members: call
+
+
+TF{{cookiecutter.camelcase_modelname}}ForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TF{{cookiecutter.camelcase_modelname}}ForTokenClassification
+    :members: call
+
+
+TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
+    :members: call
+
+
+{% endif -%}
diff --git a/templates/adding_a_new_model/cookiecutter.json b/templates/adding_a_new_model/cookiecutter.json
new file mode 100644
index 00000000000000..a5df424a2d3d79
--- /dev/null
+++ b/templates/adding_a_new_model/cookiecutter.json
@@ -0,0 +1,10 @@
+{
+  "modelname": "BrandNewBERT",
+  "uppercase_modelname": "BRAND_NEW_BERT",
+  "lowercase_modelname": "brand_new_bert",
+  "camelcase_modelname": "BrandNewBert",
+  "authors": "The HuggingFace Team",
+  "checkpoint_identifier": "brand-new-bert-base-cased",
+  "tokenizer_type": ["Based on BERT", "Standalone"],
+  "generate_tensorflow_and_pytorch": ["PyTorch & TensorFlow", "PyTorch", "TensorFlow"]
+}
\ No newline at end of file
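Reviewer note: the `tests/*.json` files added later in this patch supply ready-made answers for the context above. A sketch of how such a context is consumed, with illustrative paths; `no_input=True` makes `cookiecutter` read every answer from `extra_context` instead of prompting:

import json

from cookiecutter.main import cookiecutter

# One of the test contexts added below, e.g. tests/standalone.json.
with open("templates/adding_a_new_model/tests/standalone.json") as f:
    context = json.load(f)

cookiecutter(
    "templates/adding_a_new_model",  # directory holding cookiecutter.json
    no_input=True,                   # take all answers from extra_context
    extra_context=context,
)

diff --git a/templates/adding_a_new_model/modeling_xxx.py b/templates/adding_a_new_model/modeling_xxx.py
deleted file mode 100644
index ccdf52a3cb9c07..00000000000000
--- a/templates/adding_a_new_model/modeling_xxx.py
+++ /dev/null
@@ -1,808 +0,0 @@
-# coding=utf-8
-# Copyright 2018 XXX Authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" PyTorch XXX model.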
""" - -#################################################### -# In this template, replace all the XXX (various casings) with your model name -#################################################### - - -import os - -import torch -from torch import nn -from torch.nn import CrossEntropyLoss, MSELoss - -from .configuration_xxx import XxxConfig -from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward -from .modeling_outputs import ( - BaseModelOutputWithPooling, - MaskedLMOutput, - MultipleChoiceModelOutput, - QuestionAnsweringModelOutput, - SequenceClassifierOutput, - TokenClassifierOutput, -) -from .modeling_utils import PreTrainedModel -from .utils import logging - - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "XXXConfig" -_TOKENIZER_FOR_DOC = "XXXTokenizer" - -#################################################### -# This list contrains shortcut names for some of -# the pretrained weights provided with the models -#################################################### -XXX_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "xxx-base-uncased", - "xxx-large-uncased", -] - - -#################################################### -# This is a conversion method from TF 1.0 to PyTorch -# More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28 -#################################################### -def load_tf_weights_in_xxx(model, config, tf_checkpoint_path): - """Load tf checkpoints in a pytorch model.""" - try: - import re - - import numpy as np - import tensorflow as tf - except ImportError: - logger.error( - "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions." - ) - raise - tf_path = os.path.abspath(tf_checkpoint_path) - logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) - # Load weights from TF model - init_vars = tf.train.list_variables(tf_path) - names = [] - arrays = [] - for name, shape in init_vars: - logger.info("Loading TF weight {} with shape {}".format(name, shape)) - array = tf.train.load_variable(tf_path, name) - names.append(name) - arrays.append(array) - - for name, array in zip(names, arrays): - name = name.split("/") - # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v - # which are not required for using pretrained model - if any( - n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] - for n in name - ): - logger.info("Skipping {}".format("/".join(name))) - continue - pointer = model - for m_name in name: - if re.fullmatch(r"[A-Za-z]+_\d+", m_name): - scope_names = re.split(r"_(\d+)", m_name) - else: - scope_names = [m_name] - if scope_names[0] == "kernel" or scope_names[0] == "gamma": - pointer = getattr(pointer, "weight") - elif scope_names[0] == "output_bias" or scope_names[0] == "beta": - pointer = getattr(pointer, "bias") - elif scope_names[0] == "output_weights": - pointer = getattr(pointer, "weight") - elif scope_names[0] == "squad": - pointer = getattr(pointer, "classifier") - else: - try: - pointer = getattr(pointer, scope_names[0]) - except AttributeError: - logger.info("Skipping {}".format("/".join(name))) - continue - if len(scope_names) >= 2: - num = int(scope_names[1]) - pointer = pointer[num] - if m_name[-11:] == "_embeddings": - pointer = getattr(pointer, "weight") - elif m_name == "kernel": - array = np.transpose(array) - try: - assert ( - pointer.shape == array.shape - ), 
f"Pointer and array have mismatched shapes {pointer.shape} and {array.shape}" - except AssertionError as e: - e.args += (pointer.shape, array.shape) - raise - logger.info("Initialize PyTorch weight {}".format(name)) - pointer.data = torch.from_numpy(array) - return model - - -#################################################### -# PyTorch Models are constructed by sub-classing -# - torch.nn.Module for the layers and -# - PreTrainedModel for the models (itself a sub-class of torch.nn.Module) -#################################################### - -#################################################### -# Here is an example of typical layer in a PyTorch model of the library -# The classes are usually identical to the TF 2.0 ones without the 'TF' prefix. -# -# See the conversion methods in modeling_tf_pytorch_utils.py for more details -#################################################### - -XxxAttention = nn.Module - -XxxIntermediate = nn.Module - -XxxOutput = nn.Module - - -class XxxLayer(nn.Module): - def __init__(self, config): - super().__init__() - self.attention = XxxAttention(config) - self.intermediate = XxxIntermediate(config) - self.output = XxxOutput(config) - - def forward(self, hidden_states, attention_mask=None, head_mask=None): - attention_outputs = self.attention(hidden_states, attention_mask, head_mask) - attention_output = attention_outputs[0] - intermediate_output = self.intermediate(attention_output) - layer_output = self.output(intermediate_output, attention_output) - outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them - return outputs - - -#################################################### -# PreTrainedModel is a sub-class of torch.nn.Module -# which take care of loading and saving pretrained weights -# and various common utilities. -# -# Here you just need to specify a few (self-explanatory) -# pointers for your model and the weights initialization -# method if its not fully covered by PreTrainedModel's default method -#################################################### - -XxxLayerNorm = torch.nn.LayerNorm - -XxxEmbeddings = nn.Module - -XxxEncoder = nn.Module - -XxxPooler = nn.Module - - -class XxxPreTrainedModel(PreTrainedModel): - """An abstract class to handle weights initialization and - a simple interface for downloading and loading pretrained models. - """ - - config_class = XxxConfig - load_tf_weights = load_tf_weights_in_xxx - base_model_prefix = "transformer" - - def _init_weights(self, module): - """ Initialize the weights """ - if isinstance(module, (nn.Linear, nn.Embedding)): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - elif isinstance(module, XxxLayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() - - -XXX_START_DOCSTRING = r""" - - The XXX model was proposed in `XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding - `__ by.... - - This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic - methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, - pruning heads etc.) - - This model is also a PyTorch `torch.nn.Module `__ subclass. 
- Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general - usage and behavior. - - Parameters: - config (:class:`~transformers.XxxConfig`): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the configuration. - Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. -""" - -XXX_INPUTS_DOCSTRING = r""" - Inputs: - input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): - Indices of input sequence tokens in the vocabulary. - - Indices can be obtained using :class:`~transformers.XxxTokenizer`. - See :meth:`transformers.PreTrainedTokenizer.encode` and - :meth:`transformers.PreTrainedTokenizer.__call__` for details. - - `What are input IDs? <../glossary.html#input-ids>`__ - attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): - Mask to avoid performing attention on padding token indices. - Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - `What are attention masks? <../glossary.html#attention-mask>`__ - token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): - Segment token indices to indicate first and second portions of the inputs. - Indices are selected in ``[0, 1]``: - - - 0 corresponds to a `sentence A` token, - - 1 corresponds to a `sentence B` token. - - `What are token type IDs? <../glossary.html#token-type-ids>`_ - position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): - Indices of positions of each input sequence tokens in the position embeddings. - Selected in the range ``[0, config.max_position_embeddings - 1]``. - - `What are position IDs? <../glossary.html#position-ids>`_ - head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): - Mask to nullify selected heads of the self-attention modules. - Mask values selected in ``[0, 1]``: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`): - Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert :obj:`input_ids` indices into associated - vectors than the model's internal embedding lookup matrix. - output_attentions (:obj:`bool`, `optional`): - Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned - tensors for more detail. - output_hidden_states (:obj:`bool`, `optional`): - Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for - more detail. - return_dict (:obj:`bool`, `optional`): - Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. 
-""" - - -@add_start_docstrings( - "The bare XXX Model transformer outputting raw hidden-states without any specific head on top.", - XXX_START_DOCSTRING, -) -class XxxModel(XxxPreTrainedModel): - def __init__(self, config): - super().__init__(config) - - self.embeddings = XxxEmbeddings(config) - self.encoder = XxxEncoder(config) - self.pooler = XxxPooler(config) - - self.init_weights() - - def get_input_embeddings(self): - return self.embeddings.word_embeddings - - def set_input_embeddings(self, new_embeddings): - self.embeddings.word_embeddings = new_embeddings - - def _prune_heads(self, heads_to_prune): - """Prunes heads of the model. - heads_to_prune: dict of {layer_num: list of heads to prune in this layer} - See base class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) - - @add_start_docstrings_to_model_forward(XXX_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - @add_code_sample_docstrings( - tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="xxx-base-uncased", - output_type=BaseModelOutputWithPooling, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - input_shape = input_ids.size() - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - device = input_ids.device if input_ids is not None else inputs_embeds.device - - if attention_mask is None: - attention_mask = torch.ones(input_shape, device=device) - if token_type_ids is None: - token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) - - extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device) - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - - ################################## - # Replace this with your model code - embedding_output = self.embeddings( - input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds - ) - encoder_outputs = self.encoder(embedding_output, extended_attention_mask, head_mask=head_mask) - sequence_output = encoder_outputs[0] - pooled_output = self.pooler(sequence_output) - - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - - return BaseModelOutputWithPooling( - last_hidden_state=sequence_output, - pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) - - 
-@add_start_docstrings("""XXX Model with a `language modeling` head on top. """, XXX_START_DOCSTRING) -class XxxForMaskedLM(XxxPreTrainedModel): - def __init__(self, config): - super().__init__(config) - - self.transformer = XxxModel(config) - self.lm_head = nn.Linear(config.n_embd, config.vocab_size) - - self.init_weights() - - def get_output_embeddings(self): - return self.lm_head - - @add_start_docstrings_to_model_forward(XXX_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - @add_code_sample_docstrings( - tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="xxx-base-uncased", - output_type=MaskedLMOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the masked language modeling loss. - Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) - Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels - in ``[0, ..., config.vocab_size]`` - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.transformer( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - prediction_scores = self.lm_head(sequence_output) - - masked_lm_loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() # -100 index = padding token - masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - - return MaskedLMOutput( - loss=masked_lm_loss, - logits=prediction_scores, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """XXX Model transformer with a sequence classification/regression head on top (a linear layer on top of - the pooled output) e.g. for GLUE tasks. """, - XXX_START_DOCSTRING, -) -class XxxForSequenceClassification(XxxPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - - self.transformer = XxxModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, self.config.num_labels) - - self.init_weights() - - @add_start_docstrings_to_model_forward(XXX_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - @add_code_sample_docstrings( - tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="xxx-base-uncased", - output_type=SequenceClassifierOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the sequence classification/regression loss. 
- Indices should be in :obj:`[0, ..., config.num_labels - 1]`. - If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), - If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.transformer( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - pooled_output = outputs[1] - - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - - loss = None - if labels is not None: - if self.num_labels == 1: - # We are doing regression - loss_fct = MSELoss() - loss = loss_fct(logits.view(-1), labels.view(-1)) - else: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """XXX Model with a multiple choice classification head on top (a linear layer on top of - the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, - XXX_START_DOCSTRING, -) -class XxxForMultipleChoice(XxxPreTrainedModel): - def __init__(self, config): - super().__init__(config) - - self.transformer = XxxModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, 1) - - self.init_weights() - - @add_start_docstrings_to_model_forward(XXX_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) - @add_code_sample_docstrings( - tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="xxx-base-uncased", - output_type=MultipleChoiceModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the multiple choice classification loss. - Indices should be in ``[0, ..., num_choices-1]`` where :obj:`num_choices` is the size of the second dimension - of the input tensors. 
(See :obj:`input_ids` above) - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] - - input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None - attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None - token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None - position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None - inputs_embeds = ( - inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) - if inputs_embeds is not None - else None - ) - - outputs = self.transformer( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - pooled_output = outputs[1] - - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - reshaped_logits = logits.view(-1, num_choices) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(reshaped_logits, labels) - - if not return_dict: - output = (reshaped_logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - - return MultipleChoiceModelOutput( - loss=loss, - logits=reshaped_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """XXX Model with a token classification head on top (a linear layer on top of - the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, - XXX_START_DOCSTRING, -) -class XxxForTokenClassification(XxxPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - - self.transformer = XxxModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - self.init_weights() - - @add_start_docstrings_to_model_forward(XXX_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - @add_code_sample_docstrings( - tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="xxx-base-uncased", - output_type=TokenClassifierOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the token classification loss. - Indices should be in ``[0, ..., config.num_labels - 1]``. 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.transformer( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - sequence_output = self.dropout(sequence_output) - logits = self.classifier(sequence_output) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - # Only keep active parts of the loss - if attention_mask is not None: - active_loss = attention_mask.view(-1) == 1 - active_logits = logits.view(-1, self.num_labels) - active_labels = torch.where( - active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) - ) - loss = loss_fct(active_logits, active_labels) - else: - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - - return TokenClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """XXX Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear - layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, - XXX_START_DOCSTRING, -) -class XxxForQuestionAnswering(XxxPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - - self.transformer = XxxModel(config) - self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - - self.init_weights() - - @add_start_docstrings_to_model_forward(XXX_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - @add_code_sample_docstrings( - tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint="xxx-base-uncased", - output_type=QuestionAnsweringModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - start_positions=None, - end_positions=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (:obj:`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. - end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (:obj:`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.transformer( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) - - total_loss = None - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions.clamp_(0, ignored_index) - end_positions.clamp_(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - - return QuestionAnsweringModelOutput( - loss=total_loss, - start_logits=start_logits, - end_logits=end_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) diff --git a/templates/adding_a_new_model/tests/encoder-bert-tokenizer.json b/templates/adding_a_new_model/tests/encoder-bert-tokenizer.json new file mode 100644 index 00000000000000..087ad886e3382f --- /dev/null +++ b/templates/adding_a_new_model/tests/encoder-bert-tokenizer.json @@ -0,0 +1,10 @@ +{ + "modelname": "EncoderBERT", + "uppercase_modelname": "ENCODER_BERT", + "lowercase_modelname": "encoder_bert", + "camelcase_modelname": "EncoderBert", + "authors": "The HuggingFace Team", + "checkpoint_identifier": "brand-new-bert-base-cased", + "tokenizer_type": "Based on BERT", + "generate_tensorflow_and_pytorch": "PyTorch & TensorFlow" +} diff --git a/templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json b/templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json new file mode 100644 index 00000000000000..96c644d9059747 --- /dev/null +++ b/templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json @@ -0,0 +1,10 @@ +{ + "modelname": "PTEncoderBERT", + "uppercase_modelname": "PT_ENCODER_BERT", + "lowercase_modelname": "pt_encoder_bert", + "camelcase_modelname": "PtEncoderBert", + "authors": "The HuggingFace Team", + "checkpoint_identifier": "brand-new-bert-base-cased", + "tokenizer_type": "Based on BERT", + "generate_tensorflow_and_pytorch": "PyTorch" +} diff --git a/templates/adding_a_new_model/tests/standalone.json b/templates/adding_a_new_model/tests/standalone.json new file mode 100644 index 00000000000000..959e17506fcbab --- /dev/null +++ b/templates/adding_a_new_model/tests/standalone.json @@ -0,0 +1,10 @@ +{ + "modelname": "BIEncoderBERT", + "uppercase_modelname": "BI_ENCODER_BERT", + "lowercase_modelname": "bi_encoder_bert", + "camelcase_modelname": "BiEncoderBert", + "authors": "The HuggingFace Team", + "checkpoint_identifier": "bi-brand-new-bert-base-cased", + "tokenizer_type": "Standalone", + 
"generate_tensorflow_and_pytorch": "PyTorch & TensorFlow" +} diff --git a/templates/adding_a_new_model/tests/test_modeling_tf_xxx.py b/templates/adding_a_new_model/tests/test_modeling_tf_xxx.py deleted file mode 100644 index f6b11bf72c7326..00000000000000 --- a/templates/adding_a_new_model/tests/test_modeling_tf_xxx.py +++ /dev/null @@ -1,257 +0,0 @@ -# coding=utf-8 -# Copyright 2018 XXX Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import unittest - -from transformers import XxxConfig, is_tf_available -from transformers.testing_utils import CACHE_DIR, require_tf, slow - -from .test_configuration_common import ConfigTester -from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor - - -if is_tf_available(): - import tensorflow as tf - - from transformers.modeling_tf_xxx import ( - TFXxxForMaskedLM, - TFXxxForMultipleChoice, - TFXxxForQuestionAnswering, - TFXxxForSequenceClassification, - TFXxxForTokenClassification, - TFXxxModel, - ) - - -@require_tf -class TFXxxModelTest(TFModelTesterMixin, unittest.TestCase): - - all_model_classes = ( - ( - TFXxxModel, - TFXxxForMaskedLM, - TFXxxForMultipleChoice, - TFXxxForQuestionAnswering, - TFXxxForSequenceClassification, - TFXxxForTokenClassification, - ) - if is_tf_available() - else () - ) - - class TFXxxModelTester(object): - def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, - ): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.use_labels = use_labels - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices - self.scope = scope - - def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, 
self.seq_length], self.type_vocab_size) - - sequence_labels = None - token_labels = None - choice_labels = None - if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) - - config = XxxConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range, - return_dict=True, - ) - - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = TFXxxModel(config=config) - inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - result = model(inputs) - - inputs = [input_ids, input_mask] - result = model(inputs) - - result = model(input_ids) - - self.parent.assertEqual( - result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size) - ) - self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) - - def create_and_check_for_masked_lm( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = TFXxxForMaskedLM(config=config) - inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - result = model(inputs) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - - def create_and_check_for_sequence_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = TFXxxForSequenceClassification(config=config) - inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - result = model(inputs) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_multiple_choice( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_choices = self.num_choices - model = TFXxxForMultipleChoice(config=config) - multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1)) - multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1)) - multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1)) - inputs = { - "input_ids": multiple_choice_inputs_ids, - "attention_mask": multiple_choice_input_mask, - "token_type_ids": multiple_choice_token_type_ids, - } - result = model(inputs) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - - def create_and_check_for_token_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = TFXxxForTokenClassification(config=config) - inputs = {"input_ids": input_ids, 
"attention_mask": input_mask, "token_type_ids": token_type_ids} - result = model(inputs) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) - - def create_and_check_for_question_answering( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = TFXxxForQuestionAnswering(config=config) - inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - result = model(inputs) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def prepare_config_and_inputs_for_common(self): - config_and_inputs = self.prepare_config_and_inputs() - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = config_and_inputs - inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} - return config, inputs_dict - - def setUp(self): - self.model_tester = TFXxxModelTest.TFXxxModelTester(self) - self.config_tester = ConfigTester(self, config_class=XxxConfig, hidden_size=37) - - def test_config(self): - self.config_tester.run_common_tests() - - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - - def test_for_masked_lm(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - @slow - def test_model_from_pretrained(self): - for model_name in ["xxx-base-uncased"]: - model = TFXxxModel.from_pretrained(model_name, cache_dir=CACHE_DIR) - self.assertIsNotNone(model) diff --git a/templates/adding_a_new_model/tests/test_tokenization_xxx.py b/templates/adding_a_new_model/tests/test_tokenization_xxx.py deleted file mode 100644 index b2e81d75ca4229..00000000000000 --- a/templates/adding_a_new_model/tests/test_tokenization_xxx.py +++ /dev/null @@ -1,68 +0,0 @@ -# coding=utf-8 -# Copyright 2018 XXX Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import os -import unittest - -from transformers.tokenization_bert import VOCAB_FILES_NAMES, XxxTokenizer - -from .test_tokenization_common import TokenizerTesterMixin - - -class XxxTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - - tokenizer_class = XxxTokenizer - - def setUp(self): - super().setUp() - - vocab_tokens = [ - "[UNK]", - "[CLS]", - "[SEP]", - "want", - "##want", - "##ed", - "wa", - "un", - "runn", - "##ing", - ",", - "low", - "lowest", - ] - self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) - with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: - vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) - - def get_tokenizer(self, **kwargs): - return XxxTokenizer.from_pretrained(self.tmpdirname, **kwargs) - - def get_input_output_texts(self, tokenizer): - input_text = "UNwant\u00E9d,running" - output_text = "unwanted, running" - return input_text, output_text - - def test_full_tokenizer(self): - tokenizer = self.tokenizer_class(self.vocab_file) - - tokens = tokenizer.tokenize("UNwant\u00E9d,running") - self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) - self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) - - def test_special_tokens_as_you_expect(self): - """If you are training a seq2seq model that expects a decoder_prefix token make sure it is prepended to decoder_input_ids """ - pass diff --git a/templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json b/templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json new file mode 100644 index 00000000000000..1221c609be3cd0 --- /dev/null +++ b/templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json @@ -0,0 +1,10 @@ +{ + "modelname": "TFEncoderBERT", + "uppercase_modelname": "TF_ENCODER_BERT", + "lowercase_modelname": "tf_encoder_bert", + "camelcase_modelname": "TfEncoderBert", + "authors": "The HuggingFace Team", + "checkpoint_identifier": "brand-new-bert-base-cased", + "tokenizer_type": "Based on BERT", + "generate_tensorflow_and_pytorch": "TensorFlow" +} diff --git a/templates/adding_a_new_model/tokenization_xxx.py b/templates/adding_a_new_model/tokenization_xxx.py deleted file mode 100644 index 2df09603050dc0..00000000000000 --- a/templates/adding_a_new_model/tokenization_xxx.py +++ /dev/null @@ -1,296 +0,0 @@ -# coding=utf-8 -# Copyright 2018 XXX Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" Tokenization class for model XXX.""" - - -import collections -import logging -import os -from typing import List, Optional, Tuple - -from .tokenization_utils import PreTrainedTokenizer - - -logger = logging.getLogger(__name__) - -#################################################### -# In this template, replace all the XXX (various casings) with your model name -#################################################### - -#################################################### -# Mapping from the keyword arguments names of Tokenizer `__init__` -# to file names for serializing Tokenizer instances -#################################################### -VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} - -#################################################### -# Mapping from the keyword arguments names of Tokenizer `__init__` -# to pretrained vocabulary URL for all the model shortcut names. -#################################################### -PRETRAINED_VOCAB_FILES_MAP = { - "vocab_file": { - "xxx-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-vocab.txt", - "xxx-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-vocab.txt", - } -} - -#################################################### -# Mapping from model shortcut names to max length of inputs -#################################################### -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - "xxx-base-uncased": 512, - "xxx-large-uncased": 512, -} - -#################################################### -# Mapping from model shortcut names to a dictionary of additional -# keyword arguments for Tokenizer `__init__`. -# To be used for checkpoint specific configurations. -#################################################### -PRETRAINED_INIT_CONFIGURATION = { - "xxx-base-uncased": {"do_lower_case": True}, - "xxx-large-uncased": {"do_lower_case": True}, -} - - -def load_vocab(vocab_file): - """Loads a vocabulary file into a dictionary.""" - vocab = collections.OrderedDict() - with open(vocab_file, "r", encoding="utf-8") as reader: - tokens = reader.readlines() - for index, token in enumerate(tokens): - token = token.rstrip("\n") - vocab[token] = index - return vocab - - -class XxxTokenizer(PreTrainedTokenizer): - r""" - Constructs a XXX tokenizer. Based on XXX. - - This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. - Users should refer to this superclass for more information regarding those methods. - - Args: - vocab_file (:obj:`str`): - File containing the vocabulary. - do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not to lowercase the input when tokenizing. - do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether ot not to do basic tokenization before WordPiece. - never_split (:obj:`Iterable`, `optional`): - Collection of tokens which will never be split during tokenization. Only has an effect when - :obj:`do_basic_tokenize=True` - unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): - The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences - for sequence classification or for a text and a question for question answering. - It is also used as the last token of a sequence built with special tokens. 
- pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`): - The token used for padding, for example when batching sequences of different lengths. - cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): - The classifier token which is used when doing sequence classification (classification of the whole - sequence instead of per-token classification). It is the first token of the sequence when built with - special tokens. - mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): - The token used for masking values. This is the token used when training this model with masked language - modeling. This is the token which the model will try to predict. - """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - - def __init__( - self, - vocab_file, - do_lower_case=True, - do_basic_tokenize=True, - never_split=None, - unk_token="[UNK]", - sep_token="[SEP]", - pad_token="[PAD]", - cls_token="[CLS]", - mask_token="[MASK]", - tokenize_chinese_chars=True, - **kwargs - ): - super().__init__( - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - **kwargs, - ) - - if not os.path.isfile(vocab_file): - raise ValueError( - "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " - "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) - ) - self.vocab = load_vocab(vocab_file) - self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) - self.do_basic_tokenize = do_basic_tokenize - # Replace and adapt - # if do_basic_tokenize: - # self.basic_tokenizer = BasicTokenizer( - # do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=tokenize_chinese_chars - # ) - # self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) - - @property - def vocab_size(self): - return len(self.vocab) - - def get_vocab(self): - return dict(self.vocab, **self.added_tokens_encoder) - - def _tokenize(self, text): - split_tokens = [] - if self.do_basic_tokenize: - for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - - # If the token is part of the never_split set - if token in self.basic_tokenizer.never_split: - split_tokens.append(token) - else: - split_tokens += self.wordpiece_tokenizer.tokenize(token) - else: - split_tokens = self.wordpiece_tokenizer.tokenize(text) - return split_tokens - - def _convert_token_to_id(self, token): - """ Converts a token (str) in an id using the vocab. """ - return self.vocab.get(token, self.vocab.get(self.unk_token)) - - def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (str) using the vocab.""" - return self.ids_to_tokens.get(index, self.unk_token) - - def convert_tokens_to_string(self, tokens): - """ Converts a sequence of tokens (string) in a single string. """ - out_string = " ".join(tokens).replace(" ##", "").strip() - return out_string - - def build_inputs_with_special_tokens( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks - by concatenating and adding special tokens. 
-        A XXX sequence has the following format:
-
-        - single sequence: ``[CLS] X [SEP]``
-        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
-
-        Args:
-            token_ids_0 (:obj:`List[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + token_ids_1 + sep
-
-    def get_special_tokens_mask(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
-    ) -> List[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
-
-        Args:
-            token_ids_0 (:obj:`List[int]`):
-                List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
-
-        if token_ids_1 is not None:
-            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1]
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task.
-        A XXX sequence pair mask has the following format:
-
-        ::
-
-            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-            | first sequence    | second sequence |
-
-        If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
-
-        Args:
-            token_ids_0 (:obj:`List[int]`):
-                List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
-            sequence(s).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
-        index = 0
-        if os.path.isdir(save_directory):
-            vocab_file = os.path.join(
-                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-            )
-        else:
-            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
-        with open(vocab_file, "w", encoding="utf-8") as writer:
-            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        "Saving vocabulary to {}: vocabulary indices are not consecutive."
-                        " Please check that the vocabulary is not corrupted!".format(vocab_file)
-                    )
-                    index = token_index
-                writer.write(token + "\n")
-                index += 1
-        return (vocab_file,)
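For reviewers skimming the removed template: the three helpers above implement the standard BERT-style packing scheme (``[CLS] X [SEP]`` for a single sequence, ``[CLS] A [SEP] B [SEP]`` for a pair). The snippet below is a minimal, standalone sketch of that scheme so the expected outputs can be checked at a glance; the ``CLS_ID``/``SEP_ID`` values and function names are illustrative placeholders, not part of the library.

```python
# Standalone sketch of the BERT-style special-token scheme the removed
# template implements. CLS_ID/SEP_ID are made-up ids, not values read
# from any real vocabulary file.
from typing import List, Optional

CLS_ID, SEP_ID = 101, 102


def build_inputs(ids_0: List[int], ids_1: Optional[List[int]] = None) -> List[int]:
    # Single sequence: [CLS] X [SEP]; pair of sequences: [CLS] A [SEP] B [SEP]
    if ids_1 is None:
        return [CLS_ID] + ids_0 + [SEP_ID]
    return [CLS_ID] + ids_0 + [SEP_ID] + ids_1 + [SEP_ID]


def special_tokens_mask(ids_0: List[int], ids_1: Optional[List[int]] = None) -> List[int]:
    # 1 marks a special token, 0 marks a sequence token.
    if ids_1 is None:
        return [1] + [0] * len(ids_0) + [1]
    return [1] + [0] * len(ids_0) + [1] + [0] * len(ids_1) + [1]


def token_type_ids(ids_0: List[int], ids_1: Optional[List[int]] = None) -> List[int]:
    # 0s cover "[CLS] A [SEP]", 1s cover "B [SEP]".
    first_segment = len(ids_0) + 2
    if ids_1 is None:
        return [0] * first_segment
    return [0] * first_segment + [1] * (len(ids_1) + 1)


a, b = [7, 8], [9]
assert build_inputs(a, b) == [101, 7, 8, 102, 9, 102]
assert special_tokens_mask(a, b) == [1, 0, 0, 1, 0, 1]
assert token_type_ids(a, b) == [0, 0, 0, 0, 1, 1]
```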
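Likewise, ``save_vocabulary`` above is the inverse of ``load_vocab``: one token per line, with the line number serving as the token id, which is why non-consecutive indices trigger the warning in the listing. A self-contained sketch of that round trip (the ``save_vocab``/``load_vocab`` functions here are local stand-ins, not the transformers implementations):

```python
# Round-trip sketch of the template's vocabulary serialization format.
import collections
import os
import tempfile


def save_vocab(vocab, path):
    # Tokens are written in id order; a gap in the ids would desynchronize
    # line numbers from ids, which is what the template's warning guards against.
    with open(path, "w", encoding="utf-8") as writer:
        for token, _ in sorted(vocab.items(), key=lambda kv: kv[1]):
            writer.write(token + "\n")


def load_vocab(path):
    # Mirror of the template's load_vocab: line number == token id.
    vocab = collections.OrderedDict()
    with open(path, encoding="utf-8") as reader:
        for index, token in enumerate(reader):
            vocab[token.rstrip("\n")] = index
    return vocab


vocab = {"[PAD]": 0, "[UNK]": 1, "hello": 2}
path = os.path.join(tempfile.mkdtemp(), "vocab.txt")
save_vocab(vocab, path)
assert load_vocab(path) == vocab
```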
- " Please check that the vocabulary is not corrupted!".format(vocab_file) - ) - index = token_index - writer.write(token + "\n") - index += 1 - return (vocab_file,) diff --git a/utils/check_repo.py b/utils/check_repo.py index 2cfcfc9a9df391..f79a9c2642b186 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -279,20 +279,9 @@ def check_models_are_documented(module, doc_file): def _get_model_name(module): """ Get the model name for the module defining it.""" splits = module.__name__.split("_") + splits = splits[(2 if splits[1] in ["flax", "tf"] else 1) :] - # Secial case for transfo_xl - if splits[-1] == "xl": - return "_".join(splits[-2:]) - # Special case for xlm_prophetnet - if splits[-1] == "prophetnet" and splits[-2] == "xlm": - return "_".join(splits[-2:]) - # Secial case for xlm_roberta - if splits[-1] == "roberta" and splits[-2] == "xlm": - return "_".join(splits[-2:]) - # Special case for bert_generation - if splits[-1] == "generation" and splits[-2] == "bert": - return "_".join(splits[-2:]) - return splits[-1] + return "_".join(splits) def check_all_models_are_documented():