From 601d4d699cfd57804aa82026a35c32ca219ec3c2 Mon Sep 17 00:00:00 2001 From: Thomas Wolf Date: Fri, 26 Jun 2020 19:48:14 +0200 Subject: [PATCH] [tokenizers] Updates data processors, docstring, examples and model cards to the new API (#5308) * remove references to old API in docstring - update data processors * style * fix tests - better type checking error messages * better type checking * include awesome fix by @LysandreJik for #5310 * updated doc and examples --- README.md | 4 +- docs/README.md | 2 +- docs/source/main_classes/tokenizer.rst | 2 +- docs/source/task_summary.rst | 14 +++--- docs/source/training.rst | 2 +- examples/adversarial/utils_hans.py | 5 ++- examples/longform-qa/eli5_utils.py | 12 ++--- .../emmental/modeling_bert_masked.py | 2 +- .../multiple-choice/utils_multiple_choice.py | 5 ++- examples/seq2seq/run_eval.py | 6 +-- examples/seq2seq/utils.py | 6 +-- .../SparkBeyond/roberta-large-sts-b/README.md | 2 +- model_cards/a-ware/bart-squadv2/README.md | 2 +- .../a-ware/xlmroberta-squadv2/README.md | 2 +- model_cards/google/reformer-enwik8/README.md | 2 +- .../bert-turkish-question-answering/README.md | 2 +- .../README.md | 2 +- .../t5-base-finetuned-squadv2/README.md | 2 +- .../german-sentiment-bert/README.md | 2 +- .../bart-large-finetuned-squadv1/README.md | 2 +- .../README.md | 2 +- model_cards/valhalla/t5-base-squad/README.md | 2 +- notebooks/02-transformers.ipynb | 22 +++++----- notebooks/04-onnx-export.ipynb | 2 +- src/transformers/convert_graph_to_onnx.py | 2 +- .../data/datasets/language_modeling.py | 2 +- src/transformers/data/processors/glue.py | 7 ++- src/transformers/data/processors/squad.py | 14 +++--- src/transformers/file_utils.py | 4 +- src/transformers/modeling_albert.py | 2 +- src/transformers/modeling_bert.py | 2 +- src/transformers/modeling_ctrl.py | 2 +- src/transformers/modeling_distilbert.py | 4 +- src/transformers/modeling_electra.py | 2 +- src/transformers/modeling_flaubert.py | 2 +- src/transformers/modeling_gpt2.py | 2 +- src/transformers/modeling_longformer.py | 4 +- src/transformers/modeling_mobilebert.py | 2 +- src/transformers/modeling_openai.py | 2 +- src/transformers/modeling_reformer.py | 2 +- src/transformers/modeling_retribert.py | 2 +- src/transformers/modeling_roberta.py | 2 +- src/transformers/modeling_tf_albert.py | 2 +- src/transformers/modeling_tf_bert.py | 4 +- src/transformers/modeling_tf_ctrl.py | 2 +- src/transformers/modeling_tf_distilbert.py | 2 +- src/transformers/modeling_tf_electra.py | 2 +- src/transformers/modeling_tf_flaubert.py | 2 +- src/transformers/modeling_tf_gpt2.py | 2 +- src/transformers/modeling_tf_mobilebert.py | 2 +- src/transformers/modeling_tf_openai.py | 2 +- src/transformers/modeling_tf_roberta.py | 2 +- src/transformers/modeling_tf_transfo_xl.py | 2 +- src/transformers/modeling_tf_xlm.py | 2 +- src/transformers/modeling_tf_xlnet.py | 2 +- src/transformers/modeling_transfo_xl.py | 2 +- src/transformers/modeling_xlm.py | 2 +- src/transformers/modeling_xlnet.py | 2 +- src/transformers/pipelines.py | 44 ++++++++----------- src/transformers/tokenization_albert.py | 2 +- src/transformers/tokenization_bart.py | 2 +- src/transformers/tokenization_bert.py | 2 +- src/transformers/tokenization_camembert.py | 2 +- src/transformers/tokenization_roberta.py | 2 +- src/transformers/tokenization_utils.py | 2 +- src/transformers/tokenization_utils_base.py | 36 +++++++++++++++ src/transformers/tokenization_xlm.py | 2 +- src/transformers/tokenization_xlm_roberta.py | 2 +- src/transformers/tokenization_xlnet.py | 2 +- 
.../adding_a_new_model/tokenization_xxx.py | 2 +- tests/test_modeling_bart.py | 9 ++-- tests/test_modeling_t5.py | 5 ++- tests/test_modeling_tf_t5.py | 5 ++- 73 files changed, 180 insertions(+), 138 deletions(-) diff --git a/README.md b/README.md index 9995b6b1849f36..f1f3a25550f48d 100644 --- a/README.md +++ b/README.md @@ -287,8 +287,8 @@ pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf sentence_0 = "This research was consistent with his findings." sentence_1 = "His findings were compatible with this research." sentence_2 = "His findings were not compatible with this research." -inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt') -inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt') +inputs_1 = tokenizer(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt') +inputs_2 = tokenizer(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt') pred_1 = pytorch_model(inputs_1['input_ids'], token_type_ids=inputs_1['token_type_ids'])[0].argmax().item() pred_2 = pytorch_model(inputs_2['input_ids'], token_type_ids=inputs_2['token_type_ids'])[0].argmax().item() diff --git a/docs/README.md b/docs/README.md index 1cfd8e01e4bac8..6da2f78f3abc7e 100644 --- a/docs/README.md +++ b/docs/README.md @@ -167,7 +167,7 @@ Here's an example showcasing everything so far: Indices can be obtained using :class:`transformers.AlbertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ ``` diff --git a/docs/source/main_classes/tokenizer.rst b/docs/source/main_classes/tokenizer.rst index 7a81c936244682..ee12da18475301 100644 --- a/docs/source/main_classes/tokenizer.rst +++ b/docs/source/main_classes/tokenizer.rst @@ -11,7 +11,7 @@ The base classes ``PreTrainedTokenizer`` and ``PreTrainedTokenizerFast`` impleme - adding new tokens to the vocabulary in a way that is independant of the underlying structure (BPE, SentencePiece...), - managing special tokens like mask, beginning-of-sentence, etc tokens (adding them, assigning them to attributes in the tokenizer for easy access and making sure they are not split during tokenization) -``BatchEncoding`` holds the output of the tokenizer's encoding methods (``encode_plus`` and ``batch_encode_plus``) and is derived from a Python dictionary. When the tokenizer is a pure python tokenizer, this class behave just like a standard python dictionary and hold the various model inputs computed by these methodes (``input_ids``, ``attention_mask``...). When the tokenizer is a "Fast" tokenizer (i.e. backed by HuggingFace tokenizers library), this class provides in addition several advanced alignement methods which can be used to map between the original string (character and words) and the token space (e.g. getting the index of the token comprising a given character or the span of characters corresponding to a given token). +``BatchEncoding`` holds the output of the tokenizer's encoding methods (``__call__``, ``encode_plus`` and ``batch_encode_plus``) and is derived from a Python dictionary. When the tokenizer is a pure python tokenizer, this class behave just like a standard python dictionary and hold the various model inputs computed by these methodes (``input_ids``, ``attention_mask``...). When the tokenizer is a "Fast" tokenizer (i.e. 
backed by HuggingFace tokenizers library), this class provides in addition several advanced alignement methods which can be used to map between the original string (character and words) and the token space (e.g. getting the index of the token comprising a given character or the span of characters corresponding to a given token). ``PreTrainedTokenizer`` ~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/task_summary.rst b/docs/source/task_summary.rst index d1157ccccbccc9..0a425b52cf28df 100644 --- a/docs/source/task_summary.rst +++ b/docs/source/task_summary.rst @@ -74,7 +74,7 @@ of each other. The process is the following: with the weights stored in the checkpoint. - Build a sequence from the two sentences, with the correct model-specific separators token type ids and attention masks (:func:`~transformers.PreTrainedTokenizer.encode` and - :func:`~transformers.PreTrainedTokenizer.encode_plus` take care of this) + :func:`~transformers.PreTrainedTokenizer.__call__` take care of this) - Pass this sequence through the model so that it is classified in one of the two available classes: 0 (not a paraphrase) and 1 (is a paraphrase) - Compute the softmax of the result to get probabilities over the classes @@ -95,8 +95,8 @@ of each other. The process is the following: >>> sequence_1 = "Apples are especially bad for your health" >>> sequence_2 = "HuggingFace's headquarters are situated in Manhattan" - >>> paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, return_tensors="pt") - >>> not_paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, return_tensors="pt") + >>> paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="pt") + >>> not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="pt") >>> paraphrase_classification_logits = model(**paraphrase)[0] >>> not_paraphrase_classification_logits = model(**not_paraphrase)[0] @@ -128,8 +128,8 @@ of each other. The process is the following: >>> sequence_1 = "Apples are especially bad for your health" >>> sequence_2 = "HuggingFace's headquarters are situated in Manhattan" - >>> paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, return_tensors="tf") - >>> not_paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, return_tensors="tf") + >>> paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="tf") + >>> not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="tf") >>> paraphrase_classification_logits = model(paraphrase)[0] >>> not_paraphrase_classification_logits = model(not_paraphrase)[0] @@ -221,7 +221,7 @@ Here is an example of question answering using a model and a tokenizer. The proc ... ] >>> for question in questions: - ... inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt") + ... inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt") ... input_ids = inputs["input_ids"].tolist()[0] ... ... text_tokens = tokenizer.convert_ids_to_tokens(input_ids) @@ -263,7 +263,7 @@ Here is an example of question answering using a model and a tokenizer. The proc ... ] >>> for question in questions: - ... inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="tf") + ... inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="tf") ... input_ids = inputs["input_ids"].numpy()[0] ... ... 
text_tokens = tokenizer.convert_ids_to_tokens(input_ids) diff --git a/docs/source/training.rst b/docs/source/training.rst index 5d465f8c37c1a4..c497fb4b601676 100644 --- a/docs/source/training.rst +++ b/docs/source/training.rst @@ -77,7 +77,7 @@ other than bias and layer normalization terms: optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5) Now we can set up a simple dummy training batch using -:func:`~transformers.PreTrainedTokenizer.batch_encode_plus`. This returns a +:func:`~transformers.PreTrainedTokenizer.__call__`. This returns a :func:`~transformers.BatchEncoding` instance which prepares everything we might need to pass to the model. diff --git a/examples/adversarial/utils_hans.py b/examples/adversarial/utils_hans.py index 5058e8b45f92e0..8f230fad9819be 100644 --- a/examples/adversarial/utils_hans.py +++ b/examples/adversarial/utils_hans.py @@ -298,12 +298,13 @@ def hans_convert_examples_to_features( if ex_index % 10000 == 0: logger.info("Writing example %d" % (ex_index)) - inputs = tokenizer.encode_plus( + inputs = tokenizer( example.text_a, example.text_b, add_special_tokens=True, max_length=max_length, - pad_to_max_length=True, + padding="max_length", + truncation=True, return_overflowing_tokens=True, ) diff --git a/examples/longform-qa/eli5_utils.py b/examples/longform-qa/eli5_utils.py index 4f7d7a9d46d037..f6e417a8dd0360 100644 --- a/examples/longform-qa/eli5_utils.py +++ b/examples/longform-qa/eli5_utils.py @@ -193,12 +193,12 @@ def make_qa_retriever_model(model_name="google/bert_uncased_L-8_H-512_A-8", from def make_qa_retriever_batch(qa_list, tokenizer, max_len=64, device="cuda:0"): q_ls = [q for q, a in qa_list] a_ls = [a for q, a in qa_list] - q_toks = tokenizer.batch_encode_plus(q_ls, max_length=max_len, pad_to_max_length=True) + q_toks = tokenizer(q_ls, max_length=max_len, padding="max_length", truncation=True) q_ids, q_mask = ( torch.LongTensor(q_toks["input_ids"]).to(device), torch.LongTensor(q_toks["attention_mask"]).to(device), ) - a_toks = tokenizer.batch_encode_plus(a_ls, max_length=max_len, pad_to_max_length=True) + a_toks = tokenizer(a_ls, max_length=max_len, padding="max_length", truncation=True) a_ids, a_mask = ( torch.LongTensor(a_toks["input_ids"]).to(device), torch.LongTensor(a_toks["attention_mask"]).to(device), @@ -375,12 +375,12 @@ def make_qa_s2s_model(model_name="facebook/bart-large", from_file=None, device=" def make_qa_s2s_batch(qa_list, tokenizer, max_len=64, max_a_len=360, device="cuda:0"): q_ls = [q for q, a in qa_list] a_ls = [a for q, a in qa_list] - q_toks = tokenizer.batch_encode_plus(q_ls, max_length=max_len, pad_to_max_length=True) + q_toks = tokenizer(q_ls, max_length=max_len, padding="max_length", truncation=True) q_ids, q_mask = ( torch.LongTensor(q_toks["input_ids"]).to(device), torch.LongTensor(q_toks["attention_mask"]).to(device), ) - a_toks = tokenizer.batch_encode_plus(a_ls, max_length=min(max_len, max_a_len), pad_to_max_length=True) + a_toks = tokenizer(a_ls, max_length=min(max_len, max_a_len), padding="max_length", truncation=True) a_ids, a_mask = ( torch.LongTensor(a_toks["input_ids"]).to(device), torch.LongTensor(a_toks["attention_mask"]).to(device), @@ -531,7 +531,7 @@ def qa_s2s_generate( # ELI5-trained retrieval model usage ############### def embed_passages_for_retrieval(passages, tokenizer, qa_embedder, max_length=128, device="cuda:0"): - a_toks = tokenizer.batch_encode_plus(passages, max_length=max_length, pad_to_max_length=True) + a_toks = tokenizer(passages, max_length=max_length, padding="max_length", 
truncation=True) a_ids, a_mask = ( torch.LongTensor(a_toks["input_ids"]).to(device), torch.LongTensor(a_toks["attention_mask"]).to(device), @@ -542,7 +542,7 @@ def embed_passages_for_retrieval(passages, tokenizer, qa_embedder, max_length=12 def embed_questions_for_retrieval(q_ls, tokenizer, qa_embedder, device="cuda:0"): - q_toks = tokenizer.batch_encode_plus(q_ls, max_length=128, pad_to_max_length=True) + q_toks = tokenizer(q_ls, max_length=128, padding="max_length", truncation=True) q_ids, q_mask = ( torch.LongTensor(q_toks["input_ids"]).to(device), torch.LongTensor(q_toks["attention_mask"]).to(device), diff --git a/examples/movement-pruning/emmental/modeling_bert_masked.py b/examples/movement-pruning/emmental/modeling_bert_masked.py index a87718fa3a33f6..294db863eefa65 100644 --- a/examples/movement-pruning/emmental/modeling_bert_masked.py +++ b/examples/movement-pruning/emmental/modeling_bert_masked.py @@ -424,7 +424,7 @@ def _init_weights(self, module): Indices can be obtained using :class:`transformers.BertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): diff --git a/examples/multiple-choice/utils_multiple_choice.py b/examples/multiple-choice/utils_multiple_choice.py index 1b18c6ed6321cd..2f6dd040dce05e 100644 --- a/examples/multiple-choice/utils_multiple_choice.py +++ b/examples/multiple-choice/utils_multiple_choice.py @@ -510,12 +510,13 @@ def convert_examples_to_features( else: text_b = example.question + " " + ending - inputs = tokenizer.encode_plus( + inputs = tokenizer( text_a, text_b, add_special_tokens=True, max_length=max_length, - pad_to_max_length=True, + padding="max_length", + truncation=True, return_overflowing_tokens=True, ) if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0: diff --git a/examples/seq2seq/run_eval.py b/examples/seq2seq/run_eval.py index 82699d1f226516..6a0480f36de473 100644 --- a/examples/seq2seq/run_eval.py +++ b/examples/seq2seq/run_eval.py @@ -45,9 +45,9 @@ def generate_summaries_or_translations( for batch in tqdm(list(chunks(examples, batch_size))): if "t5" in model_name: batch = [model.config.prefix + text for text in batch] - batch = tokenizer.batch_encode_plus( - batch, max_length=1024, return_tensors="pt", truncation=True, pad_to_max_length=True - ).to(device) + batch = tokenizer(batch, max_length=1024, return_tensors="pt", truncation=True, padding="max_length").to( + device + ) summaries = model.generate(**batch, **gen_kwargs) dec = tokenizer.batch_decode(summaries, skip_special_tokens=True, clean_up_tokenization_spaces=False) for hypothesis in dec: diff --git a/examples/seq2seq/utils.py b/examples/seq2seq/utils.py index 39cfa9d38056a9..99a2abbe208dac 100644 --- a/examples/seq2seq/utils.py +++ b/examples/seq2seq/utils.py @@ -41,12 +41,12 @@ def encode_file( assert lns, f"found empty file at {data_path}" examples = [] for text in tqdm(lns, desc=f"Tokenizing {data_path.name}"): - tokenized = tokenizer.batch_encode_plus( + tokenized = tokenizer( [text], max_length=max_length, - pad_to_max_length=pad_to_max_length, - add_prefix_space=True, + padding="max_length" if pad_to_max_length else None, truncation=True, + add_prefix_space=True, return_tensors=return_tensors, ) assert 
tokenized.input_ids.shape[1] == max_length diff --git a/model_cards/SparkBeyond/roberta-large-sts-b/README.md b/model_cards/SparkBeyond/roberta-large-sts-b/README.md index 6fa2fd2a63170f..a32cb57486e247 100644 --- a/model_cards/SparkBeyond/roberta-large-sts-b/README.md +++ b/model_cards/SparkBeyond/roberta-large-sts-b/README.md @@ -40,7 +40,7 @@ def roberta_similarity_batches(to_predict): return similarity_scores def similarity_roberta(model, tokenizer, sent_pairs): - batch_token = tokenizer.batch_encode_plus(sent_pairs, pad_to_max_length=True, max_length=500) + batch_token = tokenizer(sent_pairs, padding='max_length', truncation=True, max_length=500) res = model(torch.tensor(batch_token['input_ids']).cuda(), attention_mask=torch.tensor(batch_token["attention_mask"]).cuda()) return res diff --git a/model_cards/a-ware/bart-squadv2/README.md b/model_cards/a-ware/bart-squadv2/README.md index a6e088c7214c7e..164c6a220faa32 100644 --- a/model_cards/a-ware/bart-squadv2/README.md +++ b/model_cards/a-ware/bart-squadv2/README.md @@ -60,7 +60,7 @@ tokenizer = BartTokenizer.from_pretrained('a-ware/bart-squadv2') model = BartForQuestionAnswering.from_pretrained('a-ware/bart-squadv2') question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" -encoding = tokenizer.encode_plus(question, text, return_tensors='pt') +encoding = tokenizer(question, text, return_tensors='pt') input_ids = encoding['input_ids'] attention_mask = encoding['attention_mask'] diff --git a/model_cards/a-ware/xlmroberta-squadv2/README.md b/model_cards/a-ware/xlmroberta-squadv2/README.md index 48e09fccbb059d..eb08cbbc06cd6b 100644 --- a/model_cards/a-ware/xlmroberta-squadv2/README.md +++ b/model_cards/a-ware/xlmroberta-squadv2/README.md @@ -43,7 +43,7 @@ tokenizer = XLMRobertaTokenizer.from_pretrained('a-ware/xlmroberta-squadv2') model = XLMRobertaForQuestionAnswering.from_pretrained('a-ware/xlmroberta-squadv2') question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" -encoding = tokenizer.encode_plus(question, text, return_tensors='pt') +encoding = tokenizer(question, text, return_tensors='pt') input_ids = encoding['input_ids'] attention_mask = encoding['attention_mask'] diff --git a/model_cards/google/reformer-enwik8/README.md b/model_cards/google/reformer-enwik8/README.md index 5086ce80cc52ae..998ac33d9c7e1f 100644 --- a/model_cards/google/reformer-enwik8/README.md +++ b/model_cards/google/reformer-enwik8/README.md @@ -14,7 +14,7 @@ Therefore, this model does not need a tokenizer. 
The following function can inst import torch # Encoding -def encode(list_of_strings, pad_to_max_length=True, pad_token_id=0): +def encode(list_of_strings, pad_token_id=0): max_length = max([len(string) for string in list_of_strings]) # create emtpy tensors diff --git a/model_cards/lserinol/bert-turkish-question-answering/README.md b/model_cards/lserinol/bert-turkish-question-answering/README.md index 5a0a8df9352c51..66a7dc0b244b36 100644 --- a/model_cards/lserinol/bert-turkish-question-answering/README.md +++ b/model_cards/lserinol/bert-turkish-question-answering/README.md @@ -43,7 +43,7 @@ questions = [ ] for question in questions: - inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt") + inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt") input_ids = inputs["input_ids"].tolist()[0] text_tokens = tokenizer.convert_ids_to_tokens(input_ids) diff --git a/model_cards/mrm8488/longformer-base-4096-finetuned-squadv2/README.md b/model_cards/mrm8488/longformer-base-4096-finetuned-squadv2/README.md index 6c81af1821da7d..f789f6a606d20e 100644 --- a/model_cards/mrm8488/longformer-base-4096-finetuned-squadv2/README.md +++ b/model_cards/mrm8488/longformer-base-4096-finetuned-squadv2/README.md @@ -50,7 +50,7 @@ model = AutoModelForQuestionAnswering.from_pretrained("mrm8488/longformer-base-4 text = "Huggingface has democratized NLP. Huge thanks to Huggingface for this." question = "What has Huggingface done ?" -encoding = tokenizer.encode_plus(question, text, return_tensors="pt") +encoding = tokenizer(question, text, return_tensors="pt") input_ids = encoding["input_ids"] # default is local attention everywhere diff --git a/model_cards/mrm8488/t5-base-finetuned-squadv2/README.md b/model_cards/mrm8488/t5-base-finetuned-squadv2/README.md index 7456f498ce9edf..d072a4bcff6586 100644 --- a/model_cards/mrm8488/t5-base-finetuned-squadv2/README.md +++ b/model_cards/mrm8488/t5-base-finetuned-squadv2/README.md @@ -55,7 +55,7 @@ model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-squadv2") def get_answer(question, context): input_text = "question: %s context: %s " % (question, context) - features = tokenizer.batch_encode_plus([input_text], return_tensors='pt') + features = tokenizer([input_text], return_tensors='pt') output = model.generate(input_ids=features['input_ids'], attention_mask=features['attention_mask']) diff --git a/model_cards/oliverguhr/german-sentiment-bert/README.md b/model_cards/oliverguhr/german-sentiment-bert/README.md index 30afb653d85b70..2594aacab5bad1 100644 --- a/model_cards/oliverguhr/german-sentiment-bert/README.md +++ b/model_cards/oliverguhr/german-sentiment-bert/README.md @@ -55,7 +55,7 @@ class SentimentModel(): def predict_sentiment(self, texts: List[str])-> List[str]: texts = [self.clean_text(text) for text in texts] # Add special tokens takes care of adding [CLS], [SEP], ... tokens in the right way for each model. 
- input_ids = self.tokenizer.batch_encode_plus(texts,pad_to_max_length=True, add_special_tokens=True) + input_ids = self.tokenizer(texts, padding=True, truncation=True, add_special_tokens=True) input_ids = torch.tensor(input_ids["input_ids"]) with torch.no_grad(): diff --git a/model_cards/valhalla/bart-large-finetuned-squadv1/README.md b/model_cards/valhalla/bart-large-finetuned-squadv1/README.md index e2baaa49776a71..e53087c1ba97b0 100644 --- a/model_cards/valhalla/bart-large-finetuned-squadv1/README.md +++ b/model_cards/valhalla/bart-large-finetuned-squadv1/README.md @@ -50,7 +50,7 @@ tokenizer = BartTokenizer.from_pretrained('valhalla/bart-large-finetuned-squadv1 model = BartForQuestionAnswering.from_pretrained('valhalla/bart-large-finetuned-squadv1') question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" -encoding = tokenizer.encode_plus(question, text, return_tensors='pt') +encoding = tokenizer(question, text, return_tensors='pt') input_ids = encoding['input_ids'] attention_mask = encoding['attention_mask'] diff --git a/model_cards/valhalla/longformer-base-4096-finetuned-squadv1/README.md b/model_cards/valhalla/longformer-base-4096-finetuned-squadv1/README.md index f7328b1cfec0fc..b6b096bf47cbec 100644 --- a/model_cards/valhalla/longformer-base-4096-finetuned-squadv1/README.md +++ b/model_cards/valhalla/longformer-base-4096-finetuned-squadv1/README.md @@ -33,7 +33,7 @@ model = AutoModelForQuestionAnswering.from_pretrained("valhalla/longformer-base- text = "Huggingface has democratized NLP. Huge thanks to Huggingface for this." question = "What has Huggingface done ?" -encoding = tokenizer.encode_plus(question, text, return_tensors="pt") +encoding = tokenizer(question, text, return_tensors="pt") input_ids = encoding["input_ids"] # default is local attention everywhere diff --git a/model_cards/valhalla/t5-base-squad/README.md b/model_cards/valhalla/t5-base-squad/README.md index 16ec7e1ad1bf9d..18f314759a44e8 100644 --- a/model_cards/valhalla/t5-base-squad/README.md +++ b/model_cards/valhalla/t5-base-squad/README.md @@ -19,7 +19,7 @@ model = AutoModelWithLMHead.from_pretrained("valhalla/t5-base-squad") def get_answer(question, context): input_text = "question: %s context: %s " % (question, context) - features = tokenizer.batch_encode_plus([input_text], return_tensors='pt') + features = tokenizer([input_text], return_tensors='pt') out = model.generate(input_ids=features['input_ids'], attention_mask=features['attention_mask']) diff --git a/notebooks/02-transformers.ipynb b/notebooks/02-transformers.ipynb index 81615b501e4456..636d1c738c6881 100644 --- a/notebooks/02-transformers.ipynb +++ b/notebooks/02-transformers.ipynb @@ -255,7 +255,7 @@ "# tokens_pt = torch.tensor([tokens_ids])\n", "\n", "# This code can be factored into one-line as follow\n", - "tokens_pt2 = tokenizer.encode_plus(\"This is an input example\", return_tensors=\"pt\")\n", + "tokens_pt2 = tokenizer(\"This is an input example\", return_tensors=\"pt\")\n", "\n", "for key, value in tokens_pt2.items():\n", " print(\"{}:\\n\\t{}\".format(key, value))\n", @@ -268,7 +268,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As you can see above, the method `encode_plus` provides a convenient way to generate all the required parameters\n", + "As you can see above, calling the tokenizer provides a convenient way to generate all the required parameters\n", "that will go through the model. 
\n", "\n", "Moreover, you might have noticed it generated some additional tensors: \n", @@ -302,10 +302,10 @@ ], "source": [ "# Single segment input\n", - "single_seg_input = tokenizer.encode_plus(\"This is a sample input\")\n", + "single_seg_input = tokenizer(\"This is a sample input\")\n", "\n", "# Multiple segment input\n", - "multi_seg_input = tokenizer.encode_plus(\"This is segment A\", \"This is segment B\")\n", + "multi_seg_input = tokenizer(\"This is segment A\", \"This is segment B\")\n", "\n", "print(\"Single segment token (str): {}\".format(tokenizer.convert_ids_to_tokens(single_seg_input['input_ids'])))\n", "print(\"Single segment token (int): {}\".format(single_seg_input['input_ids']))\n", @@ -344,9 +344,9 @@ ], "source": [ "# Padding highlight\n", - "tokens = tokenizer.batch_encode_plus(\n", + "tokens = tokenizer(\n", " [\"This is a sample\", \"This is another longer sample text\"], \n", - " pad_to_max_length=True # First sentence will have some PADDED tokens to match second sequence length\n", + " padding=True # First sentence will have some PADDED tokens to match second sequence length\n", ")\n", "\n", "for i in range(2):\n", @@ -405,8 +405,8 @@ ], "source": [ "# transformers generates a ready to use dictionary with all the required parameters for the specific framework.\n", - "input_tf = tokenizer.encode_plus(\"This is a sample input\", return_tensors=\"tf\")\n", - "input_pt = tokenizer.encode_plus(\"This is a sample input\", return_tensors=\"pt\")\n", + "input_tf = tokenizer(\"This is a sample input\", return_tensors=\"tf\")\n", + "input_pt = tokenizer(\"This is a sample input\", return_tensors=\"pt\")\n", "\n", "# Let's compare the outputs\n", "output_tf, output_pt = model_tf(input_tf), model_pt(**input_pt)\n", @@ -464,7 +464,7 @@ "from transformers import DistilBertModel\n", "\n", "bert_distil = DistilBertModel.from_pretrained('distilbert-base-cased')\n", - "input_pt = tokenizer.encode_plus(\n", + "input_pt = tokenizer(\n", " 'This is a sample input to demonstrate performance of distiled models especially inference time', \n", " return_tensors=\"pt\"\n", ")\n", @@ -514,7 +514,7 @@ "de_bert = BertModel.from_pretrained(\"dbmdz/bert-base-german-cased\")\n", "de_tokenizer = BertTokenizer.from_pretrained(\"dbmdz/bert-base-german-cased\")\n", "\n", - "de_input = de_tokenizer.encode_plus(\n", + "de_input = de_tokenizer(\n", " \"Hugging Face ist eine französische Firma mit Sitz in New-York.\",\n", " return_tensors=\"pt\"\n", ")\n", @@ -559,4 +559,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/notebooks/04-onnx-export.ipynb b/notebooks/04-onnx-export.ipynb index 4666097c30a45c..acac251912a2fb 100644 --- a/notebooks/04-onnx-export.ipynb +++ b/notebooks/04-onnx-export.ipynb @@ -248,7 +248,7 @@ "cpu_model = create_model_for_provider(\"onnx/bert-base-cased.onnx\", \"CPUExecutionProvider\")\n", "\n", "# Inputs are provided through numpy array\n", - "model_inputs = tokenizer.encode_plus(\"My name is Bert\", return_tensors=\"pt\")\n", + "model_inputs = tokenizer(\"My name is Bert\", return_tensors=\"pt\")\n", "inputs_onnx = {k: v.cpu().detach().numpy() for k, v in model_inputs.items()}\n", "\n", "# Run the model (None = get all the outputs)\n", diff --git a/src/transformers/convert_graph_to_onnx.py b/src/transformers/convert_graph_to_onnx.py index 5fd0c6c96bbd90..fcbfea1d22d1c9 100644 --- a/src/transformers/convert_graph_to_onnx.py +++ b/src/transformers/convert_graph_to_onnx.py @@ -86,7 +86,7 @@ def build_shape_dict(name: str, tensor, 
is_input: bool, seq_len: int): print("Found {} {} with shape: {}".format("input" if is_input else "output", name, axes)) return axes - tokens = nlp.tokenizer.encode_plus("This is a sample output", return_tensors=framework) + tokens = nlp.tokenizer("This is a sample output", return_tensors=framework) seq_len = tokens.input_ids.shape[-1] outputs = nlp.model(**tokens) if framework == "pt" else nlp.model(tokens) diff --git a/src/transformers/data/datasets/language_modeling.py b/src/transformers/data/datasets/language_modeling.py index 6fae7b55c58593..94988a859b6690 100644 --- a/src/transformers/data/datasets/language_modeling.py +++ b/src/transformers/data/datasets/language_modeling.py @@ -91,7 +91,7 @@ def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: i with open(file_path, encoding="utf-8") as f: lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())] - batch_encoding = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size) + batch_encoding = tokenizer(lines, add_special_tokens=True, truncation=True, max_length=block_size) self.examples = batch_encoding["input_ids"] def __len__(self): diff --git a/src/transformers/data/processors/glue.py b/src/transformers/data/processors/glue.py index 870817a60e092d..8a962404861220 100644 --- a/src/transformers/data/processors/glue.py +++ b/src/transformers/data/processors/glue.py @@ -137,8 +137,11 @@ def label_from_example(example: InputExample) -> Union[int, float, None]: labels = [label_from_example(example) for example in examples] - batch_encoding = tokenizer.batch_encode_plus( - [(example.text_a, example.text_b) for example in examples], max_length=max_length, pad_to_max_length=True, + batch_encoding = tokenizer( + [(example.text_a, example.text_b) for example in examples], + max_length=max_length, + padding="max_length", + truncation=True, ) features = [] diff --git a/src/transformers/data/processors/squad.py b/src/transformers/data/processors/squad.py index 9ca27d7ff4c09a..0c68df6820bf47 100644 --- a/src/transformers/data/processors/squad.py +++ b/src/transformers/data/processors/squad.py @@ -120,7 +120,9 @@ def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_q spans = [] - truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length) + truncated_query = tokenizer.encode( + example.question_text, add_special_tokens=False, truncation=True, max_length=max_query_length + ) sequence_added_tokens = ( tokenizer.max_len - tokenizer.max_len_single_sentence + 1 if "roberta" in str(type(tokenizer)) or "camembert" in str(type(tokenizer)) @@ -131,14 +133,14 @@ def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_q span_doc_tokens = all_doc_tokens while len(spans) * doc_stride < len(all_doc_tokens): - encoded_dict = tokenizer.encode_plus( + encoded_dict = tokenizer.encode_plus( # TODO(thom) update this logic truncated_query if tokenizer.padding_side == "right" else span_doc_tokens, span_doc_tokens if tokenizer.padding_side == "right" else truncated_query, + truncation="only_second" if tokenizer.padding_side == "right" else "only_first", + padding="max_length", max_length=max_seq_length, return_overflowing_tokens=True, - pad_to_max_length=True, stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, - truncation_strategy="only_second" if tokenizer.padding_side == "right" else "only_first", return_token_type_ids=True, ) @@ -176,7 +178,9 @@ 
def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_q spans.append(encoded_dict) - if "overflowing_tokens" not in encoded_dict: + if "overflowing_tokens" not in encoded_dict or ( + "overflowing_tokens" in encoded_dict and len(encoded_dict["overflowing_tokens"]) == 0 + ): break span_doc_tokens = encoded_dict["overflowing_tokens"] diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index b2a3c4f611b865..29e0d01a78c396 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -278,7 +278,7 @@ def docstring_decorator(fn): >>> choice1 = "It is eaten while held in the hand." >>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1 - >>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', pad_to_max_length=True) + >>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', padding=True) >>> outputs = model(**{{k: v.unsqueeze(0) for k,v in encoding.items()}}, labels=labels) # batch size is 1 >>> # the linear classifier still needs to be trained @@ -391,7 +391,7 @@ def docstring_decorator(fn): >>> choice0 = "It is eaten with a fork and a knife." >>> choice1 = "It is eaten while held in the hand." - >>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='tf', pad_to_max_length=True) + >>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='tf', padding=True) >>> inputs = {{k: tf.expand_dims(v, 0) for k, v in encoding.items()}} >>> outputs = model(inputs) # batch size is 1 diff --git a/src/transformers/modeling_albert.py b/src/transformers/modeling_albert.py index cc8390a149251d..731ee4a7ee0fd4 100644 --- a/src/transformers/modeling_albert.py +++ b/src/transformers/modeling_albert.py @@ -402,7 +402,7 @@ def _init_weights(self, module): Indices can be obtained using :class:`transformers.AlbertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index 75a7345c8e9ca9..23d25cfa09847a 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -579,7 +579,7 @@ def _init_weights(self, module): Indices can be obtained using :class:`transformers.BertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_ctrl.py b/src/transformers/modeling_ctrl.py index 3f11109a4de6de..0bceef12598242 100644 --- a/src/transformers/modeling_ctrl.py +++ b/src/transformers/modeling_ctrl.py @@ -251,7 +251,7 @@ def _init_weights(self, module): Indices can be obtained using :class:`transformers.CTRLTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? 
<../glossary.html#input-ids>`__ past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): diff --git a/src/transformers/modeling_distilbert.py b/src/transformers/modeling_distilbert.py index 398b0e29580aa0..cf93c0b1c73e5f 100644 --- a/src/transformers/modeling_distilbert.py +++ b/src/transformers/modeling_distilbert.py @@ -360,7 +360,7 @@ def _init_weights(self, module): Indices can be obtained using :class:`transformers.DistilBertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): @@ -893,7 +893,7 @@ def forward( >>> choice1 = "It is eaten while held in the hand." >>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1 - >>> encoding = tokenizer.batch_encode_plus([[prompt, choice0], [prompt, choice1]], return_tensors='pt', pad_to_max_length=True) + >>> encoding = tokenizer([[prompt, choice0], [prompt, choice1]], return_tensors='pt', padding=True) >>> outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1 >>> # the linear classifier still needs to be trained diff --git a/src/transformers/modeling_electra.py b/src/transformers/modeling_electra.py index e08e4871535090..050d96909d0627 100644 --- a/src/transformers/modeling_electra.py +++ b/src/transformers/modeling_electra.py @@ -186,7 +186,7 @@ class ElectraPreTrainedModel(BertPreTrainedModel): Indices can be obtained using :class:`transformers.ElectraTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_flaubert.py b/src/transformers/modeling_flaubert.py index 1fed5b68532fbd..5e5128a0cba74a 100644 --- a/src/transformers/modeling_flaubert.py +++ b/src/transformers/modeling_flaubert.py @@ -65,7 +65,7 @@ Indices can be obtained using :class:`transformers.BertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_gpt2.py b/src/transformers/modeling_gpt2.py index 8bcb8876a9d9a5..b839cba16227bc 100644 --- a/src/transformers/modeling_gpt2.py +++ b/src/transformers/modeling_gpt2.py @@ -302,7 +302,7 @@ def _init_weights(self, module): Indices can be obtained using :class:`transformers.GPT2Tokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? 
<../glossary.html#input-ids>`__ diff --git a/src/transformers/modeling_longformer.py b/src/transformers/modeling_longformer.py index 7d2a6978b59666..9d869e73a1c5db 100644 --- a/src/transformers/modeling_longformer.py +++ b/src/transformers/modeling_longformer.py @@ -454,7 +454,7 @@ def forward( Indices can be obtained using :class:`transformers.LonmgformerTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): @@ -970,7 +970,7 @@ def forward( >>> model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa") >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" - >>> encoding = tokenizer.encode_plus(question, text, return_tensors="pt") + >>> encoding = tokenizer(question, text, return_tensors="pt") >>> input_ids = encoding["input_ids"] >>> # default is local attention everywhere diff --git a/src/transformers/modeling_mobilebert.py b/src/transformers/modeling_mobilebert.py index 5165d3fa2bc695..4cbaf3a5024566 100644 --- a/src/transformers/modeling_mobilebert.py +++ b/src/transformers/modeling_mobilebert.py @@ -678,7 +678,7 @@ def _init_weights(self, module): Indices can be obtained using :class:`transformers.MobileBertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_openai.py b/src/transformers/modeling_openai.py index 949a6ccd7a7091..e831594cbdc081 100644 --- a/src/transformers/modeling_openai.py +++ b/src/transformers/modeling_openai.py @@ -296,7 +296,7 @@ def _init_weights(self, module): Indices can be obtained using :class:`transformers.OpenAIGPTTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_reformer.py b/src/transformers/modeling_reformer.py index 7b763ebf9601c0..65cfd50345d938 100644 --- a/src/transformers/modeling_reformer.py +++ b/src/transformers/modeling_reformer.py @@ -1487,7 +1487,7 @@ def _init_weights(self, module): Indices can be obtained using :class:`transformers.ReformerTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? 
<../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_retribert.py b/src/transformers/modeling_retribert.py index e0395ceb03285c..8b03a6d0f9e688 100644 --- a/src/transformers/modeling_retribert.py +++ b/src/transformers/modeling_retribert.py @@ -153,7 +153,7 @@ def forward( Indices can be obtained using :class:`transformers.RetriBertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask_query (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_roberta.py b/src/transformers/modeling_roberta.py index 7c3f08294d4054..7a7baea01480ee 100644 --- a/src/transformers/modeling_roberta.py +++ b/src/transformers/modeling_roberta.py @@ -103,7 +103,7 @@ def create_position_ids_from_inputs_embeds(self, inputs_embeds): Indices can be obtained using :class:`transformers.RobertaTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_tf_albert.py b/src/transformers/modeling_tf_albert.py index 9f988c647e7ebc..3c2a7bbf98f89f 100644 --- a/src/transformers/modeling_tf_albert.py +++ b/src/transformers/modeling_tf_albert.py @@ -674,7 +674,7 @@ def call( Indices can be obtained using :class:`transformers.AlbertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional, defaults to :obj:`None`): diff --git a/src/transformers/modeling_tf_bert.py b/src/transformers/modeling_tf_bert.py index 55431d1fbbbd20..0bab7699cef44f 100644 --- a/src/transformers/modeling_tf_bert.py +++ b/src/transformers/modeling_tf_bert.py @@ -664,7 +664,7 @@ class TFBertPreTrainedModel(TFPreTrainedModel): Indices can be obtained using :class:`transformers.BertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): @@ -882,7 +882,7 @@ def call(self, inputs, **kwargs): prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." next_sentence = "The sky is blue due to the shorter wavelength of blue light." 
- encoding = tokenizer.encode_plus(prompt, next_sentence, return_tensors='tf') + encoding = tokenizer(prompt, next_sentence, return_tensors='tf') logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0] assert logits[0][0] < logits[0][1] # the next sentence was random diff --git a/src/transformers/modeling_tf_ctrl.py b/src/transformers/modeling_tf_ctrl.py index 4bc7cf09102102..96c2d0e2ad9ca5 100644 --- a/src/transformers/modeling_tf_ctrl.py +++ b/src/transformers/modeling_tf_ctrl.py @@ -437,7 +437,7 @@ class TFCTRLPreTrainedModel(TFPreTrainedModel): Indices can be obtained using :class:`transformers.CTRLTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): diff --git a/src/transformers/modeling_tf_distilbert.py b/src/transformers/modeling_tf_distilbert.py index bf06335b202943..e9fe573bde6996 100644 --- a/src/transformers/modeling_tf_distilbert.py +++ b/src/transformers/modeling_tf_distilbert.py @@ -545,7 +545,7 @@ class TFDistilBertPreTrainedModel(TFPreTrainedModel): Indices can be obtained using :class:`transformers.BertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_tf_electra.py b/src/transformers/modeling_tf_electra.py index 1aae20b56f35f5..b77c04e4d25ee6 100644 --- a/src/transformers/modeling_tf_electra.py +++ b/src/transformers/modeling_tf_electra.py @@ -339,7 +339,7 @@ def call( Indices can be obtained using :class:`transformers.ElectraTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_tf_flaubert.py b/src/transformers/modeling_tf_flaubert.py index 09e1f00df480a2..d10324de088e89 100644 --- a/src/transformers/modeling_tf_flaubert.py +++ b/src/transformers/modeling_tf_flaubert.py @@ -60,7 +60,7 @@ Indices of input sequence tokens in the vocabulary. Indices can be obtained using :class:`transformers.BertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. 
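For reference, every hunk in this patch applies the same migration: `tokenizer.encode_plus(...)` and `tokenizer.batch_encode_plus(..., pad_to_max_length=True)` become direct calls to the tokenizer with explicit `padding` and `truncation` arguments. The short sketch below is illustrative only and is not part of the patch; the checkpoint name, `max_length` value, and PyTorch tensor backend are arbitrary assumptions, and it presumes a transformers release that ships the unified `__call__` API introduced here.

```python
# Illustrative sketch of the call-pattern change applied throughout this patch.
# Assumes a transformers version with the unified tokenizer __call__ API;
# the checkpoint name and max_length below are arbitrary examples.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Old style (before this patch):
# enc = tokenizer.batch_encode_plus(
#     ["a sample", "another longer sample"], pad_to_max_length=True
# )

# New style: call the tokenizer directly, with padding/truncation spelled out.
enc = tokenizer(
    ["a sample", "another longer sample"],
    padding="max_length",   # or padding=True to pad to the longest sequence in the batch
    truncation=True,
    max_length=32,
    return_tensors="pt",
)
print(enc["input_ids"].shape)  # torch.Size([2, 32])
```

Calling the tokenizer directly returns a `BatchEncoding`, and the same call handles single strings, sentence pairs, and batches, which is why the patch can replace the separate `encode_plus` / `batch_encode_plus` entry points everywhere with one uniform invocation.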
diff --git a/src/transformers/modeling_tf_gpt2.py b/src/transformers/modeling_tf_gpt2.py index f2bc63392a9c4d..5c4bbd27c60256 100644 --- a/src/transformers/modeling_tf_gpt2.py +++ b/src/transformers/modeling_tf_gpt2.py @@ -441,7 +441,7 @@ class TFGPT2PreTrainedModel(TFPreTrainedModel): Indices can be obtained using :class:`transformers.GPT2Tokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): diff --git a/src/transformers/modeling_tf_mobilebert.py b/src/transformers/modeling_tf_mobilebert.py index 3178bccfaf3e14..5e68853a1fbb28 100644 --- a/src/transformers/modeling_tf_mobilebert.py +++ b/src/transformers/modeling_tf_mobilebert.py @@ -794,7 +794,7 @@ class TFMobileBertPreTrainedModel(TFPreTrainedModel): Indices can be obtained using :class:`transformers.MobileBertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_tf_openai.py b/src/transformers/modeling_tf_openai.py index c254e3275149ec..477e63ee590e79 100644 --- a/src/transformers/modeling_tf_openai.py +++ b/src/transformers/modeling_tf_openai.py @@ -405,7 +405,7 @@ class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel): Indices can be obtained using :class:`transformers.GPT2Tokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_tf_roberta.py b/src/transformers/modeling_tf_roberta.py index 751ca17abc4505..4148a69065aa9c 100644 --- a/src/transformers/modeling_tf_roberta.py +++ b/src/transformers/modeling_tf_roberta.py @@ -156,7 +156,7 @@ class TFRobertaPreTrainedModel(TFPreTrainedModel): Indices can be obtained using :class:`transformers.RobertaTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_tf_transfo_xl.py b/src/transformers/modeling_tf_transfo_xl.py index ae9accb255648c..33fd3ba7ff7ea0 100644 --- a/src/transformers/modeling_tf_transfo_xl.py +++ b/src/transformers/modeling_tf_transfo_xl.py @@ -694,7 +694,7 @@ class TFTransfoXLPreTrainedModel(TFPreTrainedModel): Indices can be obtained using :class:`transformers.TransfoXLTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? 
<../glossary.html#input-ids>`__ mems (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers`): diff --git a/src/transformers/modeling_tf_xlm.py b/src/transformers/modeling_tf_xlm.py index 007bb572e68084..e912891c212df9 100644 --- a/src/transformers/modeling_tf_xlm.py +++ b/src/transformers/modeling_tf_xlm.py @@ -555,7 +555,7 @@ def dummy_inputs(self): Indices can be obtained using :class:`transformers.BertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_tf_xlnet.py b/src/transformers/modeling_tf_xlnet.py index 80ee28fc789661..3ec96593235f09 100644 --- a/src/transformers/modeling_tf_xlnet.py +++ b/src/transformers/modeling_tf_xlnet.py @@ -778,7 +778,7 @@ class TFXLNetPreTrainedModel(TFPreTrainedModel): Indices can be obtained using :class:`transformers.XLNetTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_transfo_xl.py b/src/transformers/modeling_transfo_xl.py index 2e39ef025c8386..8abd643da26c14 100644 --- a/src/transformers/modeling_transfo_xl.py +++ b/src/transformers/modeling_transfo_xl.py @@ -609,7 +609,7 @@ def _resize_cutoffs(self, new_num_tokens, new_emb_size, new_embedding_shapes, la Indices can be obtained using :class:`transformers.TransfoXLTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ mems (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers`): diff --git a/src/transformers/modeling_xlm.py b/src/transformers/modeling_xlm.py index 03a1ebe2371a25..2c91e834b5734a 100644 --- a/src/transformers/modeling_xlm.py +++ b/src/transformers/modeling_xlm.py @@ -259,7 +259,7 @@ def _init_weights(self, module): Indices can be obtained using :class:`transformers.BertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/modeling_xlnet.py b/src/transformers/modeling_xlnet.py index f6dcd679eb3877..0b67bafc3b7808 100644 --- a/src/transformers/modeling_xlnet.py +++ b/src/transformers/modeling_xlnet.py @@ -573,7 +573,7 @@ def _init_weights(self, module): Indices can be obtained using :class:`transformers.BertTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + :func:`transformers.PreTrainedTokenizer.__call__` for details. `What are input IDs? 
<../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`): diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py index 0c1e3990014968..0ed72371a238af 100755 --- a/src/transformers/pipelines.py +++ b/src/transformers/pipelines.py @@ -456,17 +456,14 @@ def ensure_tensor_on_device(self, **inputs): """ return {name: tensor.to(self.device) for name, tensor in inputs.items()} - def _parse_and_tokenize(self, *args, pad_to_max_length=True, add_special_tokens=True, **kwargs): + def _parse_and_tokenize(self, *args, padding=True, add_special_tokens=True, **kwargs): """ Parse arguments and tokenize """ # Parse arguments inputs = self._args_parser(*args, **kwargs) - inputs = self.tokenizer.batch_encode_plus( - inputs, - add_special_tokens=add_special_tokens, - return_tensors=self.framework, - pad_to_max_length=pad_to_max_length, + inputs = self.tokenizer( + inputs, add_special_tokens=add_special_tokens, return_tensors=self.framework, padding=padding, ) return inputs @@ -623,10 +620,10 @@ def __call__( with self.device_placement(): if self.model.__class__.__name__ in ["XLNetLMHeadModel", "TransfoXLLMHeadModel"]: inputs = self._parse_and_tokenize( - self.PADDING_TEXT + prompt_text, pad_to_max_length=False, add_special_tokens=False + self.PADDING_TEXT + prompt_text, padding=False, add_special_tokens=False ) else: - inputs = self._parse_and_tokenize(prompt_text, pad_to_max_length=False, add_special_tokens=False) + inputs = self._parse_and_tokenize(prompt_text, padding=False, add_special_tokens=False) # set input_ids to None to allow empty prompt if inputs["input_ids"].shape[-1] == 0: @@ -920,11 +917,8 @@ def __call__(self, *args, **kwargs): # Manage correct placement of the tensors with self.device_placement(): - tokens = self.tokenizer.encode_plus( - sentence, - return_attention_mask=False, - return_tensors=self.framework, - max_length=self.tokenizer.max_len, + tokens = self.tokenizer( + sentence, return_attention_mask=False, return_tensors=self.framework, truncation=True, ) # Forward @@ -1187,12 +1181,12 @@ def __call__(self, *args, **kwargs): examples = self._args_parser(*args, **kwargs) features_list = [ squad_convert_examples_to_features( - [example], - self.tokenizer, - kwargs["max_seq_len"], - kwargs["doc_stride"], - kwargs["max_question_len"], - False, + examples=[example], + tokenizer=self.tokenizer, + max_seq_length=kwargs["max_seq_len"], + doc_stride=kwargs["doc_stride"], + max_query_length=kwargs["max_question_len"], + is_training=False, tqdm_enabled=False, ) for example in examples @@ -1431,11 +1425,11 @@ def __call__( ), "Please make sure that the tokenizer has a pad_token_id when using a batch input" documents = ([prefix + document for document in documents[0]],) - pad_to_max_length = True + padding = True elif isinstance(documents[0], str): documents = (prefix + documents[0],) - pad_to_max_length = False + padding = False else: raise ValueError( " `documents[0]`: {} have the wrong format. 
The should be either of type `str` or type `list`".format( @@ -1444,7 +1438,7 @@ def __call__( ) with self.device_placement(): - inputs = self._parse_and_tokenize(*documents, pad_to_max_length=pad_to_max_length) + inputs = self._parse_and_tokenize(*documents, padding=padding) if self.framework == "pt": inputs = self.ensure_tensor_on_device(**inputs) @@ -1549,11 +1543,11 @@ def __call__( self.tokenizer.pad_token_id is not None ), "Please make sure that the tokenizer has a pad_token_id when using a batch input" args = ([prefix + text for text in args[0]],) - pad_to_max_length = True + padding = True elif isinstance(args[0], str): args = (prefix + args[0],) - pad_to_max_length = False + padding = False else: raise ValueError( " `documents[0]`: {} have the wrong format. The should be either of type `str` or type `list`".format( @@ -1562,7 +1556,7 @@ def __call__( ) with self.device_placement(): - inputs = self._parse_and_tokenize(*args, pad_to_max_length=pad_to_max_length) + inputs = self._parse_and_tokenize(*args, padding=padding) if self.framework == "pt": inputs = self.ensure_tensor_on_device(**inputs) diff --git a/src/transformers/tokenization_albert.py b/src/transformers/tokenization_albert.py index f33ce15f78c12f..0b7ca36e8135d9 100644 --- a/src/transformers/tokenization_albert.py +++ b/src/transformers/tokenization_albert.py @@ -263,7 +263,7 @@ def get_special_tokens_mask( ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + special tokens using the tokenizer ``prepare_for_model`` method. Args: token_ids_0 (:obj:`List[int]`): diff --git a/src/transformers/tokenization_bart.py b/src/transformers/tokenization_bart.py index 78d5a1474f0cd4..e3157e9eecc86d 100644 --- a/src/transformers/tokenization_bart.py +++ b/src/transformers/tokenization_bart.py @@ -125,7 +125,7 @@ def _convert_id_to_token(self, index): return self.sp_model.IdToPiece(index - self.fairseq_offset) def set_lang(self, lang: str) -> None: - """Set the current language code in order to call batch_encode_plus properly.""" + """Set the current language code in order to call tokenizer properly.""" self.cur_lang_code = self.lang_code_to_id[lang] def prepare_translation_batch( diff --git a/src/transformers/tokenization_bert.py b/src/transformers/tokenization_bert.py index c503e8504f5328..b168fe96f45c70 100644 --- a/src/transformers/tokenization_bert.py +++ b/src/transformers/tokenization_bert.py @@ -263,7 +263,7 @@ def get_special_tokens_mask( ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + special tokens using the tokenizer ``prepare_for_model`` method. Args: token_ids_0 (:obj:`List[int]`): diff --git a/src/transformers/tokenization_camembert.py b/src/transformers/tokenization_camembert.py index 5b8fe7ab001238..84ecd2fef3dd1a 100644 --- a/src/transformers/tokenization_camembert.py +++ b/src/transformers/tokenization_camembert.py @@ -171,7 +171,7 @@ def get_special_tokens_mask( ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + special tokens using the tokenizer ``prepare_for_model`` method. 
Args: token_ids_0 (:obj:`List[int]`): diff --git a/src/transformers/tokenization_roberta.py b/src/transformers/tokenization_roberta.py index 19b482976c29ef..41abab03ce471a 100644 --- a/src/transformers/tokenization_roberta.py +++ b/src/transformers/tokenization_roberta.py @@ -193,7 +193,7 @@ def get_special_tokens_mask( ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + special tokens using the tokenizer ``prepare_for_model`` method. Args: token_ids_0 (:obj:`List[int]`): diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py index 4a69ecc7255b1e..15fb58bff0b324 100644 --- a/src/transformers/tokenization_utils.py +++ b/src/transformers/tokenization_utils.py @@ -820,7 +820,7 @@ def get_special_tokens_mask( ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + special tokens using the tokenizer ``prepare_for_model`` method. Args: token_ids_0: list of ids (must not contain special tokens) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 13e24bf02c23d3..db9b4e45e7f997 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1583,6 +1583,42 @@ def __call__( If the sequences are provided as list of strings (pretokenized), you must set `is_pretokenized=True` (to lift the ambiguity with a batch of sequences) """ + # Input type checking for clearer error + assert isinstance(text, str) or ( + isinstance(text, (list, tuple)) + and ( + len(text) == 0 + or ( + isinstance(text[0], str) + or (isinstance(text[0], (list, tuple)) and (len(text[0]) == 0 or isinstance(text[0][0], str))) + ) + ) + ), ( + "text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) " + "or `List[List[str]]` (batch of pretokenized examples)." + ) + + assert ( + text_pair is None + or isinstance(text_pair, str) + or ( + isinstance(text_pair, (list, tuple)) + and ( + len(text_pair) == 0 + or ( + isinstance(text_pair[0], str) + or ( + isinstance(text_pair[0], (list, tuple)) + and (len(text_pair[0]) == 0 or isinstance(text_pair[0][0], str)) + ) + ) + ) + ) + ), ( + "text_pair input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) " + "or `List[List[str]]` (batch of pretokenized examples)." + ) + is_batched = bool( (not is_pretokenized and isinstance(text, (list, tuple))) or (is_pretokenized and isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))) diff --git a/src/transformers/tokenization_xlm.py b/src/transformers/tokenization_xlm.py index 0ca13344acd082..0a62468ea504a5 100644 --- a/src/transformers/tokenization_xlm.py +++ b/src/transformers/tokenization_xlm.py @@ -882,7 +882,7 @@ def get_special_tokens_mask( ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + special tokens using the tokenizer ``prepare_for_model`` method.
Args: token_ids_0 (:obj:`List[int]`): diff --git a/src/transformers/tokenization_xlm_roberta.py b/src/transformers/tokenization_xlm_roberta.py index f2f5f76c79a0bf..787be9b06468a1 100644 --- a/src/transformers/tokenization_xlm_roberta.py +++ b/src/transformers/tokenization_xlm_roberta.py @@ -206,7 +206,7 @@ def get_special_tokens_mask( ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + special tokens using the tokenizer ``prepare_for_model`` method. Args: token_ids_0 (:obj:`List[int]`): diff --git a/src/transformers/tokenization_xlnet.py b/src/transformers/tokenization_xlnet.py index 93ef2d2bb7b1b1..c3689b21d4b97a 100644 --- a/src/transformers/tokenization_xlnet.py +++ b/src/transformers/tokenization_xlnet.py @@ -267,7 +267,7 @@ def get_special_tokens_mask( ) -> List[int]: """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + special tokens using the tokenizer ``prepare_for_model`` method. Args: token_ids_0 (:obj:`List[int]`): diff --git a/templates/adding_a_new_model/tokenization_xxx.py b/templates/adding_a_new_model/tokenization_xxx.py index 6a96b0ff9d7318..91dc7f8c0b90d5 100644 --- a/templates/adding_a_new_model/tokenization_xxx.py +++ b/templates/adding_a_new_model/tokenization_xxx.py @@ -171,7 +171,7 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): """ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + special tokens using the tokenizer ``prepare_for_model`` method. Args: token_ids_0: list of ids (must not contain special tokens) diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py index 3cafb3a40dea74..209abbb211a44d 100644 --- a/tests/test_modeling_bart.py +++ b/tests/test_modeling_bart.py @@ -626,9 +626,9 @@ def test_xsum_summarization_same_as_fairseq(self): PGE_ARTICLE = """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow.""" EXPECTED_SUMMARY = "California's largest power company has begun shutting off power to tens of thousands of homes and businesses in the state."
- dct = tok.batch_encode_plus([PGE_ARTICLE], max_length=1024, pad_to_max_length=True, return_tensors="pt",).to( - torch_device - ) + dct = tok.batch_encode_plus( + [PGE_ARTICLE], max_length=1024, padding="max_length", truncation=True, return_tensors="pt", + ).to(torch_device) hypotheses_batch = model.generate( input_ids=dct["input_ids"], @@ -672,7 +672,8 @@ def test_cnn_summarization_same_as_fairseq(self): dct = tok.batch_encode_plus( [FRANCE_ARTICLE, SHORTER_ARTICLE, IRAN_ARTICLE, ARTICLE_SUBWAY], max_length=1024, - pad_to_max_length=True, + padding="max_length", + truncation=True, return_tensors="pt", ) diff --git a/tests/test_modeling_t5.py b/tests/test_modeling_t5.py index a5d7a1a4d2a215..39254c4f47a50c 100644 --- a/tests/test_modeling_t5.py +++ b/tests/test_modeling_t5.py @@ -375,10 +375,11 @@ def test_summarization(self): summarization_config = task_specific_config.get("summarization", {}) model.config.update(summarization_config) - dct = tok.batch_encode_plus( + dct = tok( [model.config.prefix + x for x in [FRANCE_ARTICLE, SHORTER_ARTICLE, IRAN_ARTICLE, ARTICLE_SUBWAY]], max_length=512, - pad_to_max_length=True, + padding="max_length", + truncation=True, return_tensors="pt", ) self.assertEqual(512, dct["input_ids"].shape[1]) diff --git a/tests/test_modeling_tf_t5.py b/tests/test_modeling_tf_t5.py index 28cf60e461deb9..2f996fc90cc0a6 100644 --- a/tests/test_modeling_tf_t5.py +++ b/tests/test_modeling_tf_t5.py @@ -276,10 +276,11 @@ def test_summarization(self): summarization_config = task_specific_config.get("summarization", {}) model.config.update(summarization_config) - dct = tok.batch_encode_plus( + dct = tok( [model.config.prefix + x for x in [FRANCE_ARTICLE, SHORTER_ARTICLE, IRAN_ARTICLE, ARTICLE_SUBWAY]], max_length=512, - pad_to_max_length=True, + padding="max_length", + truncation=True, return_tensors="tf", ) self.assertEqual(512, dct["input_ids"].shape[1])
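
For reference, a minimal sketch of the migration pattern applied throughout the examples and tests above: call the tokenizer directly instead of `encode_plus`/`batch_encode_plus`, and replace `pad_to_max_length=True` with explicit `padding` and `truncation` arguments. The checkpoint name and sentences are placeholders, not taken from this diff; the older methods keep working but are superseded by `__call__`.

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # Old style: tokenizer.batch_encode_plus(sentences, max_length=32,
    #                                        pad_to_max_length=True, return_tensors="pt")
    enc = tokenizer(
        ["a first sentence", "a second, noticeably longer sentence"],
        max_length=32,
        padding="max_length",  # pad every example to max_length
        truncation=True,       # clip anything longer than max_length
        return_tensors="pt",
    )
    print(enc["input_ids"].shape)  # torch.Size([2, 32])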
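
The `padding` argument used in the pipeline changes above takes the same values as the tokenizer itself: `True` (or `"longest"`) pads dynamically to the longest sequence in the batch, `"max_length"` pads up to `max_length`, and `False` adds no padding, which is why the single-input branches now pass `padding=False`. A short illustration, reusing the placeholder tokenizer from the previous sketch:

    batch = tokenizer(["short", "a noticeably longer input sentence"], padding=True, return_tensors="pt")
    print(batch["input_ids"].shape)  # both rows padded to the longest sequence in the batch
    single = tokenizer("short", padding=False)  # plain Python lists, no padding added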
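
One pipeline hunk above also drops the explicit `max_length=self.tokenizer.max_len` in favor of `truncation=True`; with no `max_length` given, truncation falls back to the model's maximum input length, so the behavior should be equivalent. A hedged standalone version (the repeated string is just a placeholder long input):

    tokens = tokenizer("some very long text " * 1000, truncation=True, return_tensors="pt")
    print(tokens["input_ids"].shape)  # clipped to the model maximum (512 for the BERT checkpoint above)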
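
Finally, the type checks added to `__call__` in tokenization_utils_base.py make the accepted input shapes explicit and fail fast with a readable message. A quick sketch of the three accepted forms (the strings are placeholders):

    tokenizer("a single sentence")                                    # str
    tokenizer(["a batch", "of two sentences"])                        # List[str]
    tokenizer([["already", "split", "words"]], is_pretokenized=True)  # List[List[str]]
    # tokenizer(42) now raises an AssertionError ("text input must be of type `str` ...")
    # instead of failing deeper inside the encoding code.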