[tokenizers] Ensure that add_prefix_space is propagated to backend_tokenizer.pre_tokenizer (#35593)

* Ensure that add_prefix_space is propagated to backend_tokenizer.pre_tokenizer in PreTrainedTokenizerFast, rather than relying on subclasses to take care of this (a usage sketch follows this list).

* Simplify setting self.add_prefix_space, ensure pre_tok exists

* Wrap in try-except to catch 'Custom PreTokenizer cannot be serialized'

https://github.com/huggingface/tokenizers/blob/862d1a346a99183017b1eb5ad1aa3133b466784f/bindings/python/src/pre_tokenizers.rs#L672 produces the exception. It is triggered by the roformer tests, as RoFormerTokenizerFast uses a custom PreTokenizer.

* Propagate add_prefix_space in T5TokenizerFast to superclass
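
A quick way to see the centralized behavior (a minimal sketch, not part of the diff; it assumes Hub access and a ByteLevel-based checkpoint such as "gpt2"):

```python
from transformers import AutoTokenizer

# After this change, PreTrainedTokenizerFast itself mirrors the flag onto the
# Rust backend's pre-tokenizer instead of each subclass doing it by hand.
tok = AutoTokenizer.from_pretrained("gpt2", add_prefix_space=True)
print(tok.add_prefix_space)                                  # True
print(tok.backend_tokenizer.pre_tokenizer.add_prefix_space)  # True
```
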
tomaarsen authored Jan 9, 2025
1 parent 46276f9 commit 32e0db8
Showing 16 changed files with 36 additions and 122 deletions.
10 changes: 1 addition & 9 deletions src/transformers/models/bart/tokenization_bart_fast.py
@@ -16,7 +16,7 @@
 import json
 from typing import List, Optional, Tuple

-from tokenizers import pre_tokenizers, processors
+from tokenizers import processors

 from ...tokenization_utils_base import AddedToken, BatchEncoding
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
@@ -157,14 +157,6 @@ def __init__(
             **kwargs,
         )

-        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
-        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
-            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
-            pre_tok_state["add_prefix_space"] = add_prefix_space
-            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
-
-        self.add_prefix_space = add_prefix_space
-
         # the pre_tokenizer is already updated in the GPT2TokenizerFast `__init__`
         tokenizer_component = "post_processor"
         tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None)
10 changes: 1 addition & 9 deletions
@@ -17,7 +17,7 @@
 import json
 from typing import List, Optional, Tuple

-from tokenizers import pre_tokenizers, processors
+from tokenizers import processors

 from ...tokenization_utils_base import AddedToken, BatchEncoding
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
@@ -160,14 +160,6 @@ def __init__(
             **kwargs,
         )

-        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
-        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
-            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
-            pre_tok_state["add_prefix_space"] = add_prefix_space
-            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
-
-        self.add_prefix_space = add_prefix_space
-
         tokenizer_component = "post_processor"
         tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None)
         if tokenizer_component_instance:
10 changes: 0 additions & 10 deletions src/transformers/models/codegen/tokenization_codegen_fast.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 """Tokenization classes for OpenAI GPT."""

-import json
 import re
 from typing import TYPE_CHECKING, List, Optional, Tuple, Union

@@ -29,7 +28,6 @@
     if is_tf_available():
         import tensorflow as tf

-from tokenizers import pre_tokenizers

 from ...tokenization_utils_base import BatchEncoding
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
@@ -137,14 +135,6 @@ def __init__(
                 " so that the fast tokenizer works correctly."
             )

-        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
-        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
-            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
-            pre_tok_state["add_prefix_space"] = add_prefix_space
-            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
-
-        self.add_prefix_space = add_prefix_space
-
     def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
         is_split_into_words = kwargs.get("is_split_into_words", False)
         assert self.add_prefix_space or not is_split_into_words, (
11 changes: 0 additions & 11 deletions src/transformers/models/deberta/tokenization_deberta_fast.py
@@ -14,11 +14,8 @@
 # limitations under the License.
 """Fast Tokenization class for model DeBERTa."""

-import json
 from typing import List, Optional, Tuple

-from tokenizers import pre_tokenizers
-
 from ...tokenization_utils_base import AddedToken, BatchEncoding
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
 from ...utils import logging
@@ -132,14 +129,6 @@ def __init__(
         )
         self.add_bos_token = kwargs.pop("add_bos_token", False)

-        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
-        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
-            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
-            pre_tok_state["add_prefix_space"] = add_prefix_space
-            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
-
-        self.add_prefix_space = add_prefix_space
-
     @property
     def mask_token(self) -> str:
         """
11 changes: 0 additions & 11 deletions src/transformers/models/gpt2/tokenization_gpt2_fast.py
@@ -14,11 +14,8 @@
 # limitations under the License.
 """Tokenization classes for OpenAI GPT."""

-import json
 from typing import Optional, Tuple

-from tokenizers import pre_tokenizers
-
 from ...tokenization_utils_base import BatchEncoding
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
 from ...utils import logging
@@ -109,14 +106,6 @@ def __init__(

         self.add_bos_token = kwargs.pop("add_bos_token", False)

-        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
-        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
-            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
-            pre_tok_state["add_prefix_space"] = add_prefix_space
-            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
-
-        self.add_prefix_space = add_prefix_space
-
     def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
         is_split_into_words = kwargs.get("is_split_into_words", False)
         assert self.add_prefix_space or not is_split_into_words, (
11 changes: 1 addition & 10 deletions src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py
@@ -14,10 +14,9 @@
 # limitations under the License.
 """Tokenization classes for GPTNeoX."""

-import json
 from typing import List, Optional, Tuple

-from tokenizers import pre_tokenizers, processors
+from tokenizers import processors

 from ...tokenization_utils_fast import PreTrainedTokenizerFast
 from ...utils import logging
@@ -122,14 +121,6 @@ def __init__(
         self._add_eos_token = add_eos_token
         self.update_post_processor()

-        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
-        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
-            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
-            pre_tok_state["add_prefix_space"] = add_prefix_space
-            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
-
-        self.add_prefix_space = add_prefix_space
-
     @property
     def add_eos_token(self):
         return self._add_eos_token
10 changes: 1 addition & 9 deletions
@@ -20,7 +20,7 @@
 import json
 from typing import Dict, List, Optional, Tuple, Union

-from tokenizers import pre_tokenizers, processors
+from tokenizers import processors

 from ...tokenization_utils_base import (
     BatchEncoding,
@@ -162,14 +162,6 @@ def __init__(
             **kwargs,
         )

-        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
-        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
-            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
-            pre_tok_state["add_prefix_space"] = add_prefix_space
-            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
-
-        self.add_prefix_space = add_prefix_space
-
         tokenizer_component = "post_processor"
         tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None)
         if tokenizer_component_instance:
10 changes: 1 addition & 9 deletions src/transformers/models/led/tokenization_led_fast.py
@@ -17,7 +17,7 @@
 import json
 from typing import Dict, List, Optional, Tuple, Union

-from tokenizers import pre_tokenizers, processors
+from tokenizers import processors

 from ...tokenization_utils_base import AddedToken, BatchEncoding, EncodedInput
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
@@ -157,14 +157,6 @@ def __init__(
             **kwargs,
         )

-        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
-        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
-            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
-            pre_tok_state["add_prefix_space"] = add_prefix_space
-            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
-
-        self.add_prefix_space = add_prefix_space
-
         # the pre_tokenizer is already updated in the GPT2TokenizerFast `__init__`
         tokenizer_component = "post_processor"
         tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None)
10 changes: 1 addition & 9 deletions
@@ -17,7 +17,7 @@
 import json
 from typing import List, Optional, Tuple

-from tokenizers import pre_tokenizers, processors
+from tokenizers import processors

 from ...tokenization_utils_base import AddedToken, BatchEncoding
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
@@ -155,14 +155,6 @@ def __init__(
             **kwargs,
         )

-        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
-        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
-            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
-            pre_tok_state["add_prefix_space"] = add_prefix_space
-            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
-
-        self.add_prefix_space = add_prefix_space
-
         tokenizer_component = "post_processor"
         tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None)
         if tokenizer_component_instance:
10 changes: 1 addition & 9 deletions src/transformers/models/markuplm/tokenization_markuplm_fast.py
@@ -21,7 +21,7 @@
 from functools import lru_cache
 from typing import Dict, List, Optional, Tuple, Union

-from tokenizers import pre_tokenizers, processors
+from tokenizers import processors

 from ...file_utils import PaddingStrategy, TensorType, add_end_docstrings
 from ...tokenization_utils_base import (
@@ -207,14 +207,6 @@ def __init__(

         self.tags_dict = tags_dict

-        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
-        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
-            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
-            pre_tok_state["add_prefix_space"] = add_prefix_space
-            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
-
-        self.add_prefix_space = add_prefix_space
-
         tokenizer_component = "post_processor"
         tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None)
         if tokenizer_component_instance:
10 changes: 1 addition & 9 deletions src/transformers/models/mvp/tokenization_mvp_fast.py
@@ -16,7 +16,7 @@
 import json
 from typing import List, Optional, Tuple

-from tokenizers import pre_tokenizers, processors
+from tokenizers import processors

 from ...tokenization_utils_base import AddedToken, BatchEncoding
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
@@ -160,14 +160,6 @@ def __init__(
             **kwargs,
         )

-        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
-        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
-            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
-            pre_tok_state["add_prefix_space"] = add_prefix_space
-            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
-
-        self.add_prefix_space = add_prefix_space
-
         # the pre_tokenizer is already updated in the GPT2TokenizerFast `__init__`
         tokenizer_component = "post_processor"
         tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None)
10 changes: 1 addition & 9 deletions src/transformers/models/roberta/tokenization_roberta_fast.py
@@ -17,7 +17,7 @@
 import json
 from typing import List, Optional, Tuple

-from tokenizers import pre_tokenizers, processors
+from tokenizers import processors

 from ...tokenization_utils_base import AddedToken, BatchEncoding
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
@@ -154,14 +154,6 @@ def __init__(
             **kwargs,
         )

-        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
-        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
-            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
-            pre_tok_state["add_prefix_space"] = add_prefix_space
-            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
-
-        self.add_prefix_space = add_prefix_space
-
         tokenizer_component = "post_processor"
         tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None)
         if tokenizer_component_instance:
1 change: 1 addition & 0 deletions src/transformers/models/t5/tokenization_t5_fast.py
@@ -124,6 +124,7 @@ def __init__(
             pad_token=pad_token,
             extra_ids=extra_ids,
             additional_special_tokens=additional_special_tokens,
+            add_prefix_space=add_prefix_space,
             **kwargs,
         )

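
Forwarding the flag through super().__init__ lets the shared handling added to tokenization_utils_fast.py (further down in this diff) record it for T5 as well. A hedged check, assuming sentencepiece is installed and the "t5-small" checkpoint is reachable; setting add_prefix_space on T5 may trigger conversion from the slow tokenizer:

```python
from transformers import T5TokenizerFast

tok = T5TokenizerFast.from_pretrained("t5-small", add_prefix_space=True)
# T5's pre-tokenizer does not expose `add_prefix_space` (per the new test below,
# only ByteLevel does), so only the Python-side attribute is expected to change.
print(tok.add_prefix_space)  # True
```
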
9 changes: 1 addition & 8 deletions src/transformers/models/whisper/tokenization_whisper_fast.py
@@ -22,7 +22,7 @@
 from typing import List, Optional, Tuple

 import numpy as np
-from tokenizers import AddedToken, pre_tokenizers, processors
+from tokenizers import AddedToken, processors

 from ...tokenization_utils_base import BatchEncoding
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
@@ -128,19 +128,12 @@ def __init__(

         self.add_bos_token = kwargs.pop("add_bos_token", False)

-        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
-        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
-            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
-            pre_tok_state["add_prefix_space"] = add_prefix_space
-            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
-
         if normalizer_file is not None:
             with open(normalizer_file, encoding="utf-8") as vocab_handle:
                 self.english_spelling_normalizer = json.load(vocab_handle)
         else:
             self.english_spelling_normalizer = None

-        self.add_prefix_space = add_prefix_space
         self.timestamp_pat = re.compile(r"<\|(\d+\.\d+)\|>")

         self.language = language
13 changes: 13 additions & 0 deletions src/transformers/tokenization_utils_fast.py
@@ -102,6 +102,7 @@ def __init__(self, *args, **kwargs):
         fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
         from_slow = kwargs.pop("from_slow", False)
         added_tokens_decoder = kwargs.pop("added_tokens_decoder", {})
+        self.add_prefix_space = kwargs.get("add_prefix_space", False)

         if from_slow and slow_tokenizer is None and self.slow_tokenizer_class is None:
             raise ValueError(
@@ -206,6 +207,18 @@ def __init__(self, *args, **kwargs):
         if tokens:
             self.add_tokens(tokens)

+        try:
+            pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
+            if pre_tok_state.get("add_prefix_space", self.add_prefix_space) != self.add_prefix_space:
+                pre_tok_class = getattr(pre_tokenizers_fast, pre_tok_state.pop("type"))
+                pre_tok_state["add_prefix_space"] = self.add_prefix_space
+                self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
+        except Exception:
+            # We'll get an error if there is no pre_tokenizer, or if it's a custom pre_tokenizer that can
+            # not be serialized. In those cases, we just ignore the error as there's no pre_tokenizer
+            # for which we need to update the `add_prefix_space` attribute.
+            pass
+
     @property
     def is_fast(self) -> bool:
         return True
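
The except branch above exists because some backends install a custom pre-tokenizer that the Rust bindings refuse to serialize (the RoFormer case mentioned in the commit message). A minimal sketch of that failure mode, assuming the tokenizers Python bindings; the JiebaLikePreTokenizer stand-in is illustrative only:

```python
from tokenizers.pre_tokenizers import PreTokenizer


class JiebaLikePreTokenizer:
    # Stand-in for a custom pre-tokenizer such as the one RoFormerTokenizerFast installs.
    def pre_tokenize(self, pretok):
        pretok.split(lambda i, normalized: [normalized])


custom = PreTokenizer.custom(JiebaLikePreTokenizer())
try:
    custom.__getstate__()  # serialization is attempted here
except Exception as err:
    print(err)  # e.g. "Custom PreTokenizer cannot be serialized"
```
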
12 changes: 12 additions & 0 deletions tests/test_tokenization_common.py
@@ -4684,3 +4684,15 @@ def test_tokenizer_initialization_with_conflicting_key(self):

         with self.assertRaises(AttributeError, msg="conflicts with the method"):
             get_tokenizer_func(get_vocab=True)
+
+    @parameterized.expand([(True,), (False,)])
+    def test_rust_tokenizer_add_prefix_space(self, add_prefix_space):
+        if not self.test_rust_tokenizer:
+            self.skipTest(reason="test_rust_tokenizer is set to False")
+
+        for tokenizer, pretrained_name, _ in self.tokenizers_list:
+            fast_tokenizer = tokenizer.from_pretrained(pretrained_name, add_prefix_space=add_prefix_space)
+            self.assertEqual(fast_tokenizer.add_prefix_space, add_prefix_space)
+            # Only the ByteLevel pre-tokenizer has the `add_prefix_space` attribute, we have to ensure that it's set correctly
+            if hasattr(fast_tokenizer.backend_tokenizer.pre_tokenizer, "add_prefix_space"):
+                self.assertEqual(fast_tokenizer.backend_tokenizer.pre_tokenizer.add_prefix_space, add_prefix_space)
