Merge pull request huggingface#4 from DaryaTereshchenko/changes_tokenizer

fix additional tokens list and run tests
DaryaTereshchenko authored Nov 17, 2024
2 parents 2603cf8 + 17c1198 commit 8bedcb3
Showing 1 changed file with 4 additions and 4 deletions.
src/transformers/models/prism/tokenization_prism.py (8 changes: 4 additions & 4 deletions)
@@ -169,11 +169,9 @@ def __init__(
         fairseq_language_code = FAIRSEQ_LANGUAGE_CODES[language_codes]
         self.lang_code_to_token = {lang_code: f"<{lang_code}>" for lang_code in fairseq_language_code}
 
+        language_tokens = [self.get_lang_token(lang_code) for lang_code in fairseq_language_code]
         additional_special_tokens = kwargs.pop("additional_special_tokens", [])
-        for lang_code in fairseq_language_code:
-            token = self.get_lang_token(lang_code)
-            if token not in additional_special_tokens and lang_code not in str(token) not in self.added_tokens_encoder:
-                additional_special_tokens.append(token)
+        self.additional_special_tokens = language_tokens + additional_special_tokens
 
         self.vocab_file = vocab_file
         self.encoder = load_json(vocab_file)
@@ -215,6 +213,8 @@ def __init__(
             num_madeup_words=num_madeup_words,
             **kwargs,
         )
+
+        self.special_tokens_map['additional_special_tokens'] = self.additional_special_tokens
         self.set_src_lang_special_tokens(self._src_lang)
 
     @property

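For context, the sketch below is a minimal standalone reproduction of what the change does to the list of additional special tokens. It is not the fork's actual PrismTokenizer class: the three-language code list and the stand-ins for self.added_tokens_encoder and the kwargs argument are toy assumptions, and get_lang_token is assumed to return the angle-bracketed code, as the lang_code_to_token mapping in the diff suggests. On that reading, the deleted loop's chained `not in` condition can never be true, so no language tokens were ever appended; the added lines build the token list directly and concatenate it with whatever the caller passed in.

```python
# Toy reproduction of the change (not the fork's PrismTokenizer; names mirror
# the diff, but the surrounding class is replaced by plain variables).
fairseq_language_code = ["en", "de", "fr"]  # assumption: a tiny stand-in list
added_tokens_encoder = {}                   # stand-in for self.added_tokens_encoder


def get_lang_token(lang_code):
    # Assumed to match lang_code_to_token in the diff: "<en>", "<de>", ...
    return f"<{lang_code}>"


# Old construction (the deleted loop). The chained comparison parses as:
#   (token not in old_tokens)
#   and (lang_code not in str(token))
#   and (str(token) not in added_tokens_encoder)
# Since token is f"<{lang_code}>", `lang_code not in str(token)` is always
# False, so the loop never appends anything.
old_tokens = []
for lang_code in fairseq_language_code:
    token = get_lang_token(lang_code)
    if token not in old_tokens and lang_code not in str(token) not in added_tokens_encoder:
        old_tokens.append(token)
print(old_tokens)  # []

# New construction (the added lines): build the language tokens directly and
# prepend them to whatever the caller passed as additional_special_tokens.
user_supplied = []  # stand-in for kwargs.pop("additional_special_tokens", [])
language_tokens = [get_lang_token(lang_code) for lang_code in fairseq_language_code]
new_tokens = language_tokens + user_supplied
print(new_tokens)  # ['<en>', '<de>', '<fr>']
```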