Skip to content

Commit

Permalink
Update to use the 99th-percentile cutoff for the maximum sequence length
Browse files: browse the repository at this point in the history
  • Loading branch information
Richard5678 committed Dec 19, 2024
1 parent cb4b135 commit 3b4ef43
Showing 1 changed file with 4 additions and 2 deletions.
6 changes: 4 additions & 2 deletions examples/machine_translation_distributed.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,8 +279,10 @@ def main():
tokenizer_en = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer_zh = AutoTokenizer.from_pretrained("bert-base-chinese")

# max_seq_length = get_max_seq_length(percentile=95, plot=True)
max_seq_length = 71
# max_seq_length = get_max_seq_length(percentile=100, plot=True)
# print(f"max_seq_length: {max_seq_length}")
# return
max_seq_length = 104

# Tokenize data using batch processing with progress bar
train_encodings_en = tokenizer_en.batch_encode_plus(
Expand Down

0 comments on commit 3b4ef43

Please sign in to comment.