From 9d38dfc2b26bc52fb4a5030487dd424d7c40e45d Mon Sep 17 00:00:00 2001 From: Vover Date: Fri, 11 Oct 2024 15:49:15 +0400 Subject: [PATCH 01/19] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 358fed9..50f06b0 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,2 @@ # Tokenizer -Leatning tokenization for Deep learning and LLms +Learning tokenization for Deep learning and LLms From 223c3bdd5a819205c91acd750976e62e70dc7a4f Mon Sep 17 00:00:00 2001 From: Vover Date: Sun, 13 Oct 2024 19:04:17 +0400 Subject: [PATCH 02/19] Update README.md --- README.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 50f06b0..b04c1a9 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,10 @@ -# Tokenizer -Learning tokenization for Deep learning and LLms +# SmolBPE + +Tokenization for Deep Learning and Large Language Models (LLMs). +## Description + +SmolBPE is a repository focused on providing efficient tokenization techniques for deep learning and large language models. This project is composed primarily of Jupyter Notebooks and Python scripts. + +## Features + +- Efficient tokenization algorithms. From d20ad44ed5c2be1cca3c0f92fc52e0a07a613ddc Mon Sep 17 00:00:00 2001 From: Vover Date: Sun, 13 Oct 2024 19:08:01 +0400 Subject: [PATCH 03/19] Update README.md --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index b04c1a9..30fde64 100644 --- a/README.md +++ b/README.md @@ -8,3 +8,13 @@ SmolBPE is a repository focused on providing efficient tokenization techniques f ## Features - Efficient tokenization algorithms. + +---- + +## Byte pair encoding + +Byte pair encoding (also known as digram coding) is an algorithm, first described in 1994 by Philip Gage for encoding strings of text into tabular form for use in downstream modeling. Its modification is notable as the large language model tokenizer with an ability to combine both tokens that encode single characters (including single digits or single punctuation marks) and those that encode whole words (even the longest compound words). This modification, in the first step, assumes all unique characters to be an initial set of 1-character long n-grams (i.e. initial "tokens"). Then, successively, the most frequent pair of adjacent characters is merged into a new, 2-character long n-gram and all instances of the pair are replaced by this new token. This is repeated until a vocabulary of prescribed size is obtained. Note that new words can always be constructed from final vocabulary tokens and initial-set characters. This algorithmic approach has been extended from spoken language to sign language in recent years. + +All the unique tokens found in a corpus are listed in a token vocabulary, the size of which, in the case of GPT-3.5 and GPT-4, is 100256. 
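For readers of the series, the merge procedure described in that paragraph can be made concrete with a short, self-contained sketch that is independent of the smolbpe code added in the later patches: start from raw bytes, repeatedly count adjacent pairs, and replace the most frequent pair with a fresh token id. The toy string and the helper names below are illustrative only.

```python
# Minimal byte-pair-encoding merge loop, for illustration only.
# Sequences are lists of ints, mirroring a byte-level tokenizer's 256 initial tokens.
from collections import Counter

def most_frequent_pair(ids):
    # Count adjacent pairs and return the most common one.
    return Counter(zip(ids, ids[1:])).most_common(1)[0][0]

def merge(ids, pair, new_id):
    # Replace every occurrence of `pair` with the new token id.
    out, i = [], 0
    while i < len(ids):
        if i < len(ids) - 1 and (ids[i], ids[i + 1]) == pair:
            out.append(new_id)
            i += 2
        else:
            out.append(ids[i])
            i += 1
    return out

ids = list("aaabdaaabac".encode("utf-8"))  # toy corpus (placeholder)
merges = {}
for new_id in range(256, 259):             # three merges, for illustration
    pair = most_frequent_pair(ids)
    ids = merge(ids, pair, new_id)
    merges[pair] = new_id
print(ids, merges)
```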
+ +![image](https://github.com/user-attachments/assets/27cf64e5-42a1-470b-baee-fc5a170bb4eb) From ef3ac4d832dcf571cc70da4f879365ec88298efe Mon Sep 17 00:00:00 2001 From: T4ras123 Date: Sun, 3 Nov 2024 16:34:29 +0400 Subject: [PATCH 04/19] feat: enhance GPT4Tokenizer with vocab loading/saving and CLI support --- smolbpe/gpt4.py | 89 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 70 insertions(+), 19 deletions(-) diff --git a/smolbpe/gpt4.py b/smolbpe/gpt4.py index a1f4508..4cb8626 100644 --- a/smolbpe/gpt4.py +++ b/smolbpe/gpt4.py @@ -1,12 +1,15 @@ import regex as re +import argparse +import json class GPT4Tokenizer(): - def __init__(self): + def __init__(self, path='vocab.json', pattern=None): self.vocab = {idx : bytes([idx]) for idx in range(256)} self.merges = dict() - self.pattern = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""" + self.pattern = pattern if pattern else r"\p{L}+|\p{Z}+|\p{N}+|[\p{P}&&[^.]]" self.splitby = re.compile(self.pattern) + self.path = path def train(self, text, vocab_size): @@ -28,20 +31,28 @@ def train(self, text, vocab_size): ids = [self.merge(chunk_ids, pair, idx) for chunk_ids in ids] self.merges[pair] = idx self.vocab[idx] = self.vocab[pair[0]] + self.vocab[pair[1]] + self.save_vocab_and_merges(self.path) + def encode(self, text): - tokens = list(text.encode("utf-8")) - while len(tokens)>=2: - bigrams = self.get_pairs(tokens) - pair = min(bigrams, key = lambda p: bigrams.get(p, float("inf"))) - if pair not in self.merges: + ids = list(text.encode('utf-8')) + + while True: + pairs = self.get_pairs(ids) + mergeable_pairs = {p: self.merges[p] for p in pairs if p in self.merges} + + + if not mergeable_pairs: break - idx = self.merges[pair] - tokens = self.merge(tokens, pair, idx) - return tokens - + pair = min(mergeable_pairs, key=self.merges.get) + + ids = self.merge(ids, pair, self.merges[pair]) + + return ids + + def decode(self, ids): tokens = b"".join(self.vocab[idx] for idx in ids) text = tokens.decode("utf-8", errors="replace") @@ -57,6 +68,41 @@ def get_pairs(self, ids, counts=None): return counts + def save_vocab_and_merges(self, path): + data = { + 'vocab': {}, + 'merges': {} + } + # Save vocab + for idx, byte_val in self.vocab.items(): + try: + data['vocab'][str(idx)] = byte_val.decode('utf-8') + except UnicodeDecodeError: + data['vocab'][str(idx)] = byte_val.hex() + # Save merges + for (first, second), idx in self.merges.items(): + key = f"{first},{second}" # Convert tuple to string + data['merges'][key] = idx + with open(path, 'w', encoding='utf-8') as f: + json.dump(data, f, indent=2) + + + def load_vocab(self, path='vocab.json'): + with open(path, 'r', encoding='utf-8') as f: + data = json.load(f) + # Load vocab + self.vocab = {} + for idx_str, value in data['vocab'].items(): + idx = idx_str + self.vocab[idx] = value.encode('utf-8') + # Load merges + self.merges = {} + for pair_str, idx in data['merges'].items(): + first_str, second_str = pair_str.split(',') + first, second = int(first_str), int(second_str) + self.merges[(first, second)] = idx + + def merge(self, ids, pair, idx): id = 0 newids = [] @@ -70,11 +116,16 @@ def merge(self, ids, pair, idx): return newids -t = GPT4Tokenizer() - -with open("./data/taylorswift.txt", "r") as f: - text = f.read() - -t.train(text, 400) -print(t.vocab) -print(t.decode(t.encode("Hello world"))) +if __name__=='__main__': + parser = argparse.ArgumentParser() + parser.add_argument('-t', '--text', type=str, help='Text to train 
tokenizer on') + parser.add_argument('-v','--vocab_size', type=int, help='Vocab size for tokenizer') + parser.add_argument('-o', '--output', default='vocab.json', type=str, help='Output path for vocab and merges') + parser.add_argument('-p', '--pattern', type=str, help='Regex pattern to split text') + args = parser.parse_args() + + with open(args.text, 'r') as f: + args.text = f.read() + + tokenizer = GPT4Tokenizer(args.output, args.pattern) + tokenizer.train(args.text, args.vocab_size) From 88a080505da18cc30e3e90bae2b6ff187dbbc32a Mon Sep 17 00:00:00 2001 From: T4ras123 Date: Sun, 3 Nov 2024 16:38:12 +0400 Subject: [PATCH 05/19] feat: initialize smolbpe module --- smolbpe/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 smolbpe/__init__.py diff --git a/smolbpe/__init__.py b/smolbpe/__init__.py new file mode 100644 index 0000000..e69de29 From d952b4a0be8d866e08c54987b5991d4e21763e9a Mon Sep 17 00:00:00 2001 From: T4ras123 Date: Sun, 3 Nov 2024 17:02:57 +0400 Subject: [PATCH 06/19] feat: add initial implementation of GPT-4 compatible tokenizer and setup configuration --- MANIFEST.in | 2 ++ setup.py | 29 ++++++++++++++++++++++++++ smolbpe/data/__init__.py | 0 {data => smolbpe/data}/taylorswift.txt | 0 {data => smolbpe/data}/text.py | 0 smolbpe/{gpt4.py => gpt4Tokenizer.py} | 0 tests/__init__.py | 0 7 files changed, 31 insertions(+) create mode 100644 MANIFEST.in create mode 100644 setup.py create mode 100644 smolbpe/data/__init__.py rename {data => smolbpe/data}/taylorswift.txt (100%) rename {data => smolbpe/data}/text.py (100%) rename smolbpe/{gpt4.py => gpt4Tokenizer.py} (100%) create mode 100644 tests/__init__.py diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..454c748 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,2 @@ +include smolbpe/data/*.txt +include smolbpe/data/*.json \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..3007929 --- /dev/null +++ b/setup.py @@ -0,0 +1,29 @@ +from setuptools import setup, find_packages + +setup( + name='gpt4tokenizer', + version='0.1.0', + description='A GPT-4 compatible Byte Pair Encoding (BPE) tokenizer.', + author='Vover', + author_email='your.email@example.com', + url='https://github.com/yourusername/gpt4tokenizer', # Replace with your repository URL + packages=find_packages(include=['smolbpe', 'smolbpe.*']), + install_requires=[ + 'regex>=2021.4.4', + ], + classifiers=[ + 'Programming Language :: Python :: 3', + 'License :: OSI Approved :: MIT License', + 'Operating System :: OS Independent', + ], + python_requires='>=3.6', + include_package_data=True, + package_data={ + 'smolbpe': ['data/*.txt', 'data/*.json'], + }, + entry_points={ + 'console_scripts': [ + 'gpt4tokenizer=smolbpe.gpt4Tokenizer:main', # If you have a main function + ], + }, +) \ No newline at end of file diff --git a/smolbpe/data/__init__.py b/smolbpe/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/data/taylorswift.txt b/smolbpe/data/taylorswift.txt similarity index 100% rename from data/taylorswift.txt rename to smolbpe/data/taylorswift.txt diff --git a/data/text.py b/smolbpe/data/text.py similarity index 100% rename from data/text.py rename to smolbpe/data/text.py diff --git a/smolbpe/gpt4.py b/smolbpe/gpt4Tokenizer.py similarity index 100% rename from smolbpe/gpt4.py rename to smolbpe/gpt4Tokenizer.py diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 From 5c74ff8ff94ea1eec42e6ff9de80c923c493c5ce Mon Sep 17 
00:00:00 2001 From: T4ras123 Date: Sun, 3 Nov 2024 17:15:17 +0400 Subject: [PATCH 07/19] fix: update author email and repository URL in setup.py; adjust import path in character_tokenizing.py --- setup.py | 6 +++--- tests/character_tokenizing.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 3007929..3d174a8 100644 --- a/setup.py +++ b/setup.py @@ -5,8 +5,8 @@ version='0.1.0', description='A GPT-4 compatible Byte Pair Encoding (BPE) tokenizer.', author='Vover', - author_email='your.email@example.com', - url='https://github.com/yourusername/gpt4tokenizer', # Replace with your repository URL + author_email='vovatara123@gmail.com', + url='https://github.com/T4ras123/SmolBPE', # Replace with your repository URL packages=find_packages(include=['smolbpe', 'smolbpe.*']), install_requires=[ 'regex>=2021.4.4', @@ -23,7 +23,7 @@ }, entry_points={ 'console_scripts': [ - 'gpt4tokenizer=smolbpe.gpt4Tokenizer:main', # If you have a main function + 'gpt4tokenizer=smolbpe.gpt4Tokenizer:main', ], }, ) \ No newline at end of file diff --git a/tests/character_tokenizing.py b/tests/character_tokenizing.py index 2ca1aeb..c674426 100644 --- a/tests/character_tokenizing.py +++ b/tests/character_tokenizing.py @@ -2,7 +2,7 @@ import os sys.path.append(os.path.abspath('../data')) -from data.text import text +from smolbpe.data import text vocab = list(set(text)) stoi = {ch:i for i, ch in enumerate(vocab)} From 0ac5f142a4563c930e2ce082e2c4dfba7badb7b8 Mon Sep 17 00:00:00 2001 From: T4ras123 Date: Sun, 3 Nov 2024 17:16:44 +0400 Subject: [PATCH 08/19] fix: rename package from gpt4tokenizer to smolbpe in setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3d174a8..42f0633 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages setup( - name='gpt4tokenizer', + name='smolbpe', version='0.1.0', description='A GPT-4 compatible Byte Pair Encoding (BPE) tokenizer.', author='Vover', From fb785debf5ad90f6ce00ceadfd695d8a888e006b Mon Sep 17 00:00:00 2001 From: T4ras123 Date: Sun, 3 Nov 2024 17:44:04 +0400 Subject: [PATCH 09/19] chore: update README.md with installation instructions and usage examples; modify setup.py for long description and version bump; include README.md in MANIFEST.in; remove unused test files and refactor GPT4Tokenizer class --- MANIFEST.in | 3 +- README.md | 154 +++++- setup.py | 5 +- smolbpe/basicTokenizer.py | 84 --- smolbpe/data/__init__.py | 0 smolbpe/data/taylorswift.txt | 988 ---------------------------------- smolbpe/data/text.py | 1 - smolbpe/gpt4Tokenizer.py | 8 +- tests/__init__.py | 0 tests/character_tokenizing.py | 14 - tests/tokenization.ipynb | 651 ---------------------- tests/word_tokenization.py | 20 - 12 files changed, 154 insertions(+), 1774 deletions(-) delete mode 100644 smolbpe/basicTokenizer.py delete mode 100644 smolbpe/data/__init__.py delete mode 100644 smolbpe/data/taylorswift.txt delete mode 100644 smolbpe/data/text.py delete mode 100644 tests/__init__.py delete mode 100644 tests/character_tokenizing.py delete mode 100644 tests/tokenization.ipynb delete mode 100644 tests/word_tokenization.py diff --git a/MANIFEST.in b/MANIFEST.in index 454c748..4d35845 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,3 @@ include smolbpe/data/*.txt -include smolbpe/data/*.json \ No newline at end of file +include smolbpe/data/*.json +include README.md \ No newline at end of file diff --git a/README.md b/README.md index 30fde64..2846084 100644 
--- a/README.md +++ b/README.md @@ -1,20 +1,156 @@ # SmolBPE -Tokenization for Deep Learning and Large Language Models (LLMs). -## Description +## Overview -SmolBPE is a repository focused on providing efficient tokenization techniques for deep learning and large language models. This project is composed primarily of Jupyter Notebooks and Python scripts. +**SmolBPE** is a lightweight and efficient Byte Pair Encoding (BPE) tokenizer designed for deep learning applications and large language models (LLMs) such as GPT-4. It provides a simple interface to tokenize textual data, facilitating better handling of out-of-vocabulary words and improving the performance of language models. ## Features -- Efficient tokenization algorithms. +- **Efficient Tokenization**: Implements the BPE algorithm for effective subword tokenization. +- **Customizable Vocabulary Size**: Allows you to specify the desired vocabulary size according to your needs. +- **Unicode Support**: Handles a wide range of characters, including Unicode characters, enabling multilingual tokenization. +- **Easy Integration**: Designed for seamless integration with existing Python projects and NLP pipelines. +- **Command-Line Interface**: Provides a CLI tool for training and using the tokenizer without writing additional code. +- **Open Source**: Licensed under the MIT License, promoting openness and collaboration. ----- +## Installation -## Byte pair encoding +You can install SmolBPE using `pip`: -Byte pair encoding (also known as digram coding) is an algorithm, first described in 1994 by Philip Gage for encoding strings of text into tabular form for use in downstream modeling. Its modification is notable as the large language model tokenizer with an ability to combine both tokens that encode single characters (including single digits or single punctuation marks) and those that encode whole words (even the longest compound words). This modification, in the first step, assumes all unique characters to be an initial set of 1-character long n-grams (i.e. initial "tokens"). Then, successively, the most frequent pair of adjacent characters is merged into a new, 2-character long n-gram and all instances of the pair are replaced by this new token. This is repeated until a vocabulary of prescribed size is obtained. Note that new words can always be constructed from final vocabulary tokens and initial-set characters. This algorithmic approach has been extended from spoken language to sign language in recent years. +```sh +pip install smolbpe +``` -All the unique tokens found in a corpus are listed in a token vocabulary, the size of which, in the case of GPT-3.5 and GPT-4, is 100256. +Alternatively, you can install it directly from the source code: -![image](https://github.com/user-attachments/assets/27cf64e5-42a1-470b-baee-fc5a170bb4eb) +```sh +git clone https://github.com/T4ras123/SmolBPE.git +cd SmolBPE +pip install . 
+``` + +## Quick Start Guide + +### Using the Tokenizer in Python + +1.Importing the Tokenizer + + ```python + from smolbpe.gpt4Tokenizer import GPT4Tokenizer + ``` + +2.Initializing the Tokenizer + + ```python + tokenizer = GPT4Tokenizer() + ``` + + You can specify a custom output file to save the vocab file to and regex pattern if needed: + + ```python + tokenizer = GPT4Tokenizer(output='vocab.json', pattern=r"\p{L}+|\p{Z}+|\p{N}+|[\p{P}&&[^.]]") + ``` + +3.Training the Tokenizer + + Train the tokenizer on your dataset to build the vocabulary and merge rules: + + ```python + with open("path_to_your_data", "r", encoding="utf-8") as f: + text = f.read() + + tokenizer.train(text, vocab_size=400) + ``` + +4.Encoding Text + + Convert text into a list of token IDs: + + ```python + encoded_tokens = tokenizer.encode("Tokenizing isn't real") + print(encoded_tokens) + ``` + +5.Decoding Tokens + +Convert token IDs back into human-readable text: + +```python +decoded_text = tokenizer.decode(encoded_tokens) +print(decoded_text) +``` + +### Command-Line Interface + +SmolBPE provides a command-line interface for easy tokenization tasks. + +#### Training the Tokenizer + +```sh +gpt4tokenizer --text data/taylorswift.txt --vocab_size 400 --output vocab.json +``` + +## Advanced Usage + +### Loading a Pre-trained Vocabulary + +If you have a pre-trained vocabulary and merges file, you can load them directly: + +```python +tokenizer = GPT4Tokenizer() +tokenizer.load_vocab('vocab.json') +``` + +### Custom Regex Pattern + +Customize the tokenization by providing a different regex pattern: + +```python +custom_pattern = r"\w+|\s+|[^\s\w]+" +tokenizer = GPT4Tokenizer(pattern=custom_pattern) +``` + +## Project Structure + +```sh +SmolBPE/ +├── smolbpe/ +│ ├── __init__.py +│ └── gpt4Tokenizer.py +├── LICENSE +├── MANIFEST.in +├── README.md +└── setup.py +``` + +## Contributing + +Contributions are welcome! To contribute: + +1. Fork the repository on GitHub. +2. Create a new branch for your feature or bug fix. +3. Commit your changes with descriptive commit messages. +4. Push your branch to your forked repository. +5. Open a pull request on the main repository. + +Please ensure your code adheres to the project's coding standards and includes appropriate tests. + +## License + +This project is licensed under the MIT License. You are free to use, modify, and distribute this software in accordance with the license. + +## Contact + +For any inquiries or feedback, please contact the author: + +- Author: Vover +- Email: +- GitHub: [T4ras123](https://github.com/T4ras123) + +## Acknowledgments + +- Inspired by tokenization techniques used in GPT models. +- Special thanks to the open-source community for continuous support. + +---- +Happy tokenizing with *SmolBPE*! 
diff --git a/setup.py b/setup.py index 42f0633..575ba49 100644 --- a/setup.py +++ b/setup.py @@ -2,11 +2,12 @@ setup( name='smolbpe', - version='0.1.0', + version='0.2.0', description='A GPT-4 compatible Byte Pair Encoding (BPE) tokenizer.', + long_description=open('README.md').read(), author='Vover', author_email='vovatara123@gmail.com', - url='https://github.com/T4ras123/SmolBPE', # Replace with your repository URL + url='https://github.com/T4ras123/SmolBPE', packages=find_packages(include=['smolbpe', 'smolbpe.*']), install_requires=[ 'regex>=2021.4.4', diff --git a/smolbpe/basicTokenizer.py b/smolbpe/basicTokenizer.py deleted file mode 100644 index 136d5db..0000000 --- a/smolbpe/basicTokenizer.py +++ /dev/null @@ -1,84 +0,0 @@ - -class BasicTokenizer(): - - - def __init__(self): - self.vocab = {idx : bytes([idx]) for idx in range(256)} - self.merges = dict() - self.special_characters = [] - self.decoded_text = "Pop" - self.encoded_text = [] - - - def train(self, text, vocab_size, verbose = False): - tokens = self.tokenize(text) - steps = vocab_size - 256 - for _ in range(steps): - bigrams = self.get_bigrams(tokens) - pair = max(bigrams, key=bigrams.get) - idx = 256 + _ - tokens = self.merge(tokens, pair, idx) - self.merges[pair] = idx - - for (p1,p2), idx in self.merges.items(): - self.vocab[idx] = self.vocab[p1]+self.vocab[p2] - return tokens - - - def encode(self, text): - tokens = self.tokenize(text) - while len(tokens)>=2: - bigrams = self.get_bigrams(tokens) - pair = min(bigrams, key = lambda p: bigrams.get(p, float("inf"))) - if pair not in self.merges: - break - idx = self.merges[pair] - tokens = self.merge(tokens, pair, idx) - self.encoded_text = tokens - return tokens - - - def decode(self, ids): - tokens = b"".join(self.vocab[idx] for idx in ids) - text = tokens.decode("utf-8", errors="replace") - self.decoded_text = text - return text - - - def get_bigrams(self, tokens): - bigrams = {} - for pair in zip(tokens, tokens[1:]): - bigrams[pair] = bigrams.get(pair, 0) + 1 - return bigrams - - - def merge(self, text, pair, idx): - newids = [] - id = 0 - while id=2:\n", - " stats = byte_pairs(tokens)\n", - " pair = min(stats, key = lambda p: merges.get(p, float(\"inf\")))\n", - " if pair not in merges:\n", - " break\n", - " idx = merges[pair]\n", - " tokens = merges(tokens, pair, idx)\n", - " \n", - " return tokens\n", - " \n", - " \n", - "print(encode(\"PFD\"))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Forcing splits with regex" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "import regex as re # type: ignore" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['Hello', \"'ve\", ' world', ' how', '123', ' are', ' you']\n" - ] - } - ], - "source": [ - "gpt2pat = re.compile(r\"\"\"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+\"\"\")\n", - "print(re.findall(gpt2pat, \"Hello've world how123 are you\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "ML", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": 
"ipython3", - "version": "3.12.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/tests/word_tokenization.py b/tests/word_tokenization.py deleted file mode 100644 index 081618b..0000000 --- a/tests/word_tokenization.py +++ /dev/null @@ -1,20 +0,0 @@ -import sys -import os - -# Add the parent directory to the Python path -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'data'))) - -# Import the text variable from the text module -from text import text - -tokenized_text = text.split() - -dictionary = list(set(tokenized_text)) - -wtoi = {w:i for i,w in enumerate(dictionary)} -itow = {i:w for w, i in wtoi.items()} - -encode = lambda text: [wtoi[i] for i in text.split()] -decode = lambda ids: " ".join(itow[i] for i in ids) - -print(decode(encode("Introduction to Unicode"))) From dbbf7bf6f8b20d8eb561079e650b4d1b86a6d288 Mon Sep 17 00:00:00 2001 From: T4ras123 Date: Sun, 3 Nov 2024 17:55:44 +0400 Subject: [PATCH 10/19] chore: include LICENSE in MANIFEST.in and update setup.py for UTF-8 encoding and markdown content type --- MANIFEST.in | 3 ++- setup.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 4d35845..5ef47fd 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,4 @@ include smolbpe/data/*.txt include smolbpe/data/*.json -include README.md \ No newline at end of file +include README.md +include LICENSE \ No newline at end of file diff --git a/setup.py b/setup.py index 575ba49..f77fba8 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,8 @@ name='smolbpe', version='0.2.0', description='A GPT-4 compatible Byte Pair Encoding (BPE) tokenizer.', - long_description=open('README.md').read(), + long_description=open('README.md', encoding='utf-8').read(), + long_description_content_type='text/markdown', author='Vover', author_email='vovatara123@gmail.com', url='https://github.com/T4ras123/SmolBPE', From c358f028143d1e23e28385c7521c82a0dd6ba9a5 Mon Sep 17 00:00:00 2001 From: T4ras123 Date: Sun, 3 Nov 2024 23:06:58 +0400 Subject: [PATCH 11/19] feat: add badges for version, downloads, stars, license, Python versions, and sponsorship to README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 2846084..5c4f2ce 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # SmolBPE +![PyPI Version](https://img.shields.io/pypi/v/smolbpe) ![PyPI Downloads](https://img.shields.io/pypi/dm/smolbpe) ![GitHub Stars](https://img.shields.io/github/stars/T4ras123/SmolBPE?style=social) ![License](https://img.shields.io/github/license/T4ras123/SmolBPE) ![Python Versions](https://img.shields.io/pypi/pyversions/smolbpe) ![GitHub code size in bytes](https://img.shields.io/github/languages/code-size/T4ras123/SmolBPE) [![Sponsor](https://img.shields.io/badge/sponsor-GitHub%20Sponsors-critical)](https://github.com/sponsors/T4ras123) [![Twitter Follow](https://img.shields.io/twitter/follow/yourusername?style=social)](https://twitter.com/Vover163) ![PyPI - Downloads](https://img.shields.io/pypi/dt/smolbpe) ![Made with Love](https://img.shields.io/badge/Made%20with-%E2%9D%A4-red) + ## Overview **SmolBPE** is a lightweight and efficient Byte Pair Encoding (BPE) tokenizer designed for deep learning applications and large language models (LLMs) such as GPT-4. It provides a simple interface to tokenize textual data, facilitating better handling of out-of-vocabulary words and improving the performance of language models. 
From 11035c43c3f5c594c2223c0e827ba0471f7898bc Mon Sep 17 00:00:00 2001 From: T4ras123 Date: Sun, 3 Nov 2024 23:08:43 +0400 Subject: [PATCH 12/19] fix: reorder badges in README.md for improved clarity --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5c4f2ce..f8e1905 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # SmolBPE -![PyPI Version](https://img.shields.io/pypi/v/smolbpe) ![PyPI Downloads](https://img.shields.io/pypi/dm/smolbpe) ![GitHub Stars](https://img.shields.io/github/stars/T4ras123/SmolBPE?style=social) ![License](https://img.shields.io/github/license/T4ras123/SmolBPE) ![Python Versions](https://img.shields.io/pypi/pyversions/smolbpe) ![GitHub code size in bytes](https://img.shields.io/github/languages/code-size/T4ras123/SmolBPE) [![Sponsor](https://img.shields.io/badge/sponsor-GitHub%20Sponsors-critical)](https://github.com/sponsors/T4ras123) [![Twitter Follow](https://img.shields.io/twitter/follow/yourusername?style=social)](https://twitter.com/Vover163) ![PyPI - Downloads](https://img.shields.io/pypi/dt/smolbpe) ![Made with Love](https://img.shields.io/badge/Made%20with-%E2%9D%A4-red) +![PyPI Version](https://img.shields.io/pypi/v/smolbpe) ![PyPI - Downloads](https://img.shields.io/pypi/dm/smolbpe) ![GitHub Stars](https://img.shields.io/github/stars/T4ras123/SmolBPE?style=social) ![License](https://img.shields.io/github/license/T4ras123/SmolBPE) ![Python Versions](https://img.shields.io/pypi/pyversions/smolbpe) ![GitHub code size in bytes](https://img.shields.io/github/languages/code-size/T4ras123/SmolBPE) [![Sponsor](https://img.shields.io/badge/sponsor-GitHub%20Sponsors-critical)](https://github.com/sponsors/T4ras123) [![Twitter Follow](https://img.shields.io/twitter/follow/yourusername?style=social)](https://twitter.com/Vover163) ![PyPI - Downloads](https://img.shields.io/pypi/dt/smolbpe) ![Made with Love](https://img.shields.io/badge/Made%20with-%E2%9D%A4-red) ## Overview From dd20e82b2cedc4adc543aa40ec5552a7692b35cb Mon Sep 17 00:00:00 2001 From: T4ras123 Date: Sun, 3 Nov 2024 23:10:03 +0400 Subject: [PATCH 13/19] fix: update Twitter badge link in README.md for correct username --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f8e1905..33f98b9 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # SmolBPE -![PyPI Version](https://img.shields.io/pypi/v/smolbpe) ![PyPI - Downloads](https://img.shields.io/pypi/dm/smolbpe) ![GitHub Stars](https://img.shields.io/github/stars/T4ras123/SmolBPE?style=social) ![License](https://img.shields.io/github/license/T4ras123/SmolBPE) ![Python Versions](https://img.shields.io/pypi/pyversions/smolbpe) ![GitHub code size in bytes](https://img.shields.io/github/languages/code-size/T4ras123/SmolBPE) [![Sponsor](https://img.shields.io/badge/sponsor-GitHub%20Sponsors-critical)](https://github.com/sponsors/T4ras123) [![Twitter Follow](https://img.shields.io/twitter/follow/yourusername?style=social)](https://twitter.com/Vover163) ![PyPI - Downloads](https://img.shields.io/pypi/dt/smolbpe) ![Made with Love](https://img.shields.io/badge/Made%20with-%E2%9D%A4-red) +![PyPI Version](https://img.shields.io/pypi/v/smolbpe) ![PyPI - Downloads](https://img.shields.io/pypi/dm/smolbpe) ![GitHub Stars](https://img.shields.io/github/stars/T4ras123/SmolBPE?style=social) ![License](https://img.shields.io/github/license/T4ras123/SmolBPE) ![Python Versions](https://img.shields.io/pypi/pyversions/smolbpe) ![GitHub code size in 
bytes](https://img.shields.io/github/languages/code-size/T4ras123/SmolBPE) [![Sponsor](https://img.shields.io/badge/sponsor-GitHub%20Sponsors-critical)](https://github.com/sponsors/T4ras123) [![Twitter Follow](https://img.shields.io/twitter/follow/vover163?style=social)](https://twitter.com/Vover163) ![Made with Love](https://img.shields.io/badge/Made%20with-%E2%9D%A4-red) ## Overview From d6ab197213e32c1279dddb495b94e223934c7386 Mon Sep 17 00:00:00 2001 From: T4ras123 Date: Mon, 4 Nov 2024 11:20:19 +0400 Subject: [PATCH 14/19] feat: enhance GPT4Tokenizer to support special tokens and improve training logic --- .gitignore | 4 ++- smolbpe/gpt4Tokenizer.py | 78 ++++++++++++++++++++++++++++------------ 2 files changed, 58 insertions(+), 24 deletions(-) diff --git a/.gitignore b/.gitignore index 9151771..2f83b38 100644 --- a/.gitignore +++ b/.gitignore @@ -163,4 +163,6 @@ cython_debug/ -.vscode/* \ No newline at end of file +.vscode/* +smolbpe/text.txt +smolbpe/vocab.json diff --git a/smolbpe/gpt4Tokenizer.py b/smolbpe/gpt4Tokenizer.py index d97a2e1..96ec68a 100644 --- a/smolbpe/gpt4Tokenizer.py +++ b/smolbpe/gpt4Tokenizer.py @@ -4,54 +4,84 @@ class GPT4Tokenizer(): - def __init__(self, output='vocab.json', pattern=None): + def __init__(self, output='vocab.json', pattern=None, special_tokens=None): self.vocab = {idx : bytes([idx]) for idx in range(256)} self.merges = dict() - self.pattern = pattern if pattern else r"\p{L}+|\p{Z}+|\p{N}+|[\p{P}&&[^.]]" + self.pattern = pattern if pattern else r"." self.splitby = re.compile(self.pattern) self.output_file = output + self.special_tokens = special_tokens if special_tokens else [] + self.special_token_ids = {} + for i, token in enumerate(self.special_tokens): + token_id = 256 + i + self.vocab[token_id] = token.encode('utf-8') + self.merges[(token_id, token_id)] = token_id + self.special_token_ids[token] = token_id def train(self, text, vocab_size): - assert vocab_size >= 256 - - num_merges = vocab_size - 256 - + + assert vocab_size > len(self.vocab), "Vocab size must be greater than the number of tokens in the vocab" + num_merges = vocab_size - len(self.vocab) text_splitted = re.findall(self.splitby, text) - - ids = [list(ch.encode("utf-8")) for ch in text_splitted] - + ids = [list(self.encode(chunk)) for chunk in text_splitted] + vocab_len = max(self.vocab.keys()) + 1 for i in range(num_merges): stats = {} for _ in ids: self.get_pairs(_, stats) + if not stats: + print(f"No more pairs to merge at iteration {i}. 
Stopping early.") + break pair = max(stats, key=stats.get) - idx = 256 + i - ids = [self.merge(chunk_ids, pair, idx) for chunk_ids in ids] + idx = vocab_len + i self.merges[pair] = idx self.vocab[idx] = self.vocab[pair[0]] + self.vocab[pair[1]] - self.save_vocab_and_merges(self.path) - + ids = [self.merge(chunk_ids, pair, idx) for chunk_ids in ids] + self.save_vocab_and_merges(self.output_file) def encode(self, text): - ids = list(text.encode('utf-8')) - + tokens = [] + i = 0 + while i < len(text): + matched = False + # Check for special tokens at the current position + for token in self.special_tokens: + if text.startswith(token, i): + token_id = self.special_token_ids[token] + tokens.append(token_id) + i += len(token) + matched = True + break + if not matched: + # Find the next special token position + next_positions = [text.find(st, i) for st in self.special_tokens if text.find(st, i) != -1] + next_special = min(next_positions) if next_positions else len(text) + # Extract substring up to the next special token + substring = text[i:next_special] + # Encode the substring using BPE + ids = list(substring.encode('utf-8')) + ids = self.apply_bpe(ids) + tokens.extend(ids) + i = next_special + return tokens + + + def apply_bpe(self, ids): while True: pairs = self.get_pairs(ids) mergeable_pairs = {p: self.merges[p] for p in pairs if p in self.merges} - if not mergeable_pairs: break pair = min(mergeable_pairs, key=self.merges.get) - ids = self.merge(ids, pair, self.merges[pair]) return ids - + def decode(self, ids): tokens = b"".join(self.vocab[idx] for idx in ids) @@ -68,7 +98,7 @@ def get_pairs(self, ids, counts=None): return counts - def save_vocab_and_merges(self): + def save_vocab_and_merges(self, path): data = { 'vocab': {}, 'merges': {} @@ -81,9 +111,9 @@ def save_vocab_and_merges(self): data['vocab'][str(idx)] = byte_val.hex() # Save merges for (first, second), idx in self.merges.items(): - key = f"{first},{second}" # Convert tuple to string + key = f"{first},{second}" data['merges'][key] = idx - with open(self.output_file, 'w', encoding='utf-8') as f: + with open(path, 'w', encoding='utf-8') as f: json.dump(data, f, indent=2) @@ -122,10 +152,12 @@ def merge(self, ids, pair, idx): parser.add_argument('-v','--vocab_size', type=int, help='Vocab size for tokenizer') parser.add_argument('-o', '--output', default='vocab.json', type=str, help='Output path for vocab and merges') parser.add_argument('-p', '--pattern', type=str, help='Regex pattern to split text') + parser.add_argument('-s', '--special_tokens', nargs='*', default=None, help='Special tokens to add to vocab') args = parser.parse_args() with open(args.text, 'r') as f: args.text = f.read() - - tokenizer = GPT4Tokenizer(args.output, args.pattern) + print(args.special_tokens) + tokenizer = GPT4Tokenizer(args.output, args.pattern, special_tokens=args.special_tokens) tokenizer.train(args.text, args.vocab_size) + print(f"Tokenizer trained and saved to {args.output}") \ No newline at end of file From 3a189476b66510d9294e92e154b9edefd1d6c646 Mon Sep 17 00:00:00 2001 From: T4ras123 Date: Mon, 4 Nov 2024 12:36:50 +0400 Subject: [PATCH 15/19] chore: bump version to 0.3.0 in setup.py --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index f77fba8..756ff44 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name='smolbpe', - version='0.2.0', + version='0.3.0', description='A GPT-4 compatible Byte Pair Encoding (BPE) tokenizer.', long_description=open('README.md', 
encoding='utf-8').read(), long_description_content_type='text/markdown', @@ -28,4 +28,4 @@ 'gpt4tokenizer=smolbpe.gpt4Tokenizer:main', ], }, -) \ No newline at end of file +) From 503c4ef8fc8b054a680e8561b5e418bc546ebd72 Mon Sep 17 00:00:00 2001 From: T4ras123 Date: Mon, 4 Nov 2024 12:39:01 +0400 Subject: [PATCH 16/19] refactor: clean up comments and whitespace in GPT4Tokenizer --- smolbpe/gpt4Tokenizer.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/smolbpe/gpt4Tokenizer.py b/smolbpe/gpt4Tokenizer.py index 96ec68a..5e7549e 100644 --- a/smolbpe/gpt4Tokenizer.py +++ b/smolbpe/gpt4Tokenizer.py @@ -20,8 +20,7 @@ def __init__(self, output='vocab.json', pattern=None, special_tokens=None): def train(self, text, vocab_size): - - + assert vocab_size > len(self.vocab), "Vocab size must be greater than the number of tokens in the vocab" num_merges = vocab_size - len(self.vocab) text_splitted = re.findall(self.splitby, text) @@ -47,7 +46,6 @@ def encode(self, text): i = 0 while i < len(text): matched = False - # Check for special tokens at the current position for token in self.special_tokens: if text.startswith(token, i): token_id = self.special_token_ids[token] @@ -56,12 +54,9 @@ def encode(self, text): matched = True break if not matched: - # Find the next special token position next_positions = [text.find(st, i) for st in self.special_tokens if text.find(st, i) != -1] next_special = min(next_positions) if next_positions else len(text) - # Extract substring up to the next special token substring = text[i:next_special] - # Encode the substring using BPE ids = list(substring.encode('utf-8')) ids = self.apply_bpe(ids) tokens.extend(ids) @@ -120,12 +115,10 @@ def save_vocab_and_merges(self, path): def load_vocab(self, path='vocab.json'): with open(path, 'r', encoding='utf-8') as f: data = json.load(f) - # Load vocab self.vocab = {} for idx_str, value in data['vocab'].items(): idx = idx_str self.vocab[idx] = value.encode('utf-8') - # Load merges self.merges = {} for pair_str, idx in data['merges'].items(): first_str, second_str = pair_str.split(',') From eb6a3f896809442a068afc3519d939a1af5d5c52 Mon Sep 17 00:00:00 2001 From: T4ras123 Date: Mon, 4 Nov 2024 12:40:15 +0400 Subject: [PATCH 17/19] feat: migrate GPT4Tokenizer implementation to a new file and enhance structure --- smolbpe/{gpt4Tokenizer.py => tokenizer.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename smolbpe/{gpt4Tokenizer.py => tokenizer.py} (100%) diff --git a/smolbpe/gpt4Tokenizer.py b/smolbpe/tokenizer.py similarity index 100% rename from smolbpe/gpt4Tokenizer.py rename to smolbpe/tokenizer.py From 11202855e94dd2d520fcc7a38d44cb239add0b45 Mon Sep 17 00:00:00 2001 From: T4ras123 Date: Mon, 4 Nov 2024 12:47:41 +0400 Subject: [PATCH 18/19] feat: update GPT4Tokenizer constructor to set default regex pattern and improve argument order --- smolbpe/tokenizer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/smolbpe/tokenizer.py b/smolbpe/tokenizer.py index 5e7549e..5cf27c7 100644 --- a/smolbpe/tokenizer.py +++ b/smolbpe/tokenizer.py @@ -4,10 +4,10 @@ class GPT4Tokenizer(): - def __init__(self, output='vocab.json', pattern=None, special_tokens=None): + def __init__(self, output='vocab.json', special_tokens=None, pattern=r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""): self.vocab = {idx : bytes([idx]) for idx in range(256)} self.merges = dict() - self.pattern = pattern if pattern else r"." 
+ self.pattern = pattern self.splitby = re.compile(self.pattern) self.output_file = output self.special_tokens = special_tokens if special_tokens else [] @@ -142,15 +142,15 @@ def merge(self, ids, pair, idx): if __name__=='__main__': parser = argparse.ArgumentParser() parser.add_argument('-t', '--text', type=str, help='Text to train tokenizer on') - parser.add_argument('-v','--vocab_size', type=int, help='Vocab size for tokenizer') + parser.add_argument('-v', '--vocab_size', type=int, help='Vocab size for tokenizer') parser.add_argument('-o', '--output', default='vocab.json', type=str, help='Output path for vocab and merges') - parser.add_argument('-p', '--pattern', type=str, help='Regex pattern to split text') + parser.add_argument('-p', '--pattern', type=str, default=r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""", help='Regex pattern to split text') parser.add_argument('-s', '--special_tokens', nargs='*', default=None, help='Special tokens to add to vocab') args = parser.parse_args() with open(args.text, 'r') as f: args.text = f.read() print(args.special_tokens) - tokenizer = GPT4Tokenizer(args.output, args.pattern, special_tokens=args.special_tokens) + tokenizer = GPT4Tokenizer(args.output, special_tokens=args.special_tokens, pattern=args.pattern) tokenizer.train(args.text, args.vocab_size) print(f"Tokenizer trained and saved to {args.output}") \ No newline at end of file From fba88fa6c150f40109b106012195ea452566217a Mon Sep 17 00:00:00 2001 From: T4ras123 Date: Mon, 4 Nov 2024 12:59:35 +0400 Subject: [PATCH 19/19] feat: rename GPT4Tokenizer to Tokenizer and update README and setup.py --- .gitignore | 1 + README.md | 14 +++++++------- setup.py | 2 +- smolbpe/tokenizer.py | 4 ++-- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index 2f83b38..aee1f37 100644 --- a/.gitignore +++ b/.gitignore @@ -166,3 +166,4 @@ cython_debug/ .vscode/* smolbpe/text.txt smolbpe/vocab.json +test.py diff --git a/README.md b/README.md index 33f98b9..9ce36f2 100644 --- a/README.md +++ b/README.md @@ -38,19 +38,19 @@ pip install . 1.Importing the Tokenizer ```python - from smolbpe.gpt4Tokenizer import GPT4Tokenizer + from smolbpe.tokenizer import Tokenizer ``` 2.Initializing the Tokenizer ```python - tokenizer = GPT4Tokenizer() + tokenizer = Tokenizer() ``` You can specify a custom output file to save the vocab file to and regex pattern if needed: ```python - tokenizer = GPT4Tokenizer(output='vocab.json', pattern=r"\p{L}+|\p{Z}+|\p{N}+|[\p{P}&&[^.]]") + tokenizer = Tokenizer(output='vocab.json', pattern=r"\p{L}+|\p{Z}+|\p{N}+|[\p{P}&&[^.]]") ``` 3.Training the Tokenizer @@ -89,7 +89,7 @@ SmolBPE provides a command-line interface for easy tokenization tasks. 
#### Training the Tokenizer ```sh -gpt4tokenizer --text data/taylorswift.txt --vocab_size 400 --output vocab.json +tokenizer --text smth.txt --vocab_size 400 --output vocab.json ``` ## Advanced Usage @@ -99,7 +99,7 @@ gpt4tokenizer --text data/taylorswift.txt --vocab_size 400 --output vocab.json If you have a pre-trained vocabulary and merges file, you can load them directly: ```python -tokenizer = GPT4Tokenizer() +tokenizer = Tokenizer() tokenizer.load_vocab('vocab.json') ``` @@ -109,7 +109,7 @@ Customize the tokenization by providing a different regex pattern: ```python custom_pattern = r"\w+|\s+|[^\s\w]+" -tokenizer = GPT4Tokenizer(pattern=custom_pattern) +tokenizer = Tokenizer(pattern=custom_pattern) ``` ## Project Structure @@ -118,7 +118,7 @@ tokenizer = GPT4Tokenizer(pattern=custom_pattern) SmolBPE/ ├── smolbpe/ │ ├── __init__.py -│ └── gpt4Tokenizer.py +│ └── tokenizer.py ├── LICENSE ├── MANIFEST.in ├── README.md diff --git a/setup.py b/setup.py index 756ff44..9734021 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name='smolbpe', - version='0.3.0', + version='0.3.1', description='A GPT-4 compatible Byte Pair Encoding (BPE) tokenizer.', long_description=open('README.md', encoding='utf-8').read(), long_description_content_type='text/markdown', diff --git a/smolbpe/tokenizer.py b/smolbpe/tokenizer.py index 5cf27c7..ed3d3b2 100644 --- a/smolbpe/tokenizer.py +++ b/smolbpe/tokenizer.py @@ -3,7 +3,7 @@ import json -class GPT4Tokenizer(): +class Tokenizer(): def __init__(self, output='vocab.json', special_tokens=None, pattern=r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""): self.vocab = {idx : bytes([idx]) for idx in range(256)} self.merges = dict() @@ -151,6 +151,6 @@ def merge(self, ids, pair, idx): with open(args.text, 'r') as f: args.text = f.read() print(args.special_tokens) - tokenizer = GPT4Tokenizer(args.output, special_tokens=args.special_tokens, pattern=args.pattern) + tokenizer = Tokenizer(args.output, special_tokens=args.special_tokens, pattern=args.pattern) tokenizer.train(args.text, args.vocab_size) print(f"Tokenizer trained and saved to {args.output}") \ No newline at end of file
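After the final patch, the public surface of the package is the `Tokenizer` class in `smolbpe/tokenizer.py` (smolbpe 0.3.1). The sketch below exercises that API end to end using the calls shown in the diffs (`train`, `encode`, `decode`); the corpus path and the `<|endoftext|>` special token are illustrative placeholders rather than values fixed by the patches, while `vocab_size=400` and the sample sentence come from the README above.

```python
# End-to-end usage sketch of the API as it stands after PATCH 19/19.
# "corpus.txt" and the <|endoftext|> special token are placeholder choices.
from smolbpe.tokenizer import Tokenizer

tokenizer = Tokenizer(output="vocab.json", special_tokens=["<|endoftext|>"])

with open("corpus.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Training also writes vocab.json via save_vocab_and_merges().
tokenizer.train(text, vocab_size=400)

ids = tokenizer.encode("Tokenizing isn't real<|endoftext|>")
print(ids)
print(tokenizer.decode(ids))  # byte-level BPE decodes back to the input text
```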