Refactor SDPMChunker and SemanticChunker to remove tokenizer dependency and improve documentation
chonkie-ai · Nov 12, 2024 · 75132ce · 75132ce
1 parent 247b872
commit 75132ce
Show file tree

Hide file tree

Showing 5 changed files with 43 additions and 81 deletions.
diff --git a/DOCS.md b/DOCS.md
@@ -266,14 +266,12 @@ chunker = SentenceChunker(
 
 The `SemanticChunker` groups content by semantic similarity. The implementation is inspired by the semantic chunking approach described in the [FullStackRetrieval Tutorials](https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb), with modifications and optimizations for better performance and integration with Chonkie's architecture.
 
+This version of `SemanticChunker` includes optimizations that speed it up considerably, but it assumes that the `tokenizer` you use is the same as the one used by the `embedding_model`. This is a reasonable assumption because, more often than not, `chunk_size` (and hence `token_count`) depends on the `embedding_model`'s context size rather than on the generative model's context length.
+
 ```python
 from chonkie import SemanticChunker
-from autotiktokenizer import AutoTikTokenizer
-
-tokenizer = AutoTikTokenizer.from_pretrained("gpt2")
 
 chunker = SemanticChunker(
-    tokenizer=tokenizer,
     embedding_model="all-minilm-l6-v2",
     max_chunk_size=512,
     similarity_threshold=0.7
@@ -283,6 +281,7 @@ chunker = SemanticChunker(
 **key parameters:**
 
 - `embedding_model`: model for semantic embeddings. This can either be a `str` or a `SentenceTransformer` model. If a `str` is passed, it uses `SentenceTransformer` to load it. 
+- `max_chunk_size`: maximum number of tokens allowed in each chunk returned by `SemanticChunker`
 - `similarity_threshold`: threshold for semantic grouping
 
 ## SDPMChunker
@@ -291,12 +290,8 @@ the `SDPMChunker` groups content via the semantic double-pass merging method, wh
 
 ```python
 from chonkie import SDPMChunker
-from autotiktokenizer import AutoTikTokenizer
-
-tokenizer = AutoTikTokenizer.from_pretrained("gpt2")
 
 chunker = SDPMChunker(
-    tokenizer=tokenizer,
     embedding_model="all-minilm-l6-v2",
     max_chunk_size=512,
     similarity_threshold=0.7, 

diff --git a/src/chonkie/chunker/sdpm.py b/src/chonkie/chunker/sdpm.py
@@ -4,9 +4,23 @@
 
 
 class SDPMChunker(SemanticChunker):
+    """Chunker implementation using the Semantic Double-Pass Merging (SDPM) method.
+
+    The SDPM approach involves three main steps:
+    1. Grouping sentences by semantic similarity (Same as SemanticChunker)
+    2. Merging similar groups with a skip window
+    3. Splitting the merged groups into size-appropriate chunks
+
+    Args:
+        embedding_model: Sentence embedding model to use
+        similarity_threshold: Minimum similarity score to consider sentences similar
+        similarity_percentile: Minimum similarity percentile to consider sentences similar
+        max_chunk_size: Maximum token count for a chunk
+        initial_sentences: Number of sentences to consider for initial grouping
+        skip_window: Number of chunks to skip when looking for similarities
+    """
     def __init__(
         self,
-        tokenizer: Union[str, Any] = "gpt2",
         embedding_model: Union[str, Any] = "sentence-transformers/all-MiniLM-L6-v2",
         similarity_threshold: float = None,
         similarity_percentile: float = None,
@@ -17,11 +31,14 @@ def __init__(
         """Initialize the SDPMChunker.
 
         Args:
-            Same as SemanticChunker, plus:
+            embedding_model: Sentence embedding model to use
+            similarity_threshold: Minimum similarity score to consider sentences similar
+            similarity_percentile: Minimum similarity percentile to consider sentences similar
+            max_chunk_size: Maximum token count for a chunk
+            initial_sentences: Number of sentences to consider for initial grouping
             skip_window: Number of chunks to skip when looking for similarities
         """
         super().__init__(
-            tokenizer=tokenizer,
             embedding_model=embedding_model,
             max_chunk_size=max_chunk_size,
             similarity_threshold=similarity_threshold,

diff --git a/src/chonkie/chunker/semantic.py b/src/chonkie/chunker/semantic.py
@@ -31,7 +31,6 @@ class SemanticChunk(SentenceChunk):
 class SemanticChunker(BaseChunker):
     def __init__(
         self,
-        tokenizer: Union[str, Any] = "gpt2",
         embedding_model: Union[str, Any] = "sentence-transformers/all-MiniLM-L6-v2",
         similarity_threshold: Optional[float] = None,
         similarity_percentile: Optional[float] = None,
@@ -43,7 +42,6 @@ def __init__(
         SemanticChunkers split text into semantically coherent chunks using embeddings.
 
         Args:
-            tokenizer: Tokenizer for counting tokens
             embedding_model: Name of the sentence-transformers model to load
             max_chunk_size: Maximum tokens allowed per chunk
             similarity_threshold: Absolute threshold for semantic similarity (0-1)
@@ -54,8 +52,6 @@ def __init__(
             ValueError: If parameters are invalid
             ImportError: If required dependencies aren't installed
         """
-        super().__init__(tokenizer)
-
         if max_chunk_size <= 0:
             raise ValueError("max_chunk_size must be positive")
         if similarity_threshold is not None and (
@@ -88,6 +84,11 @@ def __init__(
             )
         else:
             self.embedding_model = embedding_model
+
+        # Keeping the tokenizer the same as the sentence model is important
+        # for the group semantic meaning to be calculated properly
+        tokenizer = self.embedding_model.tokenizer
+        super().__init__(tokenizer)
 
     def _import_sentence_transformers(self) -> Any:
         """Import sentence-transformers library. Imports mentioned inside the class,
@@ -197,27 +198,6 @@ def _split_sentences(self, text: str) -> List[str]:
         # Split into sentences and clean up
         sentences = [s.strip() for s in text.split('\n') if s.strip()]
 
-        # Get token counts for sentences
-        # token_counts = self._get_token_counts(sentences)
-
-        # # Create Sentence objects
-        # result_sentences = []
-        # current_pos = 0
-        # for sent, token_count in zip(sentences, token_counts):
-        #     # Find the actual position in original text
-        #     start_idx = text.find(sent, current_pos)
-        #     end_idx = start_idx + len(sent)
-        #     current_pos = end_idx
-
-        #     result_sentences.append(
-        #         Sentence(
-        #             text=sent,
-        #             start_index=start_idx,
-        #             end_index=end_idx,
-        #             token_count=token_count
-        #         )
-        #     )
-
         return sentences
 
     def _compute_similarity_threshold(self, all_similarities: List[float]) -> float:
@@ -445,4 +425,4 @@ def __repr__(self) -> str:
             f"SemanticChunker(max_chunk_size={self.max_chunk_size}, "
             f"{threshold_info}, "
             f"initial_sentences={self.initial_sentences})"
-        )
+        )
diff --git a/tests/chunker/test_sdpm_chunker.py b/tests/chunker/test_sdpm_chunker.py
@@ -1,16 +1,9 @@
 import pytest
 from sentence_transformers import SentenceTransformer
-from tokenizers import Tokenizer
 
 from chonkie.chunker.sdpm import SDPMChunker
 from chonkie.chunker.semantic import SemanticChunk
 
-
-@pytest.fixture
-def tokenizer():
-    return Tokenizer.from_pretrained("gpt2")
-
-
 @pytest.fixture
 def sample_text():
     text = """The process of text chunking in RAG applications represents a delicate balance between competing requirements. On one side, we have the need for semantic coherence – ensuring that each chunk maintains meaningful context that can be understood and processed independently. On the other, we must optimize for information density, ensuring that each chunk carries sufficient signal without excessive noise that might impede retrieval accuracy. In this post, we explore the challenges of text chunking in RAG applications and propose a novel approach that leverages recent advances in transformer-based language models to achieve a more effective balance between these competing requirements."""
@@ -22,28 +15,25 @@ def embedding_model():
     return SentenceTransformer("all-MiniLM-L6-v2")
 
 
-def test_spdm_chunker_initialization(tokenizer, embedding_model):
+def test_spdm_chunker_initialization(embedding_model):
     """Test that the SDPMChunker can be initialized with required parameters."""
     chunker = SDPMChunker(
-        tokenizer=tokenizer,
         embedding_model=embedding_model,
         max_chunk_size=512,
         similarity_threshold=0.5,
         skip_window=2,
     )
 
     assert chunker is not None
-    assert chunker.tokenizer == tokenizer
     assert chunker.max_chunk_size == 512
     assert chunker.similarity_threshold == 0.5
     assert chunker.initial_sentences == 1
     assert chunker.skip_window == 2
 
 
-def test_spdm_chunker_chunking(tokenizer, embedding_model, sample_text):
+def test_spdm_chunker_chunking(embedding_model, sample_text):
     """Test that the SDPMChunker can chunk a sample text."""
     chunker = SDPMChunker(
-        tokenizer=tokenizer,
         embedding_model=embedding_model,
         max_chunk_size=512,
         similarity_threshold=0.5,
@@ -60,10 +50,9 @@ def test_spdm_chunker_chunking(tokenizer, embedding_model, sample_text):
     assert all([chunk.sentences is not None for chunk in chunks])
 
 
-def test_spdm_chunker_empty_text(tokenizer, embedding_model):
+def test_spdm_chunker_empty_text(embedding_model):
     """Test that the SDPMChunker can handle empty text input."""
     chunker = SDPMChunker(
-        tokenizer=tokenizer,
         embedding_model=embedding_model,
         max_chunk_size=512,
         similarity_threshold=0.5,
@@ -73,10 +62,9 @@ def test_spdm_chunker_empty_text(tokenizer, embedding_model):
     assert len(chunks) == 0
 
 
-def test_spdm_chunker_single_sentence(tokenizer, embedding_model):
+def test_spdm_chunker_single_sentence(embedding_model):
     """Test that the SDPMChunker can handle text with a single sentence."""
     chunker = SDPMChunker(
-        tokenizer=tokenizer,
         embedding_model=embedding_model,
         max_chunk_size=512,
         similarity_threshold=0.5,
@@ -88,10 +76,9 @@ def test_spdm_chunker_single_sentence(tokenizer, embedding_model):
     assert len(chunks[0].sentences) == 1
 
 
-def test_spdm_chunker_repr(tokenizer, embedding_model):
+def test_spdm_chunker_repr(embedding_model):
     """Test that the SDPMChunker has a string representation."""
     chunker = SDPMChunker(
-        tokenizer=tokenizer,
         embedding_model=embedding_model,
         max_chunk_size=512,
         similarity_threshold=0.5,

diff --git a/tests/chunker/test_semantic_chunker.py b/tests/chunker/test_semantic_chunker.py
@@ -1,15 +1,9 @@
 import pytest
 from sentence_transformers import SentenceTransformer
-from tokenizers import Tokenizer
 
 from chonkie.chunker.semantic import SemanticChunk, SemanticChunker
 
 
-@pytest.fixture
-def tokenizer():
-    return Tokenizer.from_pretrained("gpt2")
-
-
 @pytest.fixture
 def sample_text():
     text = """The process of text chunking in RAG applications represents a delicate balance between competing requirements. On one side, we have the need for semantic coherence – ensuring that each chunk maintains meaningful context that can be understood and processed independently. On the other, we must optimize for information density, ensuring that each chunk carries sufficient signal without excessive noise that might impede retrieval accuracy. In this post, we explore the challenges of text chunking in RAG applications and propose a novel approach that leverages recent advances in transformer-based language models to achieve a more effective balance between these competing requirements."""
@@ -21,42 +15,37 @@ def embedding_model():
     return SentenceTransformer("all-MiniLM-L6-v2")
 
 
-def test_semantic_chunker_initialization(tokenizer, embedding_model):
+def test_semantic_chunker_initialization(embedding_model):
     """Test that the SemanticChunker can be initialized with required parameters."""
     chunker = SemanticChunker(
-        tokenizer=tokenizer,
         embedding_model=embedding_model,
         max_chunk_size=512,
         similarity_threshold=0.5,
     )
 
     assert chunker is not None
-    assert chunker.tokenizer == tokenizer
     assert chunker.max_chunk_size == 512
     assert chunker.similarity_threshold == 0.5
     assert chunker.initial_sentences == 1
 
 
-def test_semantic_chunker_initialization_sentence_transformer(tokenizer):
+def test_semantic_chunker_initialization_sentence_transformer():
     """Test that the SemanticChunker can be initialized with SentenceTransformer model."""
     chunker = SemanticChunker(
-        tokenizer=tokenizer,
         embedding_model="all-MiniLM-L6-v2",
         max_chunk_size=512,
         similarity_threshold=0.5,
     )
 
     assert chunker is not None
-    assert chunker.tokenizer == tokenizer
     assert chunker.max_chunk_size == 512
     assert chunker.similarity_threshold == 0.5
     assert chunker.initial_sentences == 1
 
 
-def test_semantic_chunker_chunking(tokenizer, embedding_model, sample_text):
+def test_semantic_chunker_chunking(embedding_model, sample_text):
     """Test that the SemanticChunker can chunk a sample text."""
     chunker = SemanticChunker(
-        tokenizer=tokenizer,
         embedding_model="all-MiniLM-L6-v2",
         max_chunk_size=512,
         similarity_threshold=0.5,
@@ -73,10 +62,9 @@ def test_semantic_chunker_chunking(tokenizer, embedding_model, sample_text):
     assert all([chunk.sentences is not None for chunk in chunks])
 
 
-def test_semantic_chunker_empty_text(tokenizer, embedding_model):
+def test_semantic_chunker_empty_text(embedding_model):
     """Test that the SemanticChunker can handle empty text input."""
     chunker = SemanticChunker(
-        tokenizer=tokenizer,
         embedding_model=embedding_model,
         max_chunk_size=512,
         similarity_threshold=0.5,
@@ -86,10 +74,9 @@ def test_semantic_chunker_empty_text(tokenizer, embedding_model):
     assert len(chunks) == 0
 
 
-def test_semantic_chunker_single_sentence(tokenizer, embedding_model):
+def test_semantic_chunker_single_sentence(embedding_model):
     """Test that the SemanticChunker can handle text with a single sentence."""
     chunker = SemanticChunker(
-        tokenizer=tokenizer,
         embedding_model=embedding_model,
         max_chunk_size=512,
         similarity_threshold=0.5,
@@ -101,10 +88,9 @@ def test_semantic_chunker_single_sentence(tokenizer, embedding_model):
     assert len(chunks[0].sentences) == 1
 
 
-def test_semantic_chunker_single_chunk_text(tokenizer, embedding_model):
+def test_semantic_chunker_single_chunk_text(embedding_model):
     """Test that the SemanticChunker can handle text that fits in a single chunk."""
     chunker = SemanticChunker(
-        tokenizer=tokenizer,
         embedding_model=embedding_model,
         max_chunk_size=512,
         similarity_threshold=0.5,
@@ -117,10 +103,9 @@ def test_semantic_chunker_single_chunk_text(tokenizer, embedding_model):
     assert len(chunks[0].sentences) == 2
 
 
-def test_semantic_chunker_repr(tokenizer, embedding_model):
+def test_semantic_chunker_repr(embedding_model):
     """Test that the SemanticChunker has a string representation."""
     chunker = SemanticChunker(
-        tokenizer=tokenizer,
         embedding_model=embedding_model,
         max_chunk_size=512,
         similarity_threshold=0.5,
@@ -133,10 +118,9 @@ def test_semantic_chunker_repr(tokenizer, embedding_model):
     assert repr(chunker) == expected
 
 
-def test_semantic_chunker_similarity_threshold(tokenizer, embedding_model):
+def test_semantic_chunker_similarity_threshold(embedding_model):
     """Test that the SemanticChunker respects similarity threshold."""
     chunker = SemanticChunker(
-        tokenizer=tokenizer,
         embedding_model=embedding_model,
         max_chunk_size=512,
         similarity_threshold=0.9,  # High threshold should create more chunks
@@ -151,10 +135,9 @@ def test_semantic_chunker_similarity_threshold(tokenizer, embedding_model):
     assert len(chunks) > 1
 
 
-def test_semantic_chunker_percentile_mode(tokenizer, embedding_model, sample_text):
+def test_semantic_chunker_percentile_mode(embedding_model, sample_text):
     """Test that the SemanticChunker works with percentile-based similarity."""
     chunker = SemanticChunker(
-        tokenizer=tokenizer,
         embedding_model=embedding_model,
         max_chunk_size=512,
         similarity_percentile=50,  # Use median similarity as threshold