Skip to content

Commit

Permalink
Refactor SDPMChunker and SemanticChunker to remove tokenizer dependen…
Browse files Browse the repository at this point in the history
…cy and improve documentation
  • Loading branch information
bhavnicksm committed Nov 12, 2024
1 parent 247b872 commit 75132ce
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 81 deletions.
11 changes: 3 additions & 8 deletions DOCS.md
Original file line number Diff line number Diff line change
Expand Up @@ -266,14 +266,12 @@ chunker = SentenceChunker(

The `SemanticChunker` groups content by semantic similarity. The implementation is inspired by the semantic chunking approach described in the [FullStackRetrieval Tutorials](https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb), with modifications and optimizations for better performance and integration with Chonkie's architecture.

This version of `SemanticChunker` has some optimizations that speed it up considerably, but it assumes that the `tokenizer` you use is the same as the one used by the `embedding_model`. This is a valid assumption since, more often than not, `chunk_size` (and hence `token_count`) depends on the `embedding_model`'s context size rather than on the generative model's context length.

```python
from chonkie import SemanticChunker
from autotiktokenizer import AutoTikTokenizer

tokenizer = AutoTikTokenizer.from_pretrained("gpt2")

chunker = SemanticChunker(
tokenizer=tokenizer,
embedding_model="all-minilm-l6-v2",
max_chunk_size=512,
similarity_threshold=0.7
Expand All @@ -283,6 +281,7 @@ chunker = SemanticChunker(
**key parameters:**

- `embedding_model`: model for semantic embeddings. This can either be a `str` or a `SentenceTransformer` model. If a `str` is passed, it uses `SentenceTransformer` to load it.
- `max_chunk_size`: maximum number of tokens allowed in each chunk returned by `SemanticChunker`
- `similarity_threshold`: threshold for semantic grouping

## SDPMChunker
Expand All @@ -291,12 +290,8 @@ the `SDPMChunker` groups content via the semantic double-pass merging method, wh

```python
from chonkie import SDPMChunker
from autotiktokenizer import AutoTikTokenizer

tokenizer = AutoTikTokenizer.from_pretrained("gpt2")

chunker = SDPMChunker(
tokenizer=tokenizer,
embedding_model="all-minilm-l6-v2",
max_chunk_size=512,
similarity_threshold=0.7,
Expand Down
23 changes: 20 additions & 3 deletions src/chonkie/chunker/sdpm.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,23 @@


class SDPMChunker(SemanticChunker):
"""Chunker implementation using the Semantic Double-Pass Merging (SDPM) method.
The SDPM approach involves three main steps:
1. Grouping sentences by semantic similarity (Same as SemanticChunker)
2. Merging similar groups with a skip window
3. Splitting the merged groups into size-appropriate chunks
Args:
embedding_model: Sentence embedding model to use
similarity_threshold: Minimum similarity score to consider sentences similar
similarity_percentile: Minimum similarity percentile to consider sentences similar
max_chunk_size: Maximum token count for a chunk
initial_sentences: Number of sentences to consider for initial grouping
skip_window: Number of chunks to skip when looking for similarities
"""
def __init__(
self,
tokenizer: Union[str, Any] = "gpt2",
embedding_model: Union[str, Any] = "sentence-transformers/all-MiniLM-L6-v2",
similarity_threshold: float = None,
similarity_percentile: float = None,
Expand All @@ -17,11 +31,14 @@ def __init__(
"""Initialize the SDPMChunker.
Args:
Same as SemanticChunker, plus:
embedding_model: Sentence embedding model to use
similarity_threshold: Minimum similarity score to consider sentences similar
similarity_percentile: Minimum similarity percentile to consider sentences similar
max_chunk_size: Maximum token count for a chunk
initial_sentences: Number of sentences to consider for initial grouping
skip_window: Number of chunks to skip when looking for similarities
"""
super().__init__(
tokenizer=tokenizer,
embedding_model=embedding_model,
max_chunk_size=max_chunk_size,
similarity_threshold=similarity_threshold,
Expand Down
32 changes: 6 additions & 26 deletions src/chonkie/chunker/semantic.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ class SemanticChunk(SentenceChunk):
class SemanticChunker(BaseChunker):
def __init__(
self,
tokenizer: Union[str, Any] = "gpt2",
embedding_model: Union[str, Any] = "sentence-transformers/all-MiniLM-L6-v2",
similarity_threshold: Optional[float] = None,
similarity_percentile: Optional[float] = None,
Expand All @@ -43,7 +42,6 @@ def __init__(
SemanticChunkers split text into semantically coherent chunks using embeddings.
Args:
tokenizer: Tokenizer for counting tokens
embedding_model: Name of the sentence-transformers model to load
max_chunk_size: Maximum tokens allowed per chunk
similarity_threshold: Absolute threshold for semantic similarity (0-1)
Expand All @@ -54,8 +52,6 @@ def __init__(
ValueError: If parameters are invalid
ImportError: If required dependencies aren't installed
"""
super().__init__(tokenizer)

if max_chunk_size <= 0:
raise ValueError("max_chunk_size must be positive")
if similarity_threshold is not None and (
Expand Down Expand Up @@ -88,6 +84,11 @@ def __init__(
)
else:
self.embedding_model = embedding_model

# Keeping the tokenizer the same as the sentence model is important
# for the group semantic meaning to be calculated properly
tokenizer = self.embedding_model.tokenizer
super().__init__(tokenizer)

def _import_sentence_transformers(self) -> Any:
"""Import sentence-transformers library. Imports mentioned inside the class,
Expand Down Expand Up @@ -197,27 +198,6 @@ def _split_sentences(self, text: str) -> List[str]:
# Split into sentences and clean up
sentences = [s.strip() for s in text.split('\n') if s.strip()]

# Get token counts for sentences
# token_counts = self._get_token_counts(sentences)

# # Create Sentence objects
# result_sentences = []
# current_pos = 0
# for sent, token_count in zip(sentences, token_counts):
# # Find the actual position in original text
# start_idx = text.find(sent, current_pos)
# end_idx = start_idx + len(sent)
# current_pos = end_idx

# result_sentences.append(
# Sentence(
# text=sent,
# start_index=start_idx,
# end_index=end_idx,
# token_count=token_count
# )
# )

return sentences

def _compute_similarity_threshold(self, all_similarities: List[float]) -> float:
Expand Down Expand Up @@ -445,4 +425,4 @@ def __repr__(self) -> str:
f"SemanticChunker(max_chunk_size={self.max_chunk_size}, "
f"{threshold_info}, "
f"initial_sentences={self.initial_sentences})"
)
)
23 changes: 5 additions & 18 deletions tests/chunker/test_sdpm_chunker.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,9 @@
import pytest
from sentence_transformers import SentenceTransformer
from tokenizers import Tokenizer

from chonkie.chunker.sdpm import SDPMChunker
from chonkie.chunker.semantic import SemanticChunk


@pytest.fixture
def tokenizer():
return Tokenizer.from_pretrained("gpt2")


@pytest.fixture
def sample_text():
text = """The process of text chunking in RAG applications represents a delicate balance between competing requirements. On one side, we have the need for semantic coherence – ensuring that each chunk maintains meaningful context that can be understood and processed independently. On the other, we must optimize for information density, ensuring that each chunk carries sufficient signal without excessive noise that might impede retrieval accuracy. In this post, we explore the challenges of text chunking in RAG applications and propose a novel approach that leverages recent advances in transformer-based language models to achieve a more effective balance between these competing requirements."""
Expand All @@ -22,28 +15,25 @@ def embedding_model():
return SentenceTransformer("all-MiniLM-L6-v2")


def test_spdm_chunker_initialization(tokenizer, embedding_model):
def test_spdm_chunker_initialization(embedding_model):
"""Test that the SDPMChunker can be initialized with required parameters."""
chunker = SDPMChunker(
tokenizer=tokenizer,
embedding_model=embedding_model,
max_chunk_size=512,
similarity_threshold=0.5,
skip_window=2,
)

assert chunker is not None
assert chunker.tokenizer == tokenizer
assert chunker.max_chunk_size == 512
assert chunker.similarity_threshold == 0.5
assert chunker.initial_sentences == 1
assert chunker.skip_window == 2


def test_spdm_chunker_chunking(tokenizer, embedding_model, sample_text):
def test_spdm_chunker_chunking(embedding_model, sample_text):
"""Test that the SDPMChunker can chunk a sample text."""
chunker = SDPMChunker(
tokenizer=tokenizer,
embedding_model=embedding_model,
max_chunk_size=512,
similarity_threshold=0.5,
Expand All @@ -60,10 +50,9 @@ def test_spdm_chunker_chunking(tokenizer, embedding_model, sample_text):
assert all([chunk.sentences is not None for chunk in chunks])


def test_spdm_chunker_empty_text(tokenizer, embedding_model):
def test_spdm_chunker_empty_text(embedding_model):
"""Test that the SDPMChunker can handle empty text input."""
chunker = SDPMChunker(
tokenizer=tokenizer,
embedding_model=embedding_model,
max_chunk_size=512,
similarity_threshold=0.5,
Expand All @@ -73,10 +62,9 @@ def test_spdm_chunker_empty_text(tokenizer, embedding_model):
assert len(chunks) == 0


def test_spdm_chunker_single_sentence(tokenizer, embedding_model):
def test_spdm_chunker_single_sentence(embedding_model):
"""Test that the SDPMChunker can handle text with a single sentence."""
chunker = SDPMChunker(
tokenizer=tokenizer,
embedding_model=embedding_model,
max_chunk_size=512,
similarity_threshold=0.5,
Expand All @@ -88,10 +76,9 @@ def test_spdm_chunker_single_sentence(tokenizer, embedding_model):
assert len(chunks[0].sentences) == 1


def test_spdm_chunker_repr(tokenizer, embedding_model):
def test_spdm_chunker_repr(embedding_model):
"""Test that the SDPMChunker has a string representation."""
chunker = SDPMChunker(
tokenizer=tokenizer,
embedding_model=embedding_model,
max_chunk_size=512,
similarity_threshold=0.5,
Expand Down
35 changes: 9 additions & 26 deletions tests/chunker/test_semantic_chunker.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,9 @@
import pytest
from sentence_transformers import SentenceTransformer
from tokenizers import Tokenizer

from chonkie.chunker.semantic import SemanticChunk, SemanticChunker


@pytest.fixture
def tokenizer():
return Tokenizer.from_pretrained("gpt2")


@pytest.fixture
def sample_text():
text = """The process of text chunking in RAG applications represents a delicate balance between competing requirements. On one side, we have the need for semantic coherence – ensuring that each chunk maintains meaningful context that can be understood and processed independently. On the other, we must optimize for information density, ensuring that each chunk carries sufficient signal without excessive noise that might impede retrieval accuracy. In this post, we explore the challenges of text chunking in RAG applications and propose a novel approach that leverages recent advances in transformer-based language models to achieve a more effective balance between these competing requirements."""
Expand All @@ -21,42 +15,37 @@ def embedding_model():
return SentenceTransformer("all-MiniLM-L6-v2")


def test_semantic_chunker_initialization(tokenizer, embedding_model):
def test_semantic_chunker_initialization(embedding_model):
"""Test that the SemanticChunker can be initialized with required parameters."""
chunker = SemanticChunker(
tokenizer=tokenizer,
embedding_model=embedding_model,
max_chunk_size=512,
similarity_threshold=0.5,
)

assert chunker is not None
assert chunker.tokenizer == tokenizer
assert chunker.max_chunk_size == 512
assert chunker.similarity_threshold == 0.5
assert chunker.initial_sentences == 1


def test_semantic_chunker_initialization_sentence_transformer(tokenizer):
def test_semantic_chunker_initialization_sentence_transformer():
"""Test that the SemanticChunker can be initialized with SentenceTransformer model."""
chunker = SemanticChunker(
tokenizer=tokenizer,
embedding_model="all-MiniLM-L6-v2",
max_chunk_size=512,
similarity_threshold=0.5,
)

assert chunker is not None
assert chunker.tokenizer == tokenizer
assert chunker.max_chunk_size == 512
assert chunker.similarity_threshold == 0.5
assert chunker.initial_sentences == 1


def test_semantic_chunker_chunking(tokenizer, embedding_model, sample_text):
def test_semantic_chunker_chunking(embedding_model, sample_text):
"""Test that the SemanticChunker can chunk a sample text."""
chunker = SemanticChunker(
tokenizer=tokenizer,
embedding_model="all-MiniLM-L6-v2",
max_chunk_size=512,
similarity_threshold=0.5,
Expand All @@ -73,10 +62,9 @@ def test_semantic_chunker_chunking(tokenizer, embedding_model, sample_text):
assert all([chunk.sentences is not None for chunk in chunks])


def test_semantic_chunker_empty_text(tokenizer, embedding_model):
def test_semantic_chunker_empty_text(embedding_model):
"""Test that the SemanticChunker can handle empty text input."""
chunker = SemanticChunker(
tokenizer=tokenizer,
embedding_model=embedding_model,
max_chunk_size=512,
similarity_threshold=0.5,
Expand All @@ -86,10 +74,9 @@ def test_semantic_chunker_empty_text(tokenizer, embedding_model):
assert len(chunks) == 0


def test_semantic_chunker_single_sentence(tokenizer, embedding_model):
def test_semantic_chunker_single_sentence(embedding_model):
"""Test that the SemanticChunker can handle text with a single sentence."""
chunker = SemanticChunker(
tokenizer=tokenizer,
embedding_model=embedding_model,
max_chunk_size=512,
similarity_threshold=0.5,
Expand All @@ -101,10 +88,9 @@ def test_semantic_chunker_single_sentence(tokenizer, embedding_model):
assert len(chunks[0].sentences) == 1


def test_semantic_chunker_single_chunk_text(tokenizer, embedding_model):
def test_semantic_chunker_single_chunk_text(embedding_model):
"""Test that the SemanticChunker can handle text that fits in a single chunk."""
chunker = SemanticChunker(
tokenizer=tokenizer,
embedding_model=embedding_model,
max_chunk_size=512,
similarity_threshold=0.5,
Expand All @@ -117,10 +103,9 @@ def test_semantic_chunker_single_chunk_text(tokenizer, embedding_model):
assert len(chunks[0].sentences) == 2


def test_semantic_chunker_repr(tokenizer, embedding_model):
def test_semantic_chunker_repr(embedding_model):
"""Test that the SemanticChunker has a string representation."""
chunker = SemanticChunker(
tokenizer=tokenizer,
embedding_model=embedding_model,
max_chunk_size=512,
similarity_threshold=0.5,
Expand All @@ -133,10 +118,9 @@ def test_semantic_chunker_repr(tokenizer, embedding_model):
assert repr(chunker) == expected


def test_semantic_chunker_similarity_threshold(tokenizer, embedding_model):
def test_semantic_chunker_similarity_threshold(embedding_model):
"""Test that the SemanticChunker respects similarity threshold."""
chunker = SemanticChunker(
tokenizer=tokenizer,
embedding_model=embedding_model,
max_chunk_size=512,
similarity_threshold=0.9, # High threshold should create more chunks
Expand All @@ -151,10 +135,9 @@ def test_semantic_chunker_similarity_threshold(tokenizer, embedding_model):
assert len(chunks) > 1


def test_semantic_chunker_percentile_mode(tokenizer, embedding_model, sample_text):
def test_semantic_chunker_percentile_mode(embedding_model, sample_text):
"""Test that the SemanticChunker works with percentile-based similarity."""
chunker = SemanticChunker(
tokenizer=tokenizer,
embedding_model=embedding_model,
max_chunk_size=512,
similarity_percentile=50, # Use median similarity as threshold
Expand Down

0 comments on commit 75132ce

Please sign in to comment.