Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update bm25Vectorizer #323

Merged
merged 9 commits into from
Jan 17, 2021
Merged
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 20 additions & 10 deletions pyserini/vectorizer/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,10 @@
from typing import List

from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize

from pyserini import index, search
from pyserini.analysis import Analyzer, get_lucene_analyzer
from tqdm import tqdm


class Vectorizer:
Expand All @@ -42,12 +43,15 @@ def __init__(self, lucene_index_path: str, min_df: int = 1, verbose: bool = Fals
self.index_reader = index.IndexReader(lucene_index_path)
self.searcher = search.SimpleSearcher(lucene_index_path)
self.num_docs: int = self.searcher.num_docs
self.stats = self.index_reader.stats()
self.analyzer = Analyzer(get_lucene_analyzer())

# build vocabulary
self.vocabulary_ = set()
for term in self.index_reader.terms():
if term.df > self.min_df:
self.vocabulary_.add(term.term)
self.vocabulary_ = sorted(self.vocabulary_)

# build term to index mapping
self.term_to_index = {}
Expand All @@ -58,6 +62,17 @@ def __init__(self, lucene_index_path: str, min_df: int = 1, verbose: bool = Fals
if self.verbose:
print(f'Found {self.vocabulary_size} terms with min_df={self.min_df}')

def get_query_vector(self, query: str):
    """Build a sparse term-count vector for a free-text query.

    The query is tokenized with ``self.analyzer``; every token that has a
    column in ``self.term_to_index`` contributes a count of 1 to that column.
    Tokens without a column are ignored.

    Parameters
    ----------
    query : str
        Raw query text.

    Returns
    -------
    scipy.sparse.csr_matrix
        A matrix of shape ``(1, self.vocabulary_size)``. Repeated tokens are
        summed, because ``csr_matrix`` adds up duplicate (row, col) entries
        when built from coordinate triplets.
    """
    # Look terms up in the term -> column dict rather than testing membership
    # in ``self.vocabulary_``: the vocabulary is a sorted list, so ``in`` on it
    # is a linear scan per token, and a second dict lookup followed anyway.
    # NOTE(review): assumes term_to_index holds exactly the vocabulary terms.
    matrix_col = []
    for term in self.analyzer.analyze(query):
        col = self.term_to_index.get(term)
        if col is not None:  # column 0 is valid, so compare against None
            matrix_col.append(col)

    # A query is a single row, so every entry has row index 0 and weight 1.
    matrix_row = [0] * len(matrix_col)
    matrix_data = [1] * len(matrix_col)
    return csr_matrix((matrix_data, (matrix_row, matrix_col)), shape=(1, self.vocabulary_size))


class TfidfVectorizer(Vectorizer):
"""Wrapper class for tf-idf vectorizer implemented on top of Pyserini.
Expand Down Expand Up @@ -95,10 +110,7 @@ def get_vectors(self, docids: List[str]):
matrix_row, matrix_col, matrix_data = [], [], []
num_docs = len(docids)

for index, doc_id in enumerate(docids):
if index % 1000 == 0 and num_docs > 1000 and self.verbose:
print(f'Vectorizing: {index}/{len(docids)}')

for index, doc_id in enumerate(tqdm(docids)):
# Term Frequency
tf = self.index_reader.get_document_vector(doc_id)
if tf is None:
Expand All @@ -115,7 +127,7 @@ def get_vectors(self, docids: List[str]):
matrix_data.append(tfidf)

vectors = csr_matrix((matrix_data, (matrix_row, matrix_col)), shape=(num_docs, self.vocabulary_size))
return normalize(vectors, norm='l2')
MXueguang marked this conversation as resolved.
Show resolved Hide resolved
return vectors


class BM25Vectorizer(Vectorizer):
Expand Down Expand Up @@ -150,9 +162,7 @@ def get_vectors(self, docids: List[str]):
matrix_row, matrix_col, matrix_data = [], [], []
num_docs = len(docids)

for index, doc_id in enumerate(docids):
if index % 1000 == 0 and num_docs > 1000 and self.verbose:
print(f'Vectorizing: {index}/{len(docids)}')
for index, doc_id in enumerate(tqdm(docids)):

# Term Frequency
tf = self.index_reader.get_document_vector(doc_id)
Expand All @@ -170,4 +180,4 @@ def get_vectors(self, docids: List[str]):
matrix_data.append(bm25_weight)

vectors = csr_matrix((matrix_data, (matrix_row, matrix_col)), shape=(num_docs, self.vocabulary_size))
return normalize(vectors, norm='l2')
return vectors