castorini · MXueguang · Jan 17, 2021 · Jan 13, 2021 · Jan 15, 2021 · Jan 15, 2021
diff --git a/pyserini/vectorizer/_base.py b/pyserini/vectorizer/_base.py
@@ -18,9 +18,10 @@
 from typing import List
 
 from scipy.sparse import csr_matrix
-from sklearn.preprocessing import normalize
 
 from pyserini import index, search
+from pyserini.analysis import Analyzer, get_lucene_analyzer
+from tqdm import tqdm
 
 
 class Vectorizer:
@@ -42,12 +43,15 @@ def __init__(self, lucene_index_path: str, min_df: int = 1, verbose: bool = Fals
         self.index_reader = index.IndexReader(lucene_index_path)
         self.searcher = search.SimpleSearcher(lucene_index_path)
         self.num_docs: int = self.searcher.num_docs
+        self.stats = self.index_reader.stats()
+        self.analyzer = Analyzer(get_lucene_analyzer())
 
         # build vocabulary
         self.vocabulary_ = set()
         for term in self.index_reader.terms():
             if term.df > self.min_df:
                 self.vocabulary_.add(term.term)
+        self.vocabulary_ = sorted(self.vocabulary_)
 
         # build term to index mapping
         self.term_to_index = {}
@@ -58,6 +62,17 @@ def __init__(self, lucene_index_path: str, min_df: int = 1, verbose: bool = Fals
         if self.verbose:
             print(f'Found {self.vocabulary_size} terms with min_df={self.min_df}')
 
+    def get_query_vector(self, query: str):
+        """Analyze `query` and return a (1, vocabulary_size) CSR row of in-vocabulary term counts."""
+        matrix_col = []
+        for term in self.analyzer.analyze(query):
+            # vocabulary_ is a sorted list, so `in` on it is a linear scan; probe the dict once instead.
+            col = self.term_to_index.get(term)  # assumes its keys are exactly vocabulary_ -- TODO confirm
+            if col is not None:
+                matrix_col.append(col)
+        data, rows = [1] * len(matrix_col), [0] * len(matrix_col)  # repeated terms are summed by csr_matrix
+        return csr_matrix((data, (rows, matrix_col)), shape=(1, self.vocabulary_size))
+
 
 class TfidfVectorizer(Vectorizer):
     """Wrapper class for tf-idf vectorizer implemented on top of Pyserini.
@@ -95,10 +110,7 @@ def get_vectors(self, docids: List[str]):
         matrix_row, matrix_col, matrix_data = [], [], []
         num_docs = len(docids)
 
-        for index, doc_id in enumerate(docids):
-            if index % 1000 == 0 and num_docs > 1000 and self.verbose:
-                print(f'Vectorizing: {index}/{len(docids)}')
-
+        for index, doc_id in enumerate(tqdm(docids)):
             # Term Frequency
             tf = self.index_reader.get_document_vector(doc_id)
             if tf is None:
@@ -115,7 +127,7 @@ def get_vectors(self, docids: List[str]):
                 matrix_data.append(tfidf)
 
         vectors = csr_matrix((matrix_data, (matrix_row, matrix_col)), shape=(num_docs, self.vocabulary_size))
-        return normalize(vectors, norm='l2')
+        return vectors
 
 
 class BM25Vectorizer(Vectorizer):
@@ -150,9 +162,7 @@ def get_vectors(self, docids: List[str]):
         matrix_row, matrix_col, matrix_data = [], [], []
         num_docs = len(docids)
 
-        for index, doc_id in enumerate(docids):
-            if index % 1000 == 0 and num_docs > 1000 and self.verbose:
-                print(f'Vectorizing: {index}/{len(docids)}')
+        for index, doc_id in enumerate(tqdm(docids)):
 
             # Term Frequency
             tf = self.index_reader.get_document_vector(doc_id)
@@ -170,4 +180,4 @@ def get_vectors(self, docids: List[str]):
                 matrix_data.append(bm25_weight)
 
         vectors = csr_matrix((matrix_data, (matrix_row, matrix_col)), shape=(num_docs, self.vocabulary_size))
-        return normalize(vectors, norm='l2')
+        return vectors