Computes query-doc score for a specific document (#1194)

castorini · May 14, 2020 · a65646f · a65646f
1 parent 6d48fb4
commit a65646f
Show file tree

Hide file tree

Showing 2 changed files with 137 additions and 8 deletions.
diff --git a/src/main/java/io/anserini/index/IndexReaderUtils.java b/src/main/java/io/anserini/index/IndexReaderUtils.java
@@ -18,6 +18,8 @@
 
 import io.anserini.analysis.AnalyzerUtils;
 import io.anserini.analysis.DefaultEnglishAnalyzer;
+import io.anserini.search.SearchArgs;
+import io.anserini.search.query.BagOfWordsQueryGenerator;
 import io.anserini.search.query.PhraseQueryGenerator;
 import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
 import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
@@ -46,6 +48,7 @@
 import org.apache.lucene.search.ConstantScoreQuery;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.similarities.BM25Similarity;
+import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.util.BytesRef;
@@ -464,20 +467,41 @@ public static String documentContents(IndexReader reader, String docid) {
   }
 
   /**
-   * Computes the BM25 weight of a term (prior to analysis) in a particular document.
+   * Computes the BM25 weight of an unanalyzed term in a particular document (with Anserini default parameters).
    *
    * @param reader index reader
    * @param docid collection docid
-   * @param term term (prior to analysis)
+   * @param term unanalyzed term
    * @return BM25 weight of the term in the specified document
    * @throws IOException if error encountered during query
    */
   public static float getBM25TermWeight(IndexReader reader, String docid, String term) throws IOException {
+    SearchArgs args = new SearchArgs();
+    return getBM25TermWeight(reader, docid, term, Float.parseFloat(args.bm25_k1[0]), Float.parseFloat(args.bm25_b[0]));
+  }
+
+  /**
+   * Computes the BM25 weight of an unanalyzed term in a particular document.
+   *
+   * @param reader index reader
+   * @param docid collection docid
+   * @param term unanalyzed term
+   * @param k1 k1 setting for BM25
+   * @param b b setting for BM25
+   * @return BM25 weight of the term in the specified document
+   * @throws IOException if error encountered during query
+   */
+  public static float getBM25TermWeight(IndexReader reader, String docid, String term, float k1, float b)
+      throws IOException {
+    // We compute the BM25 score by issuing a single-term query with an additional filter clause that restricts
+    // consideration to only the docid in question, and then returning the retrieval score.
+    //
+    // This implementation is inefficient, but as the advantage of using the existing Lucene similarity, which means
+    // that we don't need to copy the scoring function and keep it in sync wrt code updates.
+
     IndexSearcher searcher = new IndexSearcher(reader);
-    searcher.setSimilarity(new BM25Similarity());
+    searcher.setSimilarity(new BM25Similarity(k1, b));
 
-    // The way to compute the BM25 score is to issue a query with the exact docid and the
-    // term in question, and look at the retrieval score.
     Query filterQuery = new ConstantScoreQuery(new TermQuery(new Term(IndexArgs.ID, docid)));
     Query termQuery = new TermQuery(new Term(IndexArgs.CONTENTS, term));
     BooleanQuery.Builder builder = new BooleanQuery.Builder();
@@ -486,10 +510,80 @@ public static float getBM25TermWeight(IndexReader reader, String docid, String t
     Query finalQuery = builder.build();
     TopDocs rs = searcher.search(finalQuery, 1);
 
-    // The BM25 weight is the score of the first (and only) hit, but remember to remove 1 for the ConstantScoreQuery
-    return rs.scoreDocs.length == 0 ? Float.NaN : rs.scoreDocs[0].score - 1;
+    // The BM25 weight is the score of the first (and only) hit, but remember to remove 1 for the ConstantScoreQuery.
+    // If we get zero results, indicates that term isn't found in the document.
+    return rs.scoreDocs.length == 0 ? 0 : rs.scoreDocs[0].score - 1;
+  }
+
+  /**
+   * Computes the BM25 score of a document with respect to a query. Assumes default BM25 parameter settings and
+   * Anserini's default analyzer.
+   *
+   * @param reader index reader
+   * @param docid docid of the document to score
+   * @param q query
+   * @return the score of the document with respect to the query
+   * @throws IOException if error encountered during query
+   */
+  public static float computeQueryDocumentScore(IndexReader reader, String docid, String q) throws IOException {
+    return computeQueryDocumentScore(reader, docid, q, new BM25Similarity(), IndexCollection.DEFAULT_ANALYZER);
   }
 
+  /**
+   * Computes the score of a document with respect to a query given a scoring function. Assumes Anserini's default
+   * analyzer.
+   *
+   * @param reader index reader
+   * @param docid docid of the document to score
+   * @param q query
+   * @param similarity scoring function
+   * @return the score of the document with respect to the query
+   * @throws IOException if error encountered during query
+   */
+  public static float computeQueryDocumentScore(IndexReader reader, String docid, String q, Similarity similarity)
+      throws IOException {
+    return computeQueryDocumentScore(reader, docid, q, similarity, IndexCollection.DEFAULT_ANALYZER);
+  }
+
+  /**
+   * Computes the score of a document with respect to a query given a scoring function and an analyzer.
+   *
+   * @param reader index reader
+   * @param docid docid of the document to score
+   * @param q query
+   * @param similarity scoring function
+   * @param analyzer analyzer to use
+   * @return the score of the document with respect to the query
+   * @throws IOException if error encountered during query
+   */
+  public static float computeQueryDocumentScore(IndexReader reader, String docid, String q,
+                                                Similarity similarity, Analyzer analyzer) throws IOException {
+    // We compute the query-document score by issuing the query with an additional filter clause that restricts
+    // consideration to only the docid in question, and then returning the retrieval score.
+    //
+    // This implementation is inefficient, but as the advantage of using the existing Lucene similarity, which means
+    // that we don't need to copy the scoring function and keep it in sync wrt code updates.
+
+    IndexSearcher searcher = new IndexSearcher(reader);
+    searcher.setSimilarity(similarity);
+
+    Query query = new BagOfWordsQueryGenerator().buildQuery(IndexArgs.CONTENTS, analyzer, q);
+
+    Query filterQuery = new ConstantScoreQuery(new TermQuery(new Term(IndexArgs.ID, docid)));
+    BooleanQuery.Builder builder = new BooleanQuery.Builder();
+    builder.add(filterQuery, BooleanClause.Occur.MUST);
+    builder.add(query, BooleanClause.Occur.MUST);
+    Query finalQuery = builder.build();
+
+    TopDocs rs = searcher.search(finalQuery, 1);
+
+    // We want the score of the first (and only) hit, but remember to remove 1 for the ConstantScoreQuery.
+    // If we get zero results, indicates that term isn't found in the document.
+    return rs.scoreDocs.length == 0 ? 0 : rs.scoreDocs[0].score - 1;
+  }
+
+  // TODO: Write a variant of computeQueryDocumentScore that takes a set of documents.
+
   public static void dumpDocumentVectors(IndexReader reader, String reqDocidsPath, DocumentVectorWeight weight) throws IOException {
     String outFileName = weight == null ? reqDocidsPath+".docvector.tar.gz" : reqDocidsPath+".docvector." + weight +".tar.gz";
     LOG.info("Start dump document vectors with weight " + weight);

diff --git a/src/test/java/io/anserini/index/IndexReaderUtilsTest.java b/src/test/java/io/anserini/index/IndexReaderUtilsTest.java
@@ -19,6 +19,8 @@
 import io.anserini.IndexerTestBase;
 import io.anserini.analysis.AnalyzerUtils;
 import io.anserini.analysis.DefaultEnglishAnalyzer;
+import io.anserini.search.SearchArgs;
+import io.anserini.search.SimpleSearcher;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.MultiTerms;
@@ -29,11 +31,13 @@
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.similarities.BM25Similarity;
+import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.util.BytesRef;
 import org.junit.Test;
 
+import java.nio.file.Paths;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
@@ -272,6 +276,9 @@ public void testPostingsLists2() throws Exception {
 
   @Test
   public void computeAllTermBM25Weights() throws Exception {
+    SearchArgs args = new SearchArgs();
+    Similarity similarity = new BM25Similarity(Float.parseFloat(args.bm25_k1[0]), Float.parseFloat(args.bm25_b[0]));
+
     Directory dir = FSDirectory.open(tempDir1);
     IndexReader reader = DirectoryReader.open(dir);
 
@@ -286,7 +293,7 @@ public void computeAllTermBM25Weights() throws Exception {
       String term = text.utf8ToString();
 
       IndexSearcher searcher = new IndexSearcher(reader);
-      searcher.setSimilarity(new BM25Similarity());
+      searcher.setSimilarity(similarity);
 
       TopDocs rs = searcher.search(new TermQuery(new Term("contents", term)), 3);
       for (int i=0; i<rs.scoreDocs.length; i++) {
@@ -434,4 +441,32 @@ public void testDocidConversion() throws Exception {
     reader.close();
     dir.close();
   }
+
+  @Test
+  public void testComputeQueryDocumentScore() throws Exception {
+    SimpleSearcher searcher = new SimpleSearcher(tempDir1.toString());
+    Directory dir = FSDirectory.open(tempDir1);
+    IndexReader reader = DirectoryReader.open(dir);
+    Similarity similarity = new BM25Similarity(0.9f, 0.4f);
+
+    // A bunch of test queries...
+    String[] queries = {"text city", "text", "city"};
+
+    for (String query: queries) {
+      SimpleSearcher.Result[] results = searcher.search(query);
+
+      // Strategy is to loop over the results, compute query-document score individually, and compare.
+      for (int i = 0; i < results.length; i++) {
+        float score = IndexReaderUtils.computeQueryDocumentScore(reader, results[i].docid, query, similarity);
+        assertEquals(score, results[i].score, 10e-5);
+      }
+
+      // This is hard coded - doc3 isn't retrieved by any of the queries.
+      assertEquals(0.0f,
+          IndexReaderUtils.computeQueryDocumentScore(reader, "doc3", query, similarity), 10e-6);
+    }
+
+    reader.close();
+    dir.close();
+  }
 }