Skip to content

Commit

Permalink
Computes query-doc score for a specific document (#1194)
Browse files Browse the repository at this point in the history
  • Loading branch information
lintool authored May 14, 2020
1 parent 6d48fb4 commit a65646f
Show file tree
Hide file tree
Showing 2 changed files with 137 additions and 8 deletions.
108 changes: 101 additions & 7 deletions src/main/java/io/anserini/index/IndexReaderUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

import io.anserini.analysis.AnalyzerUtils;
import io.anserini.analysis.DefaultEnglishAnalyzer;
import io.anserini.search.SearchArgs;
import io.anserini.search.query.BagOfWordsQueryGenerator;
import io.anserini.search.query.PhraseQueryGenerator;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
Expand Down Expand Up @@ -46,6 +48,7 @@
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
Expand Down Expand Up @@ -464,20 +467,41 @@ public static String documentContents(IndexReader reader, String docid) {
}

/**
 * Returns the BM25 weight of an unanalyzed term in a particular document, scored with Anserini's default BM25
 * parameters (the {@code bm25_k1} and {@code bm25_b} values of a freshly constructed {@link SearchArgs}).
 *
 * @param reader index reader
 * @param docid collection docid
 * @param term unanalyzed term
 * @return BM25 weight of the term in the specified document
 * @throws IOException if error encountered during query
 */
public static float getBM25TermWeight(IndexReader reader, String docid, String term) throws IOException {
  // The first entry of each parameter array on a default SearchArgs supplies the k1 and b settings.
  final SearchArgs defaults = new SearchArgs();
  final float k1 = Float.parseFloat(defaults.bm25_k1[0]);
  final float b = Float.parseFloat(defaults.bm25_b[0]);

  return getBM25TermWeight(reader, docid, term, k1, b);
}

/**
* Computes the BM25 weight of an unanalyzed term in a particular document.
*
* @param reader index reader
* @param docid collection docid
* @param term unanalyzed term
* @param k1 k1 setting for BM25
* @param b b setting for BM25
* @return BM25 weight of the term in the specified document
* @throws IOException if error encountered during query
*/
public static float getBM25TermWeight(IndexReader reader, String docid, String term, float k1, float b)
throws IOException {
// We compute the BM25 score by issuing a single-term query with an additional filter clause that restricts
// consideration to only the docid in question, and then returning the retrieval score.
//
// This implementation is inefficient, but has the advantage of using the existing Lucene similarity, which means
// that we don't need to copy the scoring function and keep it in sync wrt code updates.

IndexSearcher searcher = new IndexSearcher(reader);
searcher.setSimilarity(new BM25Similarity());
searcher.setSimilarity(new BM25Similarity(k1, b));

// The way to compute the BM25 score is to issue a query with the exact docid and the
// term in question, and look at the retrieval score.
Query filterQuery = new ConstantScoreQuery(new TermQuery(new Term(IndexArgs.ID, docid)));
Query termQuery = new TermQuery(new Term(IndexArgs.CONTENTS, term));
BooleanQuery.Builder builder = new BooleanQuery.Builder();
Expand All @@ -486,10 +510,80 @@ public static float getBM25TermWeight(IndexReader reader, String docid, String t
Query finalQuery = builder.build();
TopDocs rs = searcher.search(finalQuery, 1);

// The BM25 weight is the score of the first (and only) hit, but remember to remove 1 for the ConstantScoreQuery
return rs.scoreDocs.length == 0 ? Float.NaN : rs.scoreDocs[0].score - 1;
// The BM25 weight is the score of the first (and only) hit, but remember to remove 1 for the ConstantScoreQuery.
// If we get zero results, indicates that term isn't found in the document.
return rs.scoreDocs.length == 0 ? 0 : rs.scoreDocs[0].score - 1;
}

/**
 * Computes the BM25 score of a document with respect to a query. Uses Anserini's default BM25 parameter settings
 * (the {@code bm25_k1} and {@code bm25_b} values of a default {@link SearchArgs}, i.e. the same settings the
 * three-argument {@code getBM25TermWeight} applies) and Anserini's default analyzer.
 *
 * @param reader index reader
 * @param docid docid of the document to score
 * @param q query
 * @return the score of the document with respect to the query
 * @throws IOException if error encountered during query
 */
public static float computeQueryDocumentScore(IndexReader reader, String docid, String q) throws IOException {
  // A no-arg BM25Similarity would apply Lucene's built-in k1 and b instead of Anserini's defaults, so the
  // similarity is built explicitly from the SearchArgs defaults.
  SearchArgs args = new SearchArgs();
  float k1 = Float.parseFloat(args.bm25_k1[0]);
  float b = Float.parseFloat(args.bm25_b[0]);

  return computeQueryDocumentScore(reader, docid, q, new BM25Similarity(k1, b), IndexCollection.DEFAULT_ANALYZER);
}

/**
 * Scores a document against a query with the supplied scoring function. The query is analyzed with Anserini's
 * default analyzer.
 *
 * @param reader index reader
 * @param docid docid of the document to score
 * @param q query
 * @param similarity scoring function
 * @return the score of the document with respect to the query
 * @throws IOException if error encountered during query
 */
public static float computeQueryDocumentScore(
    IndexReader reader, String docid, String q, Similarity similarity) throws IOException {
  // Delegate to the fully parameterized overload, filling in the default analyzer.
  return computeQueryDocumentScore(reader, docid, q, similarity, IndexCollection.DEFAULT_ANALYZER);
}

/**
 * Computes the score of a document with respect to a query given a scoring function and an analyzer.
 *
 * @param reader index reader
 * @param docid docid of the document to score
 * @param q query
 * @param similarity scoring function
 * @param analyzer analyzer to use
 * @return the score of the document with respect to the query, or 0 if the restricted search returns no hit
 * @throws IOException if error encountered during query
 */
public static float computeQueryDocumentScore(IndexReader reader, String docid, String q,
    Similarity similarity, Analyzer analyzer) throws IOException {
  // We compute the query-document score by issuing the query with an additional filter clause that restricts
  // consideration to only the docid in question, and then returning the retrieval score.
  //
  // This implementation is inefficient, but has the advantage of using the existing Lucene similarity, which means
  // that we don't need to copy the scoring function and keep it in sync wrt code updates.

  IndexSearcher searcher = new IndexSearcher(reader);
  searcher.setSimilarity(similarity);

  Query query = new BagOfWordsQueryGenerator().buildQuery(IndexArgs.CONTENTS, analyzer, q);

  // The docid restriction is a FILTER clause: it must match but does not participate in scoring. This avoids
  // adding a constant 1 to the score and subtracting it afterwards, which loses float precision on small scores.
  Query finalQuery = new BooleanQuery.Builder()
      .add(new TermQuery(new Term(IndexArgs.ID, docid)), BooleanClause.Occur.FILTER)
      .add(query, BooleanClause.Occur.MUST)
      .build();

  TopDocs rs = searcher.search(finalQuery, 1);

  // We want the score of the first (and only) hit. Zero results means the restricted query matched nothing
  // (the document does not match the query, or no document carries this docid), which we report as a score of 0.
  return rs.scoreDocs.length == 0 ? 0 : rs.scoreDocs[0].score;
}

// TODO: Write a variant of computeQueryDocumentScore that takes a set of documents.

public static void dumpDocumentVectors(IndexReader reader, String reqDocidsPath, DocumentVectorWeight weight) throws IOException {
String outFileName = weight == null ? reqDocidsPath+".docvector.tar.gz" : reqDocidsPath+".docvector." + weight +".tar.gz";
LOG.info("Start dump document vectors with weight " + weight);
Expand Down
37 changes: 36 additions & 1 deletion src/test/java/io/anserini/index/IndexReaderUtilsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
import io.anserini.IndexerTestBase;
import io.anserini.analysis.AnalyzerUtils;
import io.anserini.analysis.DefaultEnglishAnalyzer;
import io.anserini.search.SearchArgs;
import io.anserini.search.SimpleSearcher;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiTerms;
Expand All @@ -29,11 +31,13 @@
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.junit.Test;

import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
Expand Down Expand Up @@ -272,6 +276,9 @@ public void testPostingsLists2() throws Exception {

@Test
public void computeAllTermBM25Weights() throws Exception {
SearchArgs args = new SearchArgs();
Similarity similarity = new BM25Similarity(Float.parseFloat(args.bm25_k1[0]), Float.parseFloat(args.bm25_b[0]));

Directory dir = FSDirectory.open(tempDir1);
IndexReader reader = DirectoryReader.open(dir);

Expand All @@ -286,7 +293,7 @@ public void computeAllTermBM25Weights() throws Exception {
String term = text.utf8ToString();

IndexSearcher searcher = new IndexSearcher(reader);
searcher.setSimilarity(new BM25Similarity());
searcher.setSimilarity(similarity);

TopDocs rs = searcher.search(new TermQuery(new Term("contents", term)), 3);
for (int i=0; i<rs.scoreDocs.length; i++) {
Expand Down Expand Up @@ -434,4 +441,32 @@ public void testDocidConversion() throws Exception {
reader.close();
dir.close();
}

/**
 * Checks that {@code IndexReaderUtils.computeQueryDocumentScore} reproduces, document by document, the scores
 * that {@code SimpleSearcher} reports for the same query, and yields 0 for a document the query does not retrieve.
 */
@Test
public void testComputeQueryDocumentScore() throws Exception {
  // NOTE(review): 0.9/0.4 are presumably the BM25 settings SimpleSearcher scores with by default - confirm.
  Similarity similarity = new BM25Similarity(0.9f, 0.4f);

  // A bunch of test queries...
  String[] queries = {"text city", "text", "city"};

  // try-with-resources closes the reader, the directory and the searcher (in that order) even when an
  // assertion fails part-way through.
  try (SimpleSearcher searcher = new SimpleSearcher(tempDir1.toString());
       Directory dir = FSDirectory.open(tempDir1);
       IndexReader reader = DirectoryReader.open(dir)) {
    for (String query : queries) {
      SimpleSearcher.Result[] results = searcher.search(query);

      // Strategy is to loop over the results, compute query-document score individually, and compare.
      for (SimpleSearcher.Result result : results) {
        float score = IndexReaderUtils.computeQueryDocumentScore(reader, result.docid, query, similarity);
        // Expected value (the searcher's score) goes first so failure messages read correctly.
        assertEquals(result.score, score, 10e-5);
      }

      // This is hard coded - doc3 isn't retrieved by any of the queries.
      assertEquals(0.0f,
          IndexReaderUtils.computeQueryDocumentScore(reader, "doc3", query, similarity), 10e-6);
    }
  }
}
}

0 comments on commit a65646f

Please sign in to comment.