Skip to content

Commit

Permalink
Support Chinese indexing and search (castorini#804)
Browse files Browse the repository at this point in the history
+ Add CJKAnalyzer in the Indexing class.
+ Add language argument in indexing argument.
+ Add setLanguage method and CJKAnalyzer in SimpleSearch class.
+ Add experiments on NTCIR-8 ZH dataset.
  • Loading branch information
Impavidity authored and lintool committed Oct 8, 2019
1 parent 70350fa commit b771bb9
Show file tree
Hide file tree
Showing 20 changed files with 110,742 additions and 10 deletions.
52 changes: 52 additions & 0 deletions docs/experiments-ntcir8-zh.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Cross-lingual Information Retrieval Experiments

This page contains instructions for running BM25 baselines on the NTCIR 8 *IR4QA* task.

## Data Prep

First, we need to convert the corpus into jsonline file format.

```
python src/main/python/clir/convert_collection_to_jsonl.py \
--language zh \
--corpus_directory /directory/to/ntcir-collection/ \
--output_path /path/to/dump
```
## Document Ranking with BM25

Run the command

```
nohup sh target/appassembler/bin/IndexCollection -collection JsonCollection \
-generator LuceneDocumentGenerator -threads 1 \
-input /directory/to/dump \
-index /directory/to/index/lucene-index.clir_zh.pos+docvectors+rawdocs -storePositions -storeDocvectors \
-storeRawDocs -language zh >& log.clir_zh.pos+docvectors+rawdocs &
```

to index the documents.

## Retrieval

To do the document retrieval, run

```
nohup target/appassembler/bin/SearchCollection -topicreader TsvStringKey \
-index lucene-index.clir_zh.pos+docvectors+rawdocs/ \
-topics src/main/resources/topics-and-qrels/topics.ntcir8zh.eval.txt \
-output run.clir-zh.bm25-default.zh.topics.txt -bm25 -language zh &
```

## Evaluation

To evaluate, run

```
eval/trec_eval.9.0.4/trec_eval -m map \
src/main/resources/topics-and-qrels/qrels.ntcir8.eval.txt \
run.clir-zh.bm25-default.zh.topics.txt
```

| Collection | MAP |
|:----------:|:-----:|
| NTCIR-8 ZH | 0.3568|
3 changes: 3 additions & 0 deletions src/main/java/io/anserini/index/IndexArgs.java
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ public class IndexArgs {
@Option(name = "-bm25.accurate", usage = "Switch to use the accurate BM25 similarity)")
public boolean bm25Accurate = false;

@Option(name = "-language", usage = "the language for analyzer")
public String language= "en";

@Option(name = "-tweet.keepRetweets", usage = "boolean switch to keep retweets while indexing")
public boolean tweetKeepRetweets = false;

Expand Down
19 changes: 18 additions & 1 deletion src/main/java/io/anserini/index/IndexCollection.java
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.*;
import org.apache.lucene.search.similarities.BM25Similarity;
Expand Down Expand Up @@ -667,10 +670,24 @@ public void run() throws IOException {
if (indexPath != null && !args.dryRun) {

final Directory dir = FSDirectory.open(indexPath);
final CJKAnalyzer chineseAnalyzer = new CJKAnalyzer();
final ArabicAnalyzer arabicAnalyzer = new ArabicAnalyzer();
final FrenchAnalyzer frenchAnalyzer = new FrenchAnalyzer();
final EnglishStemmingAnalyzer analyzer = args.keepStopwords ?
new EnglishStemmingAnalyzer(args.stemmer, CharArraySet.EMPTY_SET) : new EnglishStemmingAnalyzer(args.stemmer);
final TweetAnalyzer tweetAnalyzer = new TweetAnalyzer(args.tweetStemming);
final IndexWriterConfig config = args.collectionClass.equals("TweetCollection") ? new IndexWriterConfig(tweetAnalyzer) : new IndexWriterConfig(analyzer);
final IndexWriterConfig config;
if (args.collectionClass.equals("TweetCollection")) {
config = new IndexWriterConfig(tweetAnalyzer);
} else if (args.language.equals("zh")) {
config = new IndexWriterConfig(chineseAnalyzer);
} else if (args.language.equals("ar")) {
config = new IndexWriterConfig(arabicAnalyzer);
} else if (args.language.equals("fr")) {
config = new IndexWriterConfig(frenchAnalyzer);
} else {
config = new IndexWriterConfig(analyzer);
}
if (args.bm25Accurate) {
config.setSimilarity(new AccurateBM25Similarity()); // necessary during indexing as the norm used in BM25 is already determined at index time.
} else {
Expand Down
3 changes: 3 additions & 0 deletions src/main/java/io/anserini/search/SearchArgs.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ public class SearchArgs {
// optional arguments
@Option(name = "-threads", metaVar = "[Number]", usage = "Number of Threads")
public int threads = 1;

@Option(name = "-language", usage = "Analyzer Language")
public String language = "en";

@Option(name = "-inmem", usage = "Boolean switch to read index in memory")
public Boolean inmem = false;
Expand Down
11 changes: 11 additions & 0 deletions src/main/java/io/anserini/search/SearchCollection.java
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.index.DirectoryReader;
Expand Down Expand Up @@ -217,7 +220,14 @@ public SearchCollection(SearchArgs args) throws IOException {
if (args.searchtweets) {
LOG.info("Search Tweets");
analyzer = new TweetAnalyzer();
} else if (args.language.equals("zh")) {
analyzer = new CJKAnalyzer();
} else if (args.language.equals("ar")) {
analyzer = new ArabicAnalyzer();
} else if (args.language.equals("fr")) {
analyzer = new FrenchAnalyzer();
} else {
// Default to English
analyzer = args.keepstop ?
new EnglishStemmingAnalyzer(args.stemmer, CharArraySet.EMPTY_SET) : new EnglishStemmingAnalyzer(args.stemmer);
}
Expand Down Expand Up @@ -362,6 +372,7 @@ public<K> void runTopics() throws IOException {
.getConstructor(Path.class).newInstance(topicsFilePath);
topics.putAll(tr.read());
} catch (Exception e) {
e.printStackTrace();
throw new IllegalArgumentException("Unable to load topic reader: " + args.topicReader);
}
}
Expand Down
13 changes: 13 additions & 0 deletions src/main/java/io/anserini/search/SimpleSearcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.LongPoint;
Expand Down Expand Up @@ -128,6 +131,16 @@ public void setSearchTweets(boolean flag) {
this.analyzer = flag? new TweetAnalyzer(true) : new EnglishAnalyzer();
}

/**
 * Sets the analyzer to match the language of the collection being searched.
 * Supported codes: "zh" (CJKAnalyzer), "ar" (ArabicAnalyzer), "fr" (FrenchAnalyzer).
 * Any other value leaves the current analyzer unchanged.
 *
 * @param language two-letter language code of the target collection
 */
public void setLanguage(String language) {
  switch (language) {
    case "zh":
      this.analyzer = new CJKAnalyzer();
      break;
    case "ar":
      this.analyzer = new ArabicAnalyzer();
      break;
    case "fr":
      this.analyzer = new FrenchAnalyzer();
      break;
    default:
      // Unrecognized code: keep whatever analyzer is currently set.
      break;
  }
}

/**
 * Enables the RM3 reranker with default parameters.
 */
public void setRM3Reranker() {
  // Delegate to the fully-parameterized overload. The argument order is
  // presumably (fbTerms, fbDocs, originalQueryWeight, outputQuery) —
  // NOTE(review): confirm against the 4-arg overload's declaration.
  final int defaultFbTerms = 10;
  final int defaultFbDocs = 10;
  final float defaultOriginalQueryWeight = 0.5f;
  setRM3Reranker(defaultFbTerms, defaultFbDocs, defaultOriginalQueryWeight, false);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@
* ...
* </pre>
*/
public class TsvTopicReader extends TopicReader<Integer> {
public TsvTopicReader(Path topicFile) {
public class TsvIntTopicReader extends TopicReader<Integer> {
public TsvIntTopicReader(Path topicFile) {
super(topicFile);
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
/**
* Anserini: A Lucene toolkit for replicable information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.anserini.search.topicreader;

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;

/**
* Topic reader for queries in tsv format, such as the MS MARCO queries.
*
* <pre>
* 174249 does xpress bet charge to deposit money in your account
* 320792 how much is a cost to run disneyland
* 1090270 botulinum definition
* 1101279 do physicians pay for insurance from their salaries?
* 201376 here there be dragons comic
* 54544 blood diseases that are sexually transmitted
* ...
* </pre>
*/
public class TsvStringTopicReader extends TopicReader<String> {
  public TsvStringTopicReader(Path topicFile) {
    super(topicFile);
  }

  /**
   * Reads tab-separated topics of the form {@code id<TAB>query text}.
   * Blank lines (e.g., a trailing newline at the end of the file) are skipped.
   *
   * @param reader reader over the topics file
   * @return map from topic id to a field map holding the query under "title"
   * @throws IOException if the file cannot be read or a non-blank line does
   *     not contain at least two tab-separated fields
   */
  @Override
  public SortedMap<String, Map<String, String>> read(BufferedReader reader) throws IOException {
    SortedMap<String, Map<String, String>> map = new TreeMap<>();

    String line;
    while ((line = reader.readLine()) != null) {
      line = line.trim();
      // Skip blank lines: the original code crashed here with
      // ArrayIndexOutOfBoundsException because split("\t") on an empty
      // string yields a single-element array.
      if (line.isEmpty()) {
        continue;
      }
      String[] arr = line.split("\\t");
      if (arr.length < 2) {
        throw new IOException("Malformed topics line (expected id<TAB>query): " + line);
      }

      Map<String, String> fields = new HashMap<>();
      fields.put("title", arr[1].trim());
      map.put(arr[0], fields);
    }

    return map;
  }
}
95 changes: 95 additions & 0 deletions src/main/python/clir/convert_collection_to_jsonl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# -*- coding: utf-8 -*-
"""
Anserini: A Lucene toolkit for replicable information retrieval research
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

"""
This script is used for converting the cross-lingual IR corpus
into json format, which can be easily indexed by Anserini.
The jsonline format of Anserini is as follows:
{"id": "doc1", "contents": "string1"}
Currently the data we have:
- ZH: gigaword-xin.2002-06.zh-cleaned.xml
"""

import argparse
import json
import os

# Name of the (single) cleaned Chinese Gigaword corpus file inside the
# corpus directory.
ZH_CORPUS_NAME = "gigaword-xin.2002-06.zh-cleaned.xml"


def zh2json(file_path, output_path):
    """Convert the NTCIR Chinese Gigaword corpus into Anserini's jsonl format.

    Processing rules:
    1. Successive non-blank lines are concatenated without a space.
    2. A blank line between two lines is replaced with the Chinese period 。.
    These rules do not matter for passage-level indexing, but they will
    affect performance if we later do sentence-level indexing.

    :param file_path: directory containing the corpus file ZH_CORPUS_NAME
    :param output_path: path of the jsonl file to write
    :raises ValueError: if a <DOC> line is not immediately followed by a
        <DOCNO> line
    """
    counter = 0
    # Pin utf-8 explicitly: the corpus is Chinese text, so relying on the
    # platform-default encoding is a latent bug. Context managers fix the
    # original's leaked output file handle.
    with open(output_path, 'w', encoding='utf-8') as fout, \
            open(os.path.join(file_path, ZH_CORPUS_NAME), encoding='utf-8') as fin:
        while True:
            line = fin.readline()
            if line.startswith("<DOC>"):
                # We assume the line following the "<DOC>" label line is
                # the "<DOCNO>" line.
                example = {}
                line = fin.readline()
                if line.startswith("<DOCNO>"):
                    line = line.replace("<DOCNO>", "").replace("</DOCNO>", "").strip()
                    example["id"] = line
                else:
                    # Fail loudly instead of exit() so callers can handle it.
                    raise ValueError(
                        "The line is {}, but we assume it is <DOCNO> line".format(line))
                # Read contents until the closing </DOC> tag. The extra
                # `line` check prevents an infinite loop on a truncated
                # file that ends inside an unterminated <DOC> block
                # (readline() returns "" forever at EOF).
                contents = []
                line = fin.readline()
                while line and not line.startswith("</DOC>"):
                    line = line.strip()
                    if len(line) == 0:
                        contents.append("。")
                    else:
                        contents.append(line)
                    line = fin.readline()
                example["contents"] = "".join(contents)
                fout.write(json.dumps(example) + "\n")
                counter += 1
                if counter % 10000 == 0:
                    print("Dump {} examples".format(counter))
            elif not line:
                break
    print("Done")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--language", type=str, choices=["zh"], required=True,
                        help="language code of the corpus (only zh is supported)")
    parser.add_argument("--corpus_directory", type=str, required=True,
                        help="directory containing the raw corpus file")
    parser.add_argument("--output_path", type=str, required=True,
                        help="path of the jsonl file to write")
    args = parser.parse_args()

    # os.makedirs("") raises FileNotFoundError when the output path has no
    # directory component (e.g. a bare filename), so only create the parent
    # directory when one is actually present. `dir` also shadowed a builtin.
    out_dir = os.path.dirname(args.output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    if args.language == "zh":
        zh2json(args.corpus_directory, args.output_path)
4 changes: 2 additions & 2 deletions src/main/python/run_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,8 @@ def construct_ranking_command(output_root, yaml_data, build_index=True):
'-index', get_index_path(yaml_data),
' '.join(model['params']),
'-topics', os.path.join(yaml_data['root'], yaml_data['topic_root'], topic['path']),
'-output', os.path.join(output_root, 'run.{0}.{1}.{2}'.format(yaml_data['name'], model['name'], topic['path']))
]
'-output', os.path.join(output_root, 'run.{0}.{1}.{2}'.format(yaml_data['name'], model['name'], topic['path'])),
] + (yaml_data['search_options'] if 'search_options' in yaml_data else [])
for (model, topic) in list(itertools.product(yaml_data['models'], yaml_data['topics']))
]
return ranking_commands
Expand Down
45 changes: 45 additions & 0 deletions src/main/resources/docgen/templates/ntcir8-zh.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Anserini: Regressions for [NTCIR-8 Simplified Chinese](http://research.nii.ac.jp/ntcir/ntcir-ws8/ws-en.html)

This page documents regression experiments for the NTCIR Information Retrieval for Question Answering Task, which is integrated into
Anserini's regression testing framework.
For more complete instructions on how to run end-to-end experiments, refer to [this page](experiments-ntcir8-zh.md).

## Indexing

Typical indexing command:

```
${index_cmds}
```

The directory `/path/to/ntcir-8/` should be a directory containing the official document collection (a single file), in JSON format.
[This page](experiments-ntcir8-zh.md) explains how to perform this conversion.

For additional details, see explanation of [common indexing options](common-indexing-options.md).

## Retrieval

Topics and qrels are stored in `src/main/resources/topics-and-qrels/`.
The regression experiments here evaluate on the 73 questions.

After indexing has completed, you should be able to perform retrieval as follows:

```
${ranking_cmds}
```

Evaluation can be performed using `trec_eval`:

```
${eval_cmds}
```

## Effectiveness

With the above commands, you should be able to replicate the following results:

${effectiveness}

The setting "default" refers to the default BM25 settings of `k1=0.9`, `b=0.4`.
See [this page](experiments-ntcir8-zh.md) for more details.
Note that here we are using `trec_eval` to evaluate the top 1000 hits for each query.
Loading

0 comments on commit b771bb9

Please sign in to comment.