Skip to content

Commit

Permalink
Support Chinese indexing and search (castorini#804)
Browse files Browse the repository at this point in the history
+ Add CJKAnalyzer in the Indexing class.
+ Add language argument in indexing argument.
+ Add setLanguage method and CJKAnalyzer in SimpleSearch class.
+ Add experiments on NTCIR-8 ZH dataset.
  • Loading branch information
Impavidity authored and lintool committed Oct 8, 2019
1 parent 70350fa commit b771bb9
Show file tree
Hide file tree
Showing 20 changed files with 110,742 additions and 10 deletions.
52 changes: 52 additions & 0 deletions docs/experiments-ntcir8-zh.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Cross-lingual Information Retrieval Experiments

This page contains instructions for running BM25 baselines on the NTCIR 8 *IR4QA* task.

## Data Prep

First, we need to convert the corpus into jsonline file format.

```
python src/main/python/clir/convert_collection_to_jsonl.py \
--language zh \
--corpus_directory /directory/to/ntcir-collection/ \
--output_path /path/to/dump
```
## Document Ranking with BM25

Run the command

```
nohup sh target/appassembler/bin/IndexCollection -collection JsonCollection \
-generator LuceneDocumentGenerator -threads 1 \
-input /directory/to/dump \
-index /directory/to/index/lucene-index.clir_zh.pos+docvectors+rawdocs -storePositions -storeDocvectors \
-storeRawDocs -language zh >& log.clir_zh.pos+docvectors+rawdocs &
```

to index the documents.

## Retrieval

To do the document retrieval, run

```
nohup target/appassembler/bin/SearchCollection -topicreader TsvStringKey \
-index lucene-index.clir_zh.pos+docvectors+rawdocs/ \
-topics src/main/resources/topics-and-qrels/topics.ntcir8zh.eval.txt \
-output run.clir-zh.bm25-default.zh.topics.txt -bm25 -language zh &
```

## Evaluation

To evaluate, run

```
eval/trec_eval.9.0.4/trec_eval -m map \
src/main/resources/topics-and-qrels/qrels.ntcir8.eval.txt \
run.clir-zh.bm25-default.zh.topics.txt
```

| Collection | MAP |
|:----------:|:-----:|
| NTCIR-8 ZH | 0.3568|
3 changes: 3 additions & 0 deletions src/main/java/io/anserini/index/IndexArgs.java
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ public class IndexArgs {
@Option(name = "-bm25.accurate", usage = "Switch to use the accurate BM25 similarity)")
public boolean bm25Accurate = false;

@Option(name = "-language", usage = "the language for analyzer")
public String language= "en";

@Option(name = "-tweet.keepRetweets", usage = "boolean switch to keep retweets while indexing")
public boolean tweetKeepRetweets = false;

Expand Down
19 changes: 18 additions & 1 deletion src/main/java/io/anserini/index/IndexCollection.java
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.*;
import org.apache.lucene.search.similarities.BM25Similarity;
Expand Down Expand Up @@ -667,10 +670,24 @@ public void run() throws IOException {
if (indexPath != null && !args.dryRun) {

final Directory dir = FSDirectory.open(indexPath);
final CJKAnalyzer chineseAnalyzer = new CJKAnalyzer();
final ArabicAnalyzer arabicAnalyzer = new ArabicAnalyzer();
final FrenchAnalyzer frenchAnalyzer = new FrenchAnalyzer();
final EnglishStemmingAnalyzer analyzer = args.keepStopwords ?
new EnglishStemmingAnalyzer(args.stemmer, CharArraySet.EMPTY_SET) : new EnglishStemmingAnalyzer(args.stemmer);
final TweetAnalyzer tweetAnalyzer = new TweetAnalyzer(args.tweetStemming);
final IndexWriterConfig config = args.collectionClass.equals("TweetCollection") ? new IndexWriterConfig(tweetAnalyzer) : new IndexWriterConfig(analyzer);
final IndexWriterConfig config;
if (args.collectionClass.equals("TweetCollection")) {
config = new IndexWriterConfig(tweetAnalyzer);
} else if (args.language.equals("zh")) {
config = new IndexWriterConfig(chineseAnalyzer);
} else if (args.language.equals("ar")) {
config = new IndexWriterConfig(arabicAnalyzer);
} else if (args.language.equals("fr")) {
config = new IndexWriterConfig(frenchAnalyzer);
} else {
config = new IndexWriterConfig(analyzer);
}
if (args.bm25Accurate) {
config.setSimilarity(new AccurateBM25Similarity()); // necessary during indexing as the norm used in BM25 is already determined at index time.
} else {
Expand Down
3 changes: 3 additions & 0 deletions src/main/java/io/anserini/search/SearchArgs.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ public class SearchArgs {
// optional arguments
@Option(name = "-threads", metaVar = "[Number]", usage = "Number of Threads")
public int threads = 1;

@Option(name = "-language", usage = "Analyzer Language")
public String language = "en";

@Option(name = "-inmem", usage = "Boolean switch to read index in memory")
public Boolean inmem = false;
Expand Down
11 changes: 11 additions & 0 deletions src/main/java/io/anserini/search/SearchCollection.java
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.index.DirectoryReader;
Expand Down Expand Up @@ -217,7 +220,14 @@ public SearchCollection(SearchArgs args) throws IOException {
if (args.searchtweets) {
LOG.info("Search Tweets");
analyzer = new TweetAnalyzer();
} else if (args.language.equals("zh")) {
analyzer = new CJKAnalyzer();
} else if (args.language.equals("ar")) {
analyzer = new ArabicAnalyzer();
} else if (args.language.equals("fr")) {
analyzer = new FrenchAnalyzer();
} else {
// Default to English
analyzer = args.keepstop ?
new EnglishStemmingAnalyzer(args.stemmer, CharArraySet.EMPTY_SET) : new EnglishStemmingAnalyzer(args.stemmer);
}
Expand Down Expand Up @@ -362,6 +372,7 @@ public<K> void runTopics() throws IOException {
.getConstructor(Path.class).newInstance(topicsFilePath);
topics.putAll(tr.read());
} catch (Exception e) {
e.printStackTrace();
throw new IllegalArgumentException("Unable to load topic reader: " + args.topicReader);
}
}
Expand Down
13 changes: 13 additions & 0 deletions src/main/java/io/anserini/search/SimpleSearcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.LongPoint;
Expand Down Expand Up @@ -128,6 +131,16 @@ public void setSearchTweets(boolean flag) {
this.analyzer = flag? new TweetAnalyzer(true) : new EnglishAnalyzer();
}

/**
 * Sets the analyzer to match the language of the collection being searched.
 * Supported codes: "zh" (CJKAnalyzer), "ar" (ArabicAnalyzer), "fr" (FrenchAnalyzer).
 * Any other value leaves the current analyzer unchanged.
 *
 * @param language two-letter language code of the target collection
 */
public void setLanguage(String language) {
  switch (language) {
    case "zh":
      this.analyzer = new CJKAnalyzer();
      break;
    case "ar":
      this.analyzer = new ArabicAnalyzer();
      break;
    case "fr":
      this.analyzer = new FrenchAnalyzer();
      break;
    default:
      // Unrecognized code: keep whatever analyzer is currently set.
      break;
  }
}

/**
 * Enables the RM3 reranker with default parameters.
 */
public void setRM3Reranker() {
  // Delegate to the fully-parameterized overload. The argument order is
  // presumably (fbTerms, fbDocs, originalQueryWeight, outputQuery) —
  // NOTE(review): confirm against the 4-arg overload's declaration.
  final int defaultFbTerms = 10;
  final int defaultFbDocs = 10;
  final float defaultOriginalQueryWeight = 0.5f;
  setRM3Reranker(defaultFbTerms, defaultFbDocs, defaultOriginalQueryWeight, false);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@
* ...
* </pre>
*/
public class TsvTopicReader extends TopicReader<Integer> {
public TsvTopicReader(Path topicFile) {
public class TsvIntTopicReader extends TopicReader<Integer> {
public TsvIntTopicReader(Path topicFile) {
super(topicFile);
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
/**
* Anserini: A Lucene toolkit for replicable information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.anserini.search.topicreader;

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;

/**
* Topic reader for queries in tsv format, such as the MS MARCO queries.
*
* <pre>
* 174249 does xpress bet charge to deposit money in your account
* 320792 how much is a cost to run disneyland
* 1090270 botulinum definition
* 1101279 do physicians pay for insurance from their salaries?
* 201376 here there be dragons comic
* 54544 blood diseases that are sexually transmitted
* ...
* </pre>
*/
public class TsvStringTopicReader extends TopicReader<String> {
  public TsvStringTopicReader(Path topicFile) {
    super(topicFile);
  }

  /**
   * Reads tab-separated topics of the form {@code id<TAB>query text}.
   * Blank lines (e.g., a trailing newline at the end of the file) are skipped.
   *
   * @param reader reader over the topics file
   * @return map from topic id to a field map holding the query under "title"
   * @throws IOException if the file cannot be read or a non-blank line does
   *     not contain at least two tab-separated fields
   */
  @Override
  public SortedMap<String, Map<String, String>> read(BufferedReader reader) throws IOException {
    SortedMap<String, Map<String, String>> map = new TreeMap<>();

    String line;
    while ((line = reader.readLine()) != null) {
      line = line.trim();
      // Skip blank lines: the original code crashed here with
      // ArrayIndexOutOfBoundsException because split("\t") on an empty
      // string yields a single-element array.
      if (line.isEmpty()) {
        continue;
      }
      String[] arr = line.split("\\t");
      if (arr.length < 2) {
        throw new IOException("Malformed topics line (expected id<TAB>query): " + line);
      }

      Map<String, String> fields = new HashMap<>();
      fields.put("title", arr[1].trim());
      map.put(arr[0], fields);
    }

    return map;
  }
}
95 changes: 95 additions & 0 deletions src/main/python/clir/convert_collection_to_jsonl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# -*- coding: utf-8 -*-
"""
Anserini: A Lucene toolkit for replicable information retrieval research
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

"""
This script is used for converting the cross-lingual IR corpus
into json format, which can be easily indexed by Anserini.
The jsonline format of Anserini is as follows:
{"id": "doc1", "contents": "string1"}
Currently the data we have:
- ZH: gigaword-xin.2002-06.zh-cleaned.xml
"""

import argparse
import json
import os

# Name of the (single) cleaned Chinese Gigaword corpus file inside the
# corpus directory.
ZH_CORPUS_NAME = "gigaword-xin.2002-06.zh-cleaned.xml"


def zh2json(file_path, output_path):
    """Convert the NTCIR Chinese Gigaword corpus into Anserini's jsonl format.

    Processing rules:
    1. Successive non-blank lines are concatenated without a space.
    2. A blank line between two lines is replaced with the Chinese period 。.
    These rules do not matter for passage-level indexing, but they will
    affect performance if we later do sentence-level indexing.

    :param file_path: directory containing the corpus file ZH_CORPUS_NAME
    :param output_path: path of the jsonl file to write
    :raises ValueError: if a <DOC> line is not immediately followed by a
        <DOCNO> line
    """
    counter = 0
    # Pin utf-8 explicitly: the corpus is Chinese text, so relying on the
    # platform-default encoding is a latent bug. Context managers fix the
    # original's leaked output file handle.
    with open(output_path, 'w', encoding='utf-8') as fout, \
            open(os.path.join(file_path, ZH_CORPUS_NAME), encoding='utf-8') as fin:
        while True:
            line = fin.readline()
            if line.startswith("<DOC>"):
                # We assume the line following the "<DOC>" label line is
                # the "<DOCNO>" line.
                example = {}
                line = fin.readline()
                if line.startswith("<DOCNO>"):
                    line = line.replace("<DOCNO>", "").replace("</DOCNO>", "").strip()
                    example["id"] = line
                else:
                    # Fail loudly instead of exit() so callers can handle it.
                    raise ValueError(
                        "The line is {}, but we assume it is <DOCNO> line".format(line))
                # Read contents until the closing </DOC> tag. The extra
                # `line` check prevents an infinite loop on a truncated
                # file that ends inside an unterminated <DOC> block
                # (readline() returns "" forever at EOF).
                contents = []
                line = fin.readline()
                while line and not line.startswith("</DOC>"):
                    line = line.strip()
                    if len(line) == 0:
                        contents.append("。")
                    else:
                        contents.append(line)
                    line = fin.readline()
                example["contents"] = "".join(contents)
                fout.write(json.dumps(example) + "\n")
                counter += 1
                if counter % 10000 == 0:
                    print("Dump {} examples".format(counter))
            elif not line:
                break
    print("Done")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--language", type=str, choices=["zh"], required=True,
                        help="language code of the corpus (only zh is supported)")
    parser.add_argument("--corpus_directory", type=str, required=True,
                        help="directory containing the raw corpus file")
    parser.add_argument("--output_path", type=str, required=True,
                        help="path of the jsonl file to write")
    args = parser.parse_args()

    # os.makedirs("") raises FileNotFoundError when the output path has no
    # directory component (e.g. a bare filename), so only create the parent
    # directory when one is actually present. `dir` also shadowed a builtin.
    out_dir = os.path.dirname(args.output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    if args.language == "zh":
        zh2json(args.corpus_directory, args.output_path)
4 changes: 2 additions & 2 deletions src/main/python/run_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,8 @@ def construct_ranking_command(output_root, yaml_data, build_index=True):
'-index', get_index_path(yaml_data),
' '.join(model['params']),
'-topics', os.path.join(yaml_data['root'], yaml_data['topic_root'], topic['path']),
'-output', os.path.join(output_root, 'run.{0}.{1}.{2}'.format(yaml_data['name'], model['name'], topic['path']))
]
'-output', os.path.join(output_root, 'run.{0}.{1}.{2}'.format(yaml_data['name'], model['name'], topic['path'])),
] + (yaml_data['search_options'] if 'search_options' in yaml_data else [])
for (model, topic) in list(itertools.product(yaml_data['models'], yaml_data['topics']))
]
return ranking_commands
Expand Down
45 changes: 45 additions & 0 deletions src/main/resources/docgen/templates/ntcir8-zh.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Anserini: Regressions for [NTCIR-8 Simplified Chinese](http://research.nii.ac.jp/ntcir/ntcir-ws8/ws-en.html)

This page documents regression experiments for the NTCIR Information Retrieval for Question Answering Task, which is integrated into
Anserini's regression testing framework.
For more complete instructions on how to run end-to-end experiments, refer to [this page](experiments-ntcir8-zh.md).

## Indexing

Typical indexing command:

```
${index_cmds}
```

The directory `/path/to/ntcir-8/` should be a directory containing the official document collection (a single file), in JSON format.
[This page](experiments-ntcir8-zh.md) explains how to perform this conversion.

For additional details, see explanation of [common indexing options](common-indexing-options.md).

## Retrieval

Topics and qrels are stored in `src/main/resources/topics-and-qrels/`.
The regression experiments here evaluate on the 73 questions.

After indexing has completed, you should be able to perform retrieval as follows:

```
${ranking_cmds}
```

Evaluation can be performed using `trec_eval`:

```
${eval_cmds}
```

## Effectiveness

With the above commands, you should be able to replicate the following results:

${effectiveness}

The setting "default" refers to the default BM25 settings of `k1=0.9`, `b=0.4`.
See [this page](experiments-ntcir8-zh.md) for more details.
Note that here we are using `trec_eval` to evaluate the top 1000 hits for each query.
Loading

0 comments on commit b771bb9

Please sign in to comment.