Skip to content

Commit

Permalink
Add multilingual analyzers to index and search (castorini#1548)
Browse files Browse the repository at this point in the history
  • Loading branch information
velocityCavalry authored May 20, 2021
1 parent ac1c34b commit aad5490
Show file tree
Hide file tree
Showing 3 changed files with 165 additions and 24 deletions.
65 changes: 57 additions & 8 deletions src/main/java/io/anserini/index/IndexCollection.java
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,23 @@
import org.apache.lucene.analysis.bn.BengaliAnalyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.da.DanishAnalyzer;
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.es.SpanishAnalyzer;
import org.apache.lucene.analysis.fi.FinnishAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.hi.HindiAnalyzer;
import org.apache.lucene.analysis.hu.HungarianAnalyzer;
import org.apache.lucene.analysis.id.IndonesianAnalyzer;
import org.apache.lucene.analysis.it.ItalianAnalyzer;
import org.apache.lucene.analysis.nl.DutchAnalyzer;
import org.apache.lucene.analysis.no.NorwegianAnalyzer;
import org.apache.lucene.analysis.pt.PortugueseAnalyzer;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.sv.SwedishAnalyzer;
import org.apache.lucene.analysis.th.ThaiAnalyzer;
import org.apache.lucene.analysis.tr.TurkishAnalyzer;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.DocValuesType;
Expand Down Expand Up @@ -723,11 +736,23 @@ public Counters run() throws IOException {
final Directory dir = FSDirectory.open(indexPath);
final CJKAnalyzer chineseAnalyzer = new CJKAnalyzer();
final ArabicAnalyzer arabicAnalyzer = new ArabicAnalyzer();
final FrenchAnalyzer frenchAnalyzer = new FrenchAnalyzer();
final HindiAnalyzer hindiAnalyzer = new HindiAnalyzer();
final BengaliAnalyzer bengaliAnalyzer = new BengaliAnalyzer();
final DanishAnalyzer danishAnalyzer = new DanishAnalyzer();
final DutchAnalyzer dutchAnalyzer = new DutchAnalyzer();
final FinnishAnalyzer finnishAnalyzer = new FinnishAnalyzer();
final FrenchAnalyzer frenchAnalyzer = new FrenchAnalyzer();
final GermanAnalyzer germanAnalyzer = new GermanAnalyzer();
final HindiAnalyzer hindiAnalyzer = new HindiAnalyzer();
final HungarianAnalyzer hungarianAnalyzer = new HungarianAnalyzer();
final IndonesianAnalyzer indonesianAnalyzer = new IndonesianAnalyzer();
final ItalianAnalyzer italianAnalyzer = new ItalianAnalyzer();
final NorwegianAnalyzer norwegianAnalyzer = new NorwegianAnalyzer();
final PortugueseAnalyzer portugueseAnalyzer = new PortugueseAnalyzer();
final RussianAnalyzer russianAnalyzer = new RussianAnalyzer();
final SpanishAnalyzer spanishAnalyzer = new SpanishAnalyzer();
final SwedishAnalyzer swedishAnalyzer = new SwedishAnalyzer();
final ThaiAnalyzer thaiAnalyzer = new ThaiAnalyzer();
final TurkishAnalyzer turkishAnalyzer = new TurkishAnalyzer();
final WhitespaceAnalyzer whitespaceAnalyzer = new WhitespaceAnalyzer();

final DefaultEnglishAnalyzer analyzer = DefaultEnglishAnalyzer.fromArguments(
Expand All @@ -737,20 +762,44 @@ public Counters run() throws IOException {
final IndexWriterConfig config;
if (args.collectionClass.equals("TweetCollection")) {
config = new IndexWriterConfig(tweetAnalyzer);
} else if (args.language.equals("zh")) {
config = new IndexWriterConfig(chineseAnalyzer);
} else if (args.language.equals("ar")) {
config = new IndexWriterConfig(arabicAnalyzer);
} else if (args.language.equals("fr")) {
config = new IndexWriterConfig(frenchAnalyzer);
} else if (args.language.equals("hi")) {
config = new IndexWriterConfig(hindiAnalyzer);
} else if (args.language.equals("bn")) {
config = new IndexWriterConfig(bengaliAnalyzer);
} else if (args.language.equals("da")) {
config = new IndexWriterConfig(danishAnalyzer);
} else if (args.language.equals("de")) {
config = new IndexWriterConfig(germanAnalyzer);
} else if (args.language.equals("es")) {
config = new IndexWriterConfig(spanishAnalyzer);
} else if (args.language.equals("fi")) {
config = new IndexWriterConfig(finnishAnalyzer);
} else if (args.language.equals("fr")) {
config = new IndexWriterConfig(frenchAnalyzer);
} else if (args.language.equals("hi")) {
config = new IndexWriterConfig(hindiAnalyzer);
} else if (args.language.equals("hu")) {
config = new IndexWriterConfig(hungarianAnalyzer);
} else if (args.language.equals("id")) {
config = new IndexWriterConfig(indonesianAnalyzer);
} else if (args.language.equals("it")) {
config = new IndexWriterConfig(italianAnalyzer);
} else if (args.language.equals("nl")) {
config = new IndexWriterConfig(dutchAnalyzer);
} else if (args.language.equals("no")) {
config = new IndexWriterConfig(norwegianAnalyzer);
} else if (args.language.equals("pt")) {
config = new IndexWriterConfig(portugueseAnalyzer);
} else if (args.language.equals("ru")) {
config = new IndexWriterConfig(russianAnalyzer);
} else if (args.language.equals("sv")) {
config = new IndexWriterConfig(swedishAnalyzer);
} else if (args.language.equals("th")) {
config = new IndexWriterConfig(thaiAnalyzer);
} else if (args.language.equals("tr")) {
config = new IndexWriterConfig(turkishAnalyzer);
} else if (args.language.equals("zh") || args.language.equals("ja") || args.language.equals("ko")) {
config = new IndexWriterConfig(chineseAnalyzer);
} else if (args.pretokenized) {
config = new IndexWriterConfig(whitespaceAnalyzer);
} else {
Expand Down
73 changes: 64 additions & 9 deletions src/main/java/io/anserini/search/SearchCollection.java
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,23 @@
import org.apache.lucene.analysis.bn.BengaliAnalyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.da.DanishAnalyzer;
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.es.SpanishAnalyzer;
import org.apache.lucene.analysis.fi.FinnishAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.hi.HindiAnalyzer;
import org.apache.lucene.analysis.hu.HungarianAnalyzer;
import org.apache.lucene.analysis.id.IndonesianAnalyzer;
import org.apache.lucene.analysis.it.ItalianAnalyzer;
import org.apache.lucene.analysis.nl.DutchAnalyzer;
import org.apache.lucene.analysis.no.NorwegianAnalyzer;
import org.apache.lucene.analysis.pt.PortugueseAnalyzer;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.sv.SwedishAnalyzer;
import org.apache.lucene.analysis.th.ThaiAnalyzer;
import org.apache.lucene.analysis.tr.TurkishAnalyzer;

import org.apache.lucene.document.LongPoint;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
Expand Down Expand Up @@ -277,27 +290,69 @@ public SearchCollection(SearchArgs args) throws IOException {
if (args.searchtweets) {
LOG.info("Searching tweets? true");
analyzer = new TweetAnalyzer();
} else if (args.language.equals("zh")) {
analyzer = new CJKAnalyzer();
LOG.info("Language: zh");
} else if (args.language.equals("ar")) {
analyzer = new ArabicAnalyzer();
LOG.info("Language: ar");
} else if (args.language.equals("fr")) {
analyzer = new FrenchAnalyzer();
LOG.info("Language: fr");
} else if (args.language.equals("hi")) {
analyzer = new HindiAnalyzer();
LOG.info("Language: hi");
} else if (args.language.equals("bn")) {
analyzer = new BengaliAnalyzer();
LOG.info("Language: bn");
} else if (args.language.equals("da")) {
analyzer = new DanishAnalyzer();
LOG.info("Language: da");
} else if (args.language.equals("de")) {
analyzer = new GermanAnalyzer();
LOG.info("Language: de");
} else if (args.language.equals("es")) {
analyzer = new SpanishAnalyzer();
LOG.info("Language: es");
} else if (args.language.equals("fi")) {
analyzer = new FinnishAnalyzer();
LOG.info("Language: fi");
} else if (args.language.equals("fr")) {
analyzer = new FrenchAnalyzer();
LOG.info("Language: fr");
} else if (args.language.equals("hi")) {
analyzer = new HindiAnalyzer();
LOG.info("Language: hi");
} else if (args.language.equals("hu")) {
analyzer = new HungarianAnalyzer();
LOG.info("Language: hu");
} else if (args.language.equals("id")) {
analyzer = new IndonesianAnalyzer();
LOG.info("Language: id");
} else if (args.language.equals("it")) {
analyzer = new ItalianAnalyzer();
LOG.info("Language: it");
} else if (args.language.equals("ja")) {
analyzer = new CJKAnalyzer();
LOG.info("Language: ja");
} else if (args.language.equals("ko")) {
analyzer = new CJKAnalyzer();
LOG.info("Language: ko");
} else if (args.language.equals("nl")) {
analyzer = new DutchAnalyzer();
LOG.info("Language: nl");
} else if (args.language.equals("no")) {
analyzer = new NorwegianAnalyzer();
LOG.info("Language: no");
} else if (args.language.equals("pt")) {
analyzer = new PortugueseAnalyzer();
LOG.info("Language: pt");
} else if (args.language.equals("ru")) {
analyzer = new RussianAnalyzer();
LOG.info("Language: ru");
} else if (args.language.equals("sv")) {
analyzer = new SwedishAnalyzer();
LOG.info("Language: sv");
} else if (args.language.equals("th")) {
analyzer = new ThaiAnalyzer();
LOG.info("Language: th");
} else if (args.language.equals("tr")) {
analyzer = new TurkishAnalyzer();
LOG.info("Language: tr");
} else if (args.language.equals("zh")) {
analyzer = new CJKAnalyzer();
LOG.info("Language: zh");
} else if (args.pretokenized) {
analyzer = new WhitespaceAnalyzer();
LOG.info("Pretokenized");
Expand Down
51 changes: 44 additions & 7 deletions src/main/java/io/anserini/search/SimpleSearcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,23 @@
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
import org.apache.lucene.analysis.bn.BengaliAnalyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.da.DanishAnalyzer;
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.es.SpanishAnalyzer;
import org.apache.lucene.analysis.fi.FinnishAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.hi.HindiAnalyzer;
import org.apache.lucene.analysis.hu.HungarianAnalyzer;
import org.apache.lucene.analysis.id.IndonesianAnalyzer;
import org.apache.lucene.analysis.it.ItalianAnalyzer;
import org.apache.lucene.analysis.nl.DutchAnalyzer;
import org.apache.lucene.analysis.no.NorwegianAnalyzer;
import org.apache.lucene.analysis.pt.PortugueseAnalyzer;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.sv.SwedishAnalyzer;
import org.apache.lucene.analysis.th.ThaiAnalyzer;
import org.apache.lucene.analysis.tr.TurkishAnalyzer;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
Expand Down Expand Up @@ -224,20 +237,44 @@ public Analyzer getAnalyzer(){
* @param language language
*/
public void setLanguage(String language) {
if (language.equals("zh")) {
this.analyzer = new CJKAnalyzer();
} else if (language.equals("ar")) {
if (language.equals("ar")) {
this.analyzer = new ArabicAnalyzer();
} else if (language.equals("fr")) {
this.analyzer = new FrenchAnalyzer();
} else if (language.equals("hi")) {
this.analyzer = new HindiAnalyzer();
} else if (language.equals("bn")) {
this.analyzer = new BengaliAnalyzer();
} else if (language.equals("de")) {
this.analyzer = new GermanAnalyzer();
} else if (language.equals("da")) {
this.analyzer = new DanishAnalyzer();
} else if (language.equals("es")) {
this.analyzer = new SpanishAnalyzer();
} else if (language.equals("fi")) {
this.analyzer = new FinnishAnalyzer();
} else if (language.equals("fr")) {
this.analyzer = new FrenchAnalyzer();
} else if (language.equals("hi")) {
this.analyzer = new HindiAnalyzer();
} else if (language.equals("hu")) {
this.analyzer = new HungarianAnalyzer();
} else if (language.equals("id")) {
this.analyzer = new IndonesianAnalyzer();
} else if (language.equals("it")) {
this.analyzer = new ItalianAnalyzer();
} else if (language.equals("nl")) {
this.analyzer = new DutchAnalyzer();
} else if (language.equals("no")) {
this.analyzer = new NorwegianAnalyzer();
} else if (language.equals("pt")) {
this.analyzer = new PortugueseAnalyzer();
} else if (language.equals("ru")) {
this.analyzer = new RussianAnalyzer();
} else if (language.equals("sv")) {
this.analyzer = new SwedishAnalyzer();
} else if (language.equals("th")) {
this.analyzer = new ThaiAnalyzer();
} else if (language.equals("tr")) {
this.analyzer = new TurkishAnalyzer();
} else if (language.equals("zh") || language.equals("ja") || language.equals("ko")) {
this.analyzer = new CJKAnalyzer();
}
}

Expand Down

0 comments on commit aad5490

Please sign in to comment.