From 6872c878d6969ebdf1875e4436777b95746b35a5 Mon Sep 17 00:00:00 2001
From: "Matt J. H. Yang"
Date: Wed, 7 Dec 2022 09:03:08 -0500
Subject: [PATCH] Add JsonStringTopicReader (#2032)

* Add JsonStringTopicReader
---
 .../topicreader/JsonStringTopicReader.java    | 56 +++++++++++++++++++
 .../JsonStringTopicReaderTest.java            | 48 ++++++++++++++++
 .../sample_topics/stringID_topics.jsonl       |  2 +
 3 files changed, 106 insertions(+)
 create mode 100644 src/main/java/io/anserini/search/topicreader/JsonStringTopicReader.java
 create mode 100644 src/test/java/io/anserini/search/topicreader/JsonStringTopicReaderTest.java
 create mode 100644 src/test/resources/sample_topics/stringID_topics.jsonl

diff --git a/src/main/java/io/anserini/search/topicreader/JsonStringTopicReader.java b/src/main/java/io/anserini/search/topicreader/JsonStringTopicReader.java
new file mode 100644
index 0000000000..9715036ead
--- /dev/null
+++ b/src/main/java/io/anserini/search/topicreader/JsonStringTopicReader.java
@@ -0,0 +1,56 @@
+/*
+ * Anserini: A Lucene toolkit for reproducible information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package io.anserini.search.topicreader;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+public class JsonStringTopicReader extends TopicReader<String> {
+
+  public JsonStringTopicReader(Path topicFile) {
+    super(topicFile);
+  }
+
+  @Override
+  public SortedMap<String, Map<String, String>> read(BufferedReader reader) throws IOException {
+    SortedMap<String, Map<String, String>> map = new TreeMap<>();
+    String line;
+    ObjectMapper mapper = new ObjectMapper();
+    while ((line = reader.readLine()) != null) {
+      line = line.trim();
+      JsonNode lineNode = mapper.readerFor(JsonNode.class).readTree(line);
+      String topicID = lineNode.get("id").asText();
+
+      Map<String, String> fields = new HashMap<>();
+      lineNode.fields().forEachRemaining( e -> {
+        if ("id".equals(e.getKey())) return; //skip id
+        fields.put(e.getKey(), e.getValue().asText());
+      });
+      map.put(topicID, fields);
+    }
+    return map;
+  }
+}
diff --git a/src/test/java/io/anserini/search/topicreader/JsonStringTopicReaderTest.java b/src/test/java/io/anserini/search/topicreader/JsonStringTopicReaderTest.java
new file mode 100644
index 0000000000..b2ef1ce81a
--- /dev/null
+++ b/src/test/java/io/anserini/search/topicreader/JsonStringTopicReaderTest.java
@@ -0,0 +1,48 @@
+/*
+ * Anserini: A Lucene toolkit for reproducible information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.anserini.search.topicreader;
+
+import org.junit.Test;
+
+import java.io.IOException;
+import java.nio.file.Paths;
+import java.util.Map;
+import java.util.SortedMap;
+
+import static org.junit.Assert.assertEquals;
+
+public class JsonStringTopicReaderTest {
+
+  @Test
+  public void test() throws IOException {
+    TopicReader<String> reader = new JsonStringTopicReader(
+        Paths.get("src/test/resources/sample_topics/stringID_topics.jsonl"));
+
+    SortedMap<String, Map<String, String>> topics = reader.read();
+
+    assertEquals(2, topics.keySet().size());
+    assertEquals("topic1", topics.firstKey());
+    assertEquals("topic2", topics.lastKey());
+    assertEquals("this is the contents 1.", topics.get(topics.firstKey()).get("contents"));
+    assertEquals("topic1 field1 content", topics.get(topics.firstKey()).get("field1"));
+    assertEquals("topic1 field2 content", topics.get(topics.firstKey()).get("field2"));
+    assertEquals("this is the contents 2.", topics.get(topics.lastKey()).get("contents"));
+    assertEquals("topic2 field1 content", topics.get(topics.lastKey()).get("field1"));
+    assertEquals("topic2 field2 content", topics.get(topics.lastKey()).get("field2"));
+
+  }
+}
diff --git a/src/test/resources/sample_topics/stringID_topics.jsonl b/src/test/resources/sample_topics/stringID_topics.jsonl
new file mode 100644
index 0000000000..6e820b4962
--- /dev/null
+++ b/src/test/resources/sample_topics/stringID_topics.jsonl
@@ -0,0 +1,2 @@
+{"id": "topic1", "contents": "this is the contents 1.", "field1": "topic1 field1 content", "field2": "topic1 field2 content"}
+{"id": "topic2", "contents": "this is the contents 2.", "field1": "topic2 field1 content", "field2": "topic2 field2 content"}
\ No newline at end of file