Add Iso19115 indexer+generator (#1266)

castorini · Jun 9, 2020 · 7954eab · 7954eab
1 parent 53c6b8b
commit 7954eab
Show file tree

Hide file tree

Showing 5 changed files with 1,532 additions and 0 deletions.
diff --git a/src/main/java/io/anserini/collection/Iso19115Collection.java b/src/main/java/io/anserini/collection/Iso19115Collection.java
@@ -0,0 +1,132 @@
+/*
+ * Anserini: A Lucene toolkit for replicable information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.anserini.collection;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.MappingIterator;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+
+/**
+ * A document collection of ISO 19115 metadata records that have been serialized as JSON.
+ * Files ending in {@code .json} or {@code .jsonl} are treated as segments.
+ */
+public class Iso19115Collection extends DocumentCollection<Iso19115Collection.Document>{
+  /**
+   * Creates a collection located at the given path.
+   *
+   * @param path location of the collection
+   */
+  public Iso19115Collection(Path path){
+    this.path = path;
+    this.allowedFileSuffix = new HashSet<>(Arrays.asList(".json", ".jsonl"));
+  }
+
+  @Override
+  public FileSegment<Iso19115Collection.Document> createFileSegment(Path p) throws IOException{
+    return new Segment(p);
+  }
+
+  /**
+   * A single JSON file of the collection. Two layouts are handled: a sequence of top-level JSON
+   * objects (one record per object), or a top-level JSON array whose elements are the records.
+   */
+  public static class Segment extends FileSegment<Iso19115Collection.Document> {
+    // the top-level JSON value currently being consumed
+    private JsonNode node = null;
+    // iterates over the elements of `node` when it is an array; null otherwise
+    private Iterator<JsonNode> iter = null;
+    // iterates over the top-level JSON values of the file
+    private MappingIterator<JsonNode> iterator;
+
+    /**
+     * Opens the file and reads its first top-level JSON value.
+     *
+     * @param path file to read
+     * @throws IOException if the file cannot be opened or parsed
+     */
+    public Segment(Path path) throws IOException {
+      super(path);
+      // Decode explicitly as UTF-8 instead of relying on the platform default charset, so the
+      // same file is read identically on every machine.
+      bufferedReader = new BufferedReader(new java.io.InputStreamReader(
+          new java.io.FileInputStream(path.toString()), java.nio.charset.StandardCharsets.UTF_8));
+      ObjectMapper mapper = new ObjectMapper();
+      iterator = mapper.readerFor(JsonNode.class).readValues(bufferedReader);
+      if(iterator.hasNext()){
+        node = iterator.next();
+        if(node.isArray()) {
+          iter = node.elements();
+        }
+      }
+    }
+
+    /**
+     * Buffers the next record of this segment into {@code bufferedRecord}.
+     *
+     * @throws NoSuchElementException if the file held no JSON value, if the array of records is
+     *         exhausted, or if the current top-level value is neither an object nor an array
+     */
+    @Override
+    public void readNext() throws NoSuchElementException {
+      if (node == null){
+        throw new NoSuchElementException("JsonNode is empty");
+      } else if (node.isObject()) {
+        bufferedRecord = new Iso19115Collection.Document(node);
+        if(iterator.hasNext()) {
+          node = iterator.next();
+        } else {
+          atEOF = true;
+        }
+      } else if (node.isArray()) {
+        if (iter != null && iter.hasNext()) {
+          // Build the record from the array ELEMENT; passing the enclosing array itself would
+          // make every field lookup in the Document constructor fail.
+          bufferedRecord = new Iso19115Collection.Document(iter.next());
+          if (!iter.hasNext()) {
+            // mirror the object branch: flag the end as soon as the last record is buffered
+            atEOF = true;
+          }
+        } else {
+          throw new NoSuchElementException("Reached end of JsonNode iterator");
+        }
+      } else {
+        throw new NoSuchElementException("Invalid JsonNode type");
+      }
+    }
+  }
+
+  /**
+   * One ISO 19115 record, exposing its id, title and abstract along with the raw JSON text.
+   */
+  public static class Document implements SourceDocument{
+    protected String id;
+    protected String title;
+    protected String abstractContent;
+    protected String raw;
+
+    /**
+     * Extracts the fields of interest from the JSON form of an ISO 19115 record.
+     *
+     * @param json record whose root holds a {@code gmd:MD_Metadata} object; a missing element
+     *             along any of the navigated paths results in a {@link NullPointerException}
+     */
+    public Document(JsonNode json) {
+      this.raw = json.toString();
+      JsonNode metadata = json.get("gmd:MD_Metadata");
+      String identifier = metadata.get("gmd:fileIdentifier").get("gco:CharacterString").asText();
+      // The id is the file identifier with its last 8 characters dropped.
+      // NOTE(review): identifiers shorter than 8 characters make substring throw - confirm that
+      // every record in the dataset carries the expected suffix.
+      this.id = identifier.substring(0,identifier.length() - 8);
+      JsonNode dataIdentification = metadata.get("gmd:identificationInfo").get("gmd:MD_DataIdentification");
+      this.title = dataIdentification.get("gmd:citation").get("gmd:CI_Citation").get("gmd:title")
+                   .get("gco:CharacterString").asText();
+      this.abstractContent = dataIdentification.get("gmd:abstract").get("gco:CharacterString").asText();
+    }
+
+    @Override
+    public String id() {
+      return id;
+    }
+
+    /** Returns the title and the abstract joined by a newline. */
+    @Override
+    public String contents() {
+      return title + "\n" + abstractContent;
+    }
+
+    /** Returns the JSON text this document was built from. */
+    @Override
+    public String raw() {
+      return raw;
+    }
+
+    public String getTitle() {
+      return title;
+    }
+
+    public String getAbstract() {
+      return abstractContent;
+    }
+
+    @Override
+    public boolean indexable() {
+      return true;
+    }
+
+  }
+}
diff --git a/src/main/java/io/anserini/index/generator/Iso19115Generator.java b/src/main/java/io/anserini/index/generator/Iso19115Generator.java
@@ -0,0 +1,53 @@
+/*
+ * Anserini: A Lucene toolkit for replicable information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.anserini.index.generator;
+
+import io.anserini.collection.Iso19115Collection;
+import io.anserini.index.IndexArgs;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.StoredField;
+
+
+/**
+ * Turns an {@link Iso19115Collection.Document} into a Lucene document: the document produced by
+ * the parent generator, extended with the record's title and abstract as stored fields.
+ */
+public class Iso19115Generator extends DefaultLuceneDocumentGenerator<Iso19115Collection.Document>{
+  protected IndexArgs args;
+
+  /** Names of the Lucene fields used for ISO 19115 records. */
+  public enum Iso19115Field {
+    ID("id"),
+    TITLE("title"),
+    ABSTRACT("abstract");
+
+    /** The field name as written to the index. */
+    public final String name;
+
+    Iso19115Field(String fieldName) {
+      this.name = fieldName;
+    }
+  }
+
+  /**
+   * @param args indexing arguments, handed to the parent generator and kept on this instance
+   */
+  public Iso19115Generator(IndexArgs args) {
+    super(args);
+    this.args = args;
+  }
+
+  /**
+   * Builds the Lucene document for one record.
+   *
+   * @param doc the source record
+   * @return the parent generator's document with title and abstract stored fields appended
+   * @throws GeneratorException declared for the parent generator's document creation
+   */
+  public Document createDocument(Iso19115Collection.Document doc) throws GeneratorException {
+    final Document luceneDoc = super.createDocument(doc);
+
+    final String title = doc.getTitle();
+    luceneDoc.add(new StoredField(Iso19115Field.TITLE.name, title));
+
+    final String abstractText = doc.getAbstract();
+    luceneDoc.add(new StoredField(Iso19115Field.ABSTRACT.name, abstractText));
+
+    return luceneDoc;
+  }
+}
diff --git a/src/main/python/iso19115/convert_iso_to_json.py b/src/main/python/iso19115/convert_iso_to_json.py
@@ -0,0 +1,54 @@
+'''
+ * Anserini: A Lucene toolkit for replicable information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+'''
+
+import json
+import xmltodict
+import os
+
+def convert_collection(args):
+    print("Converting collection...")
+    input_path = os.path.join(args.collection_path)
+    output_path = os.path.join(args.output_folder)
+
+    with open(output_path + "output.json", 'w') as out_file:
+        out_file.write('[\n')
+        for xml_file in os.listdir(input_path + 'datasets_xml'):
+            print(xml_file)
+            with open(os.path.join('datasets_xml', xml_file), 'r', encoding='utf8') as f:
+                xml_string = f.read()
+
+            json_string = json.dumps(xmltodict.parse(xml_string), indent=4)
+
+            out_file.write(json_string)
+            out_file.write(',')
+
+        out_file.write('\n]')
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Converts iso19115 xml files to json files.')
+    parser.add_argument('--collection', required=True, help='iso19115 collection file')
+    parser.add_argument('--output', required=True, help='output folder')
+    # not used yet since dataset is very small
+    parser.add_argument('--max-doc-per-file', default=1000000, type=int, help='maximum number of documents in each jsonl file.')
+
+    args = parser.parse_args()
+
+    if not os.path.exists(args.output_folder):
+        os.makedirs(args.output_folder)
+
+    convert_collection(args)
+    print('Done!')
+
diff --git a/src/test/java/io/anserini/collection/Iso19115CollectionTest.java b/src/test/java/io/anserini/collection/Iso19115CollectionTest.java
@@ -0,0 +1,50 @@
+/*
+ * Anserini: A Lucene toolkit for replicable information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.anserini.collection;
+
+import org.junit.Before;
+
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Map;
+
+/**
+ * Reads the sample ISO 19115 segment through {@link Iso19115Collection} and checks the id, title
+ * and abstract of each of its two documents.
+ */
+public class Iso19115CollectionTest extends DocumentCollectionTest<Iso19115Collection.Document> {
+  @Before
+  public void setUp() throws Exception {
+    super.setUp();
+
+    // the collection under test
+    collectionPath = Paths.get("src/test/resources/sample_docs/iso19115");
+    collection = new Iso19115Collection(collectionPath);
+
+    // a single segment that holds both sample documents
+    final Path sampleSegment = Paths.get("src/test/resources/sample_docs/iso19115/output.json");
+    segmentPaths.add(sampleSegment);
+    segmentDocCounts.put(sampleSegment, 2);
+    totalSegments = 1;
+    totalDocs = 2;
+
+    // expected field values, keyed by document id
+    final Map<String, String> firstDoc =
+        Map.of("id", "12957", "title", "Test title", "abstract", "Test abstract");
+    expected.put("12957", firstDoc);
+    final Map<String, String> secondDoc =
+        Map.of("id", "13007", "title","Test title 2", "abstract", "Test abstract 2");
+    expected.put("13007", secondDoc);
+  }
+
+  /**
+   * Verifies that the document is indexable and carries the expected id, title and abstract.
+   */
+  @Override
+  void checkDocument(SourceDocument doc, Map<String, String> expected) {
+    assertTrue(doc.indexable());
+    assertEquals(expected.get("id"), doc.id());
+
+    // title and abstract are only available on the concrete document type
+    final Iso19115Collection.Document isoDoc = (Iso19115Collection.Document) doc;
+    assertEquals(expected.get("title"), isoDoc.getTitle());
+    assertEquals(expected.get("abstract"), isoDoc.getAbstract());
+  }
+}