Skip to content

Commit

Permalink
Add Iso19115 indexer+generator (#1266)
Browse files Browse the repository at this point in the history
  • Loading branch information
shaneding authored Jun 9, 2020
1 parent 53c6b8b commit 7954eab
Show file tree
Hide file tree
Showing 5 changed files with 1,532 additions and 0 deletions.
132 changes: 132 additions & 0 deletions src/main/java/io/anserini/collection/Iso19115Collection.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
/*
* Anserini: A Lucene toolkit for replicable information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.anserini.collection;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.MappingIterator;
import com.fasterxml.jackson.databind.ObjectMapper;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.NoSuchElementException;

/**
 * A document collection of ISO 19115 metadata records stored as JSON.
 *
 * <p>Each segment file may hold either a sequence of top-level JSON objects (one record
 * each) or a single top-level JSON array whose elements are the records.
 */
public class Iso19115Collection extends DocumentCollection<Iso19115Collection.Document> {
  /**
   * Creates a collection rooted at the given path.
   *
   * @param path location of the collection
   */
  public Iso19115Collection(Path path) {
    this.path = path;
    // Only files with these suffixes are considered part of the collection
    // (presumably enforced by the base class -- TODO confirm).
    this.allowedFileSuffix = new HashSet<>(Arrays.asList(".json", ".jsonl"));
  }

  /**
   * Opens the given file as a segment of this collection.
   *
   * @param p path of the segment file
   * @return a new {@link Segment} reading from {@code p}
   * @throws IOException if the file cannot be opened or its first JSON value cannot be read
   */
  @Override
  public FileSegment<Iso19115Collection.Document> createFileSegment(Path p) throws IOException {
    return new Segment(p);
  }

  /**
   * A single JSON file of ISO 19115 records.
   */
  public static class Segment extends FileSegment<Iso19115Collection.Document> {
    /** The top-level JSON value currently being consumed; {@code null} if the file is empty. */
    private JsonNode node = null;
    /** Iterator over the elements of {@link #node} when it is a JSON array, otherwise {@code null}. */
    private Iterator<JsonNode> iter = null;
    /** Iterator over the top-level JSON values of the file. */
    private MappingIterator<JsonNode> iterator;

    /**
     * Opens the file and pre-reads its first top-level JSON value.
     *
     * @param path path of the segment file
     * @throws IOException if the file cannot be opened or parsed
     */
    public Segment(Path path) throws IOException {
      super(path);
      // NOTE(review): FileReader decodes with the platform default charset on older JDKs;
      // confirm that the input files are compatible with it.
      bufferedReader = new BufferedReader(new FileReader(path.toString()));
      ObjectMapper mapper = new ObjectMapper();
      iterator = mapper.readerFor(JsonNode.class).readValues(bufferedReader);
      if (iterator.hasNext()) {
        node = iterator.next();
        if (node.isArray()) {
          iter = node.elements();
        }
      }
    }

    /**
     * Reads the next record into {@code bufferedRecord}.
     *
     * @throws NoSuchElementException if the file is empty, the array of records is exhausted,
     *     or the top-level JSON value is neither an object nor an array
     */
    @Override
    public void readNext() throws NoSuchElementException {
      if (node == null) {
        throw new NoSuchElementException("JsonNode is empty");
      } else if (node.isObject()) {
        // One record per top-level object: emit the current one, then look ahead to the next.
        bufferedRecord = new Iso19115Collection.Document(node);
        if (iterator.hasNext()) {
          node = iterator.next();
        } else {
          atEOF = true;
        }
      } else if (node.isArray()) {
        if (iter != null && iter.hasNext()) {
          // Build the document from the array ELEMENT. Passing the enclosing array would make
          // every field lookup in the Document constructor return null.
          bufferedRecord = new Iso19115Collection.Document(iter.next());
        } else {
          throw new NoSuchElementException("Reached end of JsonNode iterator");
        }
      } else {
        throw new NoSuchElementException("Invalid JsonNode type");
      }
    }
  }

  /**
   * A single ISO 19115 metadata record, exposing its identifier, title and abstract.
   */
  public static class Document implements SourceDocument {
    protected String id;
    protected String title;
    protected String abstractContent;
    protected String raw;

    /**
     * Extracts the id, title and abstract from a JSON-encoded ISO 19115 record.
     *
     * <p>The expected fields are looked up unconditionally, so a record that lacks any of them
     * causes a {@link NullPointerException}.
     *
     * @param json the record, with {@code gmd:MD_Metadata} as its top-level field
     */
    public Document(JsonNode json) {
      this.raw = json.toString();

      JsonNode metadata = json.get("gmd:MD_Metadata");
      String identifier = metadata.get("gmd:fileIdentifier").get("gco:CharacterString").asText();
      // The id is the file identifier without its last 8 characters
      // (presumably a fixed-length suffix -- TODO confirm against the data).
      this.id = identifier.substring(0, identifier.length() - 8);

      JsonNode identification =
          metadata.get("gmd:identificationInfo").get("gmd:MD_DataIdentification");
      this.title = identification.get("gmd:citation").get("gmd:CI_Citation").get("gmd:title")
          .get("gco:CharacterString").asText();
      this.abstractContent =
          identification.get("gmd:abstract").get("gco:CharacterString").asText();
    }

    @Override
    public String id() {
      return id;
    }

    /** Returns the title and the abstract separated by a newline. */
    @Override
    public String contents() {
      return title + "\n" + abstractContent;
    }

    /** Returns the JSON serialization of the record this document was built from. */
    @Override
    public String raw() {
      return raw;
    }

    public String getTitle() {
      return title;
    }

    public String getAbstract() {
      return abstractContent;
    }

    /** Always {@code true}: every record is indexable. */
    @Override
    public boolean indexable() {
      return true;
    }

  }
}
53 changes: 53 additions & 0 deletions src/main/java/io/anserini/index/generator/Iso19115Generator.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
* Anserini: A Lucene toolkit for replicable information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.anserini.index.generator;

import io.anserini.collection.Iso19115Collection;
import io.anserini.index.IndexArgs;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;


/**
 * Builds Lucene documents from {@link Iso19115Collection.Document} records, adding the
 * record's title and abstract as stored fields on top of what the parent generator produces.
 */
public class Iso19115Generator extends DefaultLuceneDocumentGenerator<Iso19115Collection.Document> {
  protected IndexArgs args;

  /**
   * Names of the Lucene fields associated with an ISO 19115 record.
   */
  public enum Iso19115Field {
    ID("id"),
    TITLE("title"),
    ABSTRACT("abstract");

    /** The field name as written to the index. */
    public final String name;

    Iso19115Field(String fieldName) {
      this.name = fieldName;
    }
  }

  /**
   * Creates a generator configured with the given indexing arguments.
   *
   * @param args indexing arguments, also passed to the parent generator
   */
  public Iso19115Generator(IndexArgs args) {
    super(args);
    this.args = args;
  }

  /**
   * Creates the Lucene document for a record.
   *
   * @param doc the source record
   * @return the parent generator's document with stored title and abstract fields added
   * @throws GeneratorException if the parent generator fails to create the document
   */
  public Document createDocument(Iso19115Collection.Document doc) throws GeneratorException {
    final Document luceneDoc = super.createDocument(doc);

    final StoredField titleField = new StoredField(Iso19115Field.TITLE.name, doc.getTitle());
    luceneDoc.add(titleField);

    final StoredField abstractField =
        new StoredField(Iso19115Field.ABSTRACT.name, doc.getAbstract());
    luceneDoc.add(abstractField);

    return luceneDoc;
  }
}
54 changes: 54 additions & 0 deletions src/main/python/iso19115/convert_iso_to_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
'''
* Anserini: A Lucene toolkit for replicable information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
'''

import json
import xmltodict
import os

def convert_collection(args):
    """Convert the XML files of an ISO 19115 collection into a single JSON array file.

    Every file found in ``<args.collection_path>/datasets_xml`` is parsed with
    ``xmltodict`` and the results are written, as the elements of one JSON
    array, to ``<args.output_folder>/output.json``.

    Args:
        args: object with ``collection_path`` (folder containing the
            ``datasets_xml`` subfolder) and ``output_folder`` (existing folder
            that receives ``output.json``) attributes.
    """
    print("Converting collection...")
    # os.path.join inserts the separator, so the paths work with or without a
    # trailing slash on the arguments.
    xml_dir = os.path.join(args.collection_path, 'datasets_xml')
    output_file = os.path.join(args.output_folder, 'output.json')

    with open(output_file, 'w', encoding='utf8') as out_file:
        out_file.write('[\n')
        # os.listdir returns entries in arbitrary order; sort for reproducible output.
        for index, xml_file in enumerate(sorted(os.listdir(xml_dir))):
            print(xml_file)
            # Read from the collection's datasets_xml folder, not the current directory.
            with open(os.path.join(xml_dir, xml_file), 'r', encoding='utf8') as f:
                xml_string = f.read()

            json_string = json.dumps(xmltodict.parse(xml_string), indent=4)

            # Commas go BETWEEN entries only: a trailing comma before ']' is invalid JSON.
            if index > 0:
                out_file.write(',')
            out_file.write(json_string)

        out_file.write('\n]')

if __name__ == '__main__':
    # Command-line entry point: parse the arguments, make sure the output
    # folder exists, then convert the collection.
    # argparse is imported here because the module-level imports do not provide it.
    import argparse

    parser = argparse.ArgumentParser(description='Converts iso19115 xml files to json files.')
    # dest= maps the flags onto the attribute names read below and by convert_collection().
    parser.add_argument('--collection', dest='collection_path', required=True,
                        help='iso19115 collection folder (contains the datasets_xml subfolder)')
    parser.add_argument('--output', dest='output_folder', required=True, help='output folder')
    # not used yet since dataset is very small
    parser.add_argument('--max-doc-per-file', default=1000000, type=int, help='maximum number of documents in each jsonl file.')

    args = parser.parse_args()

    # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
    os.makedirs(args.output_folder, exist_ok=True)

    convert_collection(args)
    print('Done!')

50 changes: 50 additions & 0 deletions src/test/java/io/anserini/collection/Iso19115CollectionTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/*
* Anserini: A Lucene toolkit for replicable information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.anserini.collection;

import org.junit.Before;

import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Map;

/**
 * Checks that {@link Iso19115Collection} reads the sample ISO 19115 segment and exposes the
 * expected id, title and abstract for each of its documents.
 */
public class Iso19115CollectionTest extends DocumentCollectionTest<Iso19115Collection.Document> {
  /**
   * Registers the sample collection, its single segment and the expected documents.
   */
  @Before
  public void setUp() throws Exception {
    super.setUp();

    // The collection under test.
    collectionPath = Paths.get("src/test/resources/sample_docs/iso19115");
    collection = new Iso19115Collection(collectionPath);

    // One segment holding two documents.
    final Path sampleSegment = Paths.get("src/test/resources/sample_docs/iso19115/output.json");
    segmentPaths.add(sampleSegment);
    segmentDocCounts.put(sampleSegment, 2);
    totalSegments = 1;
    totalDocs = 2;

    // Expected field values, keyed by document id.
    expected.put("12957", Map.of("id", "12957", "title", "Test title", "abstract", "Test abstract"));
    expected.put("13007", Map.of("id", "13007", "title", "Test title 2", "abstract", "Test abstract 2"));
  }

  /**
   * Verifies one document against its expected id, title and abstract.
   */
  @Override
  void checkDocument(SourceDocument doc, Map<String, String> expected) {
    assertTrue(doc.indexable());
    assertEquals(expected.get("id"), doc.id());

    final Iso19115Collection.Document isoDoc = (Iso19115Collection.Document) doc;
    assertEquals(expected.get("title"), isoDoc.getTitle());
    assertEquals(expected.get("abstract"), isoDoc.getAbstract());
  }
}
Loading

0 comments on commit 7954eab

Please sign in to comment.