-
Notifications
You must be signed in to change notification settings - Fork 468
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Iso19115 indexer+generator (#1266)
- Loading branch information
Showing
5 changed files
with
1,532 additions
and
0 deletions.
There are no files selected for viewing
132 changes: 132 additions & 0 deletions
132
src/main/java/io/anserini/collection/Iso19115Collection.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
/* | ||
* Anserini: A Lucene toolkit for replicable information retrieval research | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package io.anserini.collection; | ||
|
||
import com.fasterxml.jackson.databind.JsonNode; | ||
import com.fasterxml.jackson.databind.MappingIterator; | ||
import com.fasterxml.jackson.databind.ObjectMapper; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.FileReader; | ||
import java.io.IOException; | ||
import java.nio.file.Path; | ||
import java.util.Arrays; | ||
import java.util.HashSet; | ||
import java.util.Iterator; | ||
import java.util.NoSuchElementException; | ||
|
||
public class Iso19115Collection extends DocumentCollection<Iso19115Collection.Document>{ | ||
public Iso19115Collection(Path path){ | ||
this.path = path; | ||
this.allowedFileSuffix = new HashSet<>(Arrays.asList(".json", ".jsonl")); | ||
} | ||
|
||
@Override | ||
public FileSegment<Iso19115Collection.Document> createFileSegment(Path p) throws IOException{ | ||
return new Segment(p); | ||
} | ||
|
||
public static class Segment extends FileSegment<Iso19115Collection.Document> { | ||
private JsonNode node = null; | ||
private Iterator<JsonNode> iter = null; | ||
private MappingIterator<JsonNode> iterator; | ||
|
||
public Segment(Path path) throws IOException { | ||
super(path); | ||
bufferedReader = new BufferedReader(new FileReader(path.toString())); | ||
ObjectMapper mapper = new ObjectMapper(); | ||
iterator = mapper.readerFor(JsonNode.class).readValues(bufferedReader); | ||
if(iterator.hasNext()){ | ||
node = iterator.next(); | ||
if(node.isArray()) { | ||
iter = node.elements(); | ||
} | ||
} | ||
} | ||
|
||
@Override | ||
public void readNext() throws NoSuchElementException { | ||
if (node == null){ | ||
throw new NoSuchElementException("JsonNode is empty"); | ||
} else if (node.isObject()) { | ||
bufferedRecord = new Iso19115Collection.Document(node); | ||
if(iterator.hasNext()) { | ||
node = iterator.next(); | ||
} else { | ||
atEOF = true; | ||
} | ||
} else if (node.isArray()) { | ||
if (iter != null && iter.hasNext()) { | ||
JsonNode json = iter.next(); | ||
bufferedRecord = new Iso19115Collection.Document(node); | ||
} else { | ||
throw new NoSuchElementException("Reached end of JsonNode iterator"); | ||
} | ||
} else { | ||
throw new NoSuchElementException("Invalid JsonNode type"); | ||
} | ||
} | ||
} | ||
|
||
public static class Document implements SourceDocument{ | ||
protected String id; | ||
protected String title; | ||
protected String abstractContent; | ||
protected String raw; | ||
|
||
public Document(JsonNode json) { | ||
// extracting the fields from the ISO19115 file | ||
this.raw = json.toString(); | ||
String identifier = json.get("gmd:MD_Metadata").get("gmd:fileIdentifier").get("gco:CharacterString").asText(); | ||
// extracting the id in the beginning of the text | ||
this.id = identifier.substring(0,identifier.length() - 8); | ||
this.title = json.get("gmd:MD_Metadata").get("gmd:identificationInfo").get("gmd:MD_DataIdentification").get("gmd:citation") | ||
.get("gmd:CI_Citation").get("gmd:title").get("gco:CharacterString").asText(); | ||
this.abstractContent = json.get("gmd:MD_Metadata").get("gmd:identificationInfo").get("gmd:MD_DataIdentification") | ||
.get("gmd:abstract").get("gco:CharacterString").asText(); | ||
} | ||
|
||
@Override | ||
public String id() { | ||
return id; | ||
} | ||
|
||
@Override | ||
public String contents() { | ||
return title + "\n" + abstractContent; | ||
} | ||
|
||
@Override | ||
public String raw() { | ||
return raw; | ||
} | ||
|
||
public String getTitle() { | ||
return title; | ||
} | ||
|
||
public String getAbstract() { | ||
return abstractContent; | ||
} | ||
|
||
@Override | ||
public boolean indexable() { | ||
return true; | ||
} | ||
|
||
} | ||
} |
53 changes: 53 additions & 0 deletions
53
src/main/java/io/anserini/index/generator/Iso19115Generator.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
/* | ||
* Anserini: A Lucene toolkit for replicable information retrieval research | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package io.anserini.index.generator; | ||
|
||
import io.anserini.collection.Iso19115Collection; | ||
import io.anserini.index.IndexArgs; | ||
import org.apache.lucene.document.Document; | ||
import org.apache.lucene.document.StoredField; | ||
|
||
|
||
public class Iso19115Generator extends DefaultLuceneDocumentGenerator<Iso19115Collection.Document>{ | ||
protected IndexArgs args; | ||
|
||
// constants for storing | ||
public enum Iso19115Field { | ||
ID("id"), | ||
TITLE("title"), | ||
ABSTRACT("abstract"); | ||
|
||
public final String name; | ||
|
||
Iso19115Field(String s) { | ||
name = s; | ||
} | ||
} | ||
|
||
public Iso19115Generator(IndexArgs args) { | ||
super(args); | ||
this.args = args; | ||
} | ||
|
||
public Document createDocument(Iso19115Collection.Document doc) throws GeneratorException { | ||
Document document = super.createDocument(doc); | ||
|
||
document.add(new StoredField(Iso19115Field.TITLE.name, doc.getTitle())); | ||
document.add(new StoredField(Iso19115Field.ABSTRACT.name, doc.getAbstract())); | ||
return document; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
''' | ||
* Anserini: A Lucene toolkit for replicable information retrieval research | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
''' | ||
|
||
import json | ||
import xmltodict | ||
import os | ||
|
||
def convert_collection(args):
    """Convert a folder of ISO 19115 XML files into a single JSON array file.

    Every file found in the ``datasets_xml`` sub-directory of
    ``args.collection_path`` is parsed with ``xmltodict`` and written, as one
    array element, to ``output.json`` inside ``args.output_folder``.

    Args:
        args: object exposing ``collection_path`` (directory containing
            ``datasets_xml``) and ``output_folder`` (existing directory that
            receives ``output.json``).
    """
    print("Converting collection...")
    # Build paths with os.path.join so they work with or without a trailing separator.
    xml_dir = os.path.join(args.collection_path, 'datasets_xml')
    output_file = os.path.join(args.output_folder, 'output.json')

    with open(output_file, 'w', encoding='utf8') as out_file:
        out_file.write('[\n')
        is_first = True
        # Sorted so the output order does not depend on the file system.
        for xml_file in sorted(os.listdir(xml_dir)):
            print(xml_file)
            # Read from the input directory, not from the current working directory.
            with open(os.path.join(xml_dir, xml_file), 'r', encoding='utf8') as f:
                xml_string = f.read()

            json_string = json.dumps(xmltodict.parse(xml_string), indent=4)

            # Separate elements with commas; a trailing comma would make the JSON invalid.
            if not is_first:
                out_file.write(',\n')
            out_file.write(json_string)
            is_first = False

        out_file.write('\n]')
|
||
if __name__ == '__main__':
    # Command-line entry point: parse the arguments, make sure the output
    # folder exists, then run the conversion.
    # argparse is imported here because the module's top-level imports do not include it.
    import argparse

    parser = argparse.ArgumentParser(description='Converts iso19115 xml files to json files.')
    # dest= makes the parsed attribute names match the ones read below and by
    # convert_collection (args.collection_path / args.output_folder).
    parser.add_argument('--collection', dest='collection_path', required=True, help='iso19115 collection file')
    parser.add_argument('--output', dest='output_folder', required=True, help='output folder')
    # not used yet since dataset is very small
    parser.add_argument('--max-doc-per-file', default=1000000, type=int, help='maximum number of documents in each jsonl file.')

    args = parser.parse_args()

    os.makedirs(args.output_folder, exist_ok=True)

    convert_collection(args)
    print('Done!')
|
50 changes: 50 additions & 0 deletions
50
src/test/java/io/anserini/collection/Iso19115CollectionTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
/* | ||
* Anserini: A Lucene toolkit for replicable information retrieval research | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package io.anserini.collection; | ||
|
||
import org.junit.Before; | ||
|
||
import java.nio.file.Path; | ||
import java.nio.file.Paths; | ||
import java.util.Map; | ||
|
||
public class Iso19115CollectionTest extends DocumentCollectionTest<Iso19115Collection.Document> { | ||
@Before | ||
public void setUp() throws Exception { | ||
super.setUp(); | ||
collectionPath = Paths.get("src/test/resources/sample_docs/iso19115"); | ||
collection = new Iso19115Collection(collectionPath); | ||
Path segment = Paths.get("src/test/resources/sample_docs/iso19115/output.json"); | ||
|
||
segmentPaths.add(segment); | ||
segmentDocCounts.put(segment, 2); | ||
|
||
totalSegments = 1; | ||
totalDocs = 2; | ||
|
||
expected.put("12957", Map.of("id", "12957", "title", "Test title", "abstract", "Test abstract")); | ||
expected.put("13007", Map.of("id", "13007", "title","Test title 2", "abstract", "Test abstract 2")); | ||
} | ||
|
||
@Override | ||
void checkDocument(SourceDocument doc, Map<String, String> expected) { | ||
assertTrue(doc.indexable()); | ||
assertEquals(expected.get("id"), doc.id()); | ||
assertEquals(expected.get("title"), ((Iso19115Collection.Document) doc).getTitle()); | ||
assertEquals(expected.get("abstract"), ((Iso19115Collection.Document) doc).getAbstract()); | ||
} | ||
} |
Oops, something went wrong.