forked from pykeen/pykeen
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
🧬 💾 Add the PharMeBINet dataset (pykeen#1257)
Closes pykeen#1256. Note: I had to add some functionality to the `TarFileSingleDataset` class that was already present in the `SingleTabbedDataset` class but missing here. This makes it possible to specify which dataframe columns are used as the head, edge, and target columns. --------- Co-authored-by: Charles Tapley Hoyt <cthoyt@gmail.com>
- Loading branch information
Showing
5 changed files
with
94 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
"""The `PharMeBINet <https://github.com/ckoenigs/PharMeBINet/>`_ dataset. | ||
Get a summary with ``python -m pykeen.datasets.pharmebinet``. | ||
""" | ||
|
||
import click | ||
from docdata import parse_docdata | ||
from more_click import verbose_option | ||
|
||
from .base import TarFileSingleDataset | ||
from ..typing import TorchRandomHint | ||
|
||
# Public API of this module: only the dataset class is exported.
__all__ = [
    "PharMeBINet",
]
|
||
# Download location of the gzipped tar archive (a Zenodo record) that the
# dataset class below fetches its edge list from.
RAW_URL = "https://zenodo.org/record/7011027/files/pharmebinet_tsv_2022_08_19_v2.tar.gz"
|
||
|
||
@parse_docdata
class PharMeBINet(TarFileSingleDataset):
    """The PharMeBINet dataset from [koenigs2022]_.

    ---
    name: PharMeBINet
    citation:
        github: ckoenigs/PharMeBINet
        author: Königs
        year: 2022
        link: https://www.nature.com/articles/s41597-022-01510-3
    single: true
    statistics:
        entities: 2869407
        relations: 208
        triples: 15883653
        training: 12702210
        testing: 1587776
        validation: 1587777
    """

    # NOTE(review): everything after the ``---`` marker in the class docstring
    # is structured metadata consumed by ``parse_docdata``; the nesting of the
    # ``citation`` and ``statistics`` keys is significant and must be kept.

    def __init__(
        self,
        random_state: TorchRandomHint = 0,
        **kwargs,
    ) -> None:
        """Initialize the PharMeBINet dataset from [koenigs2022]_.

        The edge list is read from ``edges.tsv`` inside the archive located at
        ``RAW_URL``.

        :param random_state: An optional random state to make the
            training/testing/validation split reproducible.
        :param kwargs: keyword arguments passed to
            :class:`pykeen.datasets.base.TarFileSingleDataset`.
        """
        # Options for parsing the edge table (presumably forwarded to
        # ``pandas.read_csv`` by the base class -- confirm there).
        read_csv_kwargs = {
            # Only the three columns that make up a triple are loaded.
            "usecols": ["start_id", "type", "end_id"],
            # The file is tab-separated.
            "sep": "\t",
            # Force node identifiers to be read as strings rather than having
            # their dtype inferred.
            "dtype": {"start_id": str, "end_id": str},
        }
        super().__init__(
            url=RAW_URL,
            relative_path="edges.tsv",
            random_state=random_state,
            read_csv_kwargs=read_csv_kwargs,
            **kwargs,
        )
|
||
|
||
@click.command()
@verbose_option
def _main():
    """Load the PharMeBINet dataset and print its summary."""
    # Imported inside the function rather than at module level
    # (presumably to avoid a circular import with the package -- confirm).
    from pykeen.datasets import get_dataset

    get_dataset(dataset=PharMeBINet).summarize()
|
||
|
||
# Script entry point: run the click command when executed as
# ``python -m pykeen.datasets.pharmebinet``.
if __name__ == "__main__":
    _main()