🧬 💾 Add the PharMeBINet dataset (pykeen#1257)

Closes pykeen#1256.

Just a note that I had to add some missing functionality to the `TarFileSingleDataset` class that was already present in the `SingleTabbedDataset` class. This makes it possible to specify which columns of a dataframe to use as the head, edge, and target columns.

---------

Co-authored-by: Charles Tapley Hoyt <cthoyt@gmail.com>
nicolafan · May 19, 2023 · ba82bbe · ba82bbe
1 parent fd4fe6c
commit ba82bbe
Show file tree

Hide file tree

Showing 5 changed files with 94 additions and 3 deletions.
diff --git a/README.md b/README.md
@@ -45,7 +45,7 @@
 <p align="center">
   <a href="#installation">Installation</a> •
   <a href="#quickstart">Quickstart</a> •
-  <a href="#datasets">Datasets (36)</a> •
+  <a href="#datasets">Datasets (37)</a> •
   <a href="#inductive-datasets">Inductive Datasets (5)</a> •
   <a href="#models">Models (44)</a> •
   <a href="#supporters">Support</a> •
@@ -112,7 +112,7 @@ in ``pykeen``.
 
 ### Datasets 
 
-The following 36 datasets are built in to PyKEEN. The citation for each dataset corresponds to either the paper
+The following 37 datasets are built in to PyKEEN. The citation for each dataset corresponds to either the paper
 describing the dataset, the first paper published using the dataset with knowledge graph embedding models,
 or the URL for the dataset if neither of the first two are available. If you want to use a custom dataset,
 see the [Bring Your Own Dataset](https://pykeen.readthedocs.io/en/latest/byo/data.html) tutorial. If you
@@ -146,6 +146,7 @@ have a suggestion for another dataset to include in PyKEEN, please let us know
 | OpenBioLink                        | [`pykeen.datasets.OpenBioLink`](https://pykeen.readthedocs.io/en/latest/api/pykeen.datasets.OpenBioLink.html)       | [Breit *et al*., 2020](https://doi.org/10.1093/bioinformatics/btaa274)                                                  |     180992 |          28 |   4563407 |
 | OpenBioLink LQ                     | [`pykeen.datasets.OpenBioLinkLQ`](https://pykeen.readthedocs.io/en/latest/api/pykeen.datasets.OpenBioLinkLQ.html)   | [Breit *et al*., 2020](https://doi.org/10.1093/bioinformatics/btaa274)                                                  |     480876 |          32 |  27320889 |
 | OpenEA Family                      | [`pykeen.datasets.OpenEA`](https://pykeen.readthedocs.io/en/latest/api/pykeen.datasets.OpenEA.html)                 | [Sun *et al*., 2020](http://www.vldb.org/pvldb/vol13/p2326-sun.pdf)                                                     |      15000 |         248 |     38265 |
+| PharMeBINet                        | [`pykeen.datasets.PharMeBINet`](https://pykeen.readthedocs.io/en/latest/api/pykeen.datasets.PharMeBINet.html)       | [Königs *et al*., 2022](https://www.nature.com/articles/s41597-022-01510-3)                                             |    2869407 |         208 |  15883653 |
 | PharmKG                            | [`pykeen.datasets.PharmKG`](https://pykeen.readthedocs.io/en/latest/api/pykeen.datasets.PharmKG.html)               | [Zheng *et al*., 2020](https://doi.org/10.1093/bib/bbaa344)                                                             |     188296 |          39 |   1093236 |
 | PharmKG8k                          | [`pykeen.datasets.PharmKG8k`](https://pykeen.readthedocs.io/en/latest/api/pykeen.datasets.PharmKG8k.html)           | [Zheng *et al*., 2020](https://doi.org/10.1093/bib/bbaa344)                                                             |       7247 |          28 |    485787 |
 | PrimeKG                            | [`pykeen.datasets.PrimeKG`](https://pykeen.readthedocs.io/en/latest/api/pykeen.datasets.PrimeKG.html)               | [Chandak *et al*., 2022](https://doi.org/10.1101/2022.05.01.489928)                                                     |     129375 |          30 |   8100498 |

diff --git a/docs/source/references.rst b/docs/source/references.rst
@@ -123,3 +123,6 @@ References
 .. [peng2020] Y. Peng and J. Zhang (2020) `LineaRE: Simple but Powerful Knowledge Graph Embedding for
    Link Prediction <https://arxiv.org/abs/2004.10037>`_, *2020 IEEE International Conference on Data Mining (ICDM)*,
    pp. 422-431, doi: 10.1109/ICDM50108.2020.00051.
+
+.. [koenigs2022] Königs, C., *et al* (2022) `The heterogeneous pharmacological medical biochemical
+   network PharMeBINet <https://doi.org/10.1038/s41597-022-01510-3>`_, *Scientific Data*, **9**, 393.
diff --git a/src/pykeen/datasets/__init__.py b/src/pykeen/datasets/__init__.py
@@ -45,6 +45,7 @@
 from .nations import Nations
 from .ogb import OGBBioKG, OGBLoader, OGBWikiKG2
 from .openbiolink import OpenBioLink, OpenBioLinkLQ
+from .pharmebinet import PharMeBINet
 from .pharmkg import PharmKG, PharmKG8k
 from .primekg import PrimeKG
 from .umls import UMLS
@@ -97,6 +98,7 @@
     "PharmKG",
     "PrimeKG",
     "Globi",
+    "PharMeBINet",
 ]
 
 logger = logging.getLogger(__name__)

diff --git a/src/pykeen/datasets/base.py b/src/pykeen/datasets/base.py
@@ -717,6 +717,7 @@ def __init__(
         create_inverse_triples: bool = False,
         delimiter: Optional[str] = None,
         random_state: TorchRandomHint = None,
+        read_csv_kwargs: Optional[Dict[str, Any]] = None,
     ):
         """Initialize dataset.
 
@@ -734,6 +735,7 @@ def __init__(
         :param random_state: An optional random state to make the training/testing/validation split reproducible.
         :param delimiter:
             The delimiter for the contained dataset.
+        :param read_csv_kwargs: Keyword arguments to pass through to :func:`pandas.read_csv`.
         """
         self.cache_root = self._help_cache(cache_root)
 
@@ -743,6 +745,8 @@ def __init__(
         self.url = url
         self._create_inverse_triples = create_inverse_triples
         self._relative_path = pathlib.PurePosixPath(relative_path)
+        self.read_csv_kwargs = read_csv_kwargs or {}
+        self.read_csv_kwargs.setdefault("sep", self.delimiter)
 
         if eager:
             self._load()
@@ -808,7 +812,13 @@ def _get_df(self) -> pd.DataFrame:
                 # tarfile does not like pathlib
                 tar_file.extract(str(self._relative_path), self.cache_root)
 
-        df = pd.read_csv(_actual_path, sep=self.delimiter)
+        df = pd.read_csv(_actual_path, **self.read_csv_kwargs)
+
+        usecols = self.read_csv_kwargs.get("usecols")
+        if usecols is not None:
+            logger.info("reordering columns: %s", usecols)
+            df = df[usecols]
+
         return df
 
 

diff --git a/src/pykeen/datasets/pharmebinet.py b/src/pykeen/datasets/pharmebinet.py
@@ -0,0 +1,75 @@
+# -*- coding: utf-8 -*-
+
+"""The `PharMeBINet <https://github.com/ckoenigs/PharMeBINet/>`_ dataset.
+
+Get a summary with ``python -m pykeen.datasets.pharmebinet``.
+"""
+
+import click
+from docdata import parse_docdata
+from more_click import verbose_option
+
+from .base import TarFileSingleDataset
+from ..typing import TorchRandomHint
+
+__all__ = [
+    "PharMeBINet",
+]
+
+RAW_URL = "https://zenodo.org/record/7011027/files/pharmebinet_tsv_2022_08_19_v2.tar.gz"
+
+
+@parse_docdata
+class PharMeBINet(TarFileSingleDataset):
+    """The PharMeBINet dataset from [koenigs2022]_.
+
+    ---
+    name: PharMeBINet
+    citation:
+        github: ckoenigs/PharMeBINet
+        author: Königs
+        year: 2022
+        link: https://www.nature.com/articles/s41597-022-01510-3
+    single: true
+    statistics:
+        entities: 2869407
+        relations: 208
+        triples: 15883653
+        training: 12702210
+        testing: 1587776
+        validation: 1587777
+    """
+
+    def __init__(
+        self,
+        random_state: TorchRandomHint = 0,
+        **kwargs,
+    ):
+        """Initialize the PharMeBINet dataset from [koenigs2022]_.
+
+        :param random_state: An optional random state to make the training/testing/validation split reproducible.
+        :param kwargs: keyword arguments passed to :class:`pykeen.datasets.base.TarFileSingleDataset`.
+        """
+        super().__init__(
+            url=RAW_URL,
+            relative_path="edges.tsv",
+            random_state=random_state,
+            read_csv_kwargs=dict(
+                usecols=["start_id", "type", "end_id"],
+                sep="\t",
+                dtype={"start_id": str, "end_id": str},
+            ),
+            **kwargs,
+        )
+
+
+@click.command()
+@verbose_option
+def _main():
+    from pykeen.datasets import get_dataset
+
+    get_dataset(dataset=PharMeBINet).summarize()
+
+
+if __name__ == "__main__":
+    _main()