diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 5022316fa8..aa0ad94822 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.2.0-dev +current_version = 1.3.0-dev commit = True tag = False parse = (?P\d+)\.(?P\d+)\.(?P\d+)(?:-(?P[0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*))?(?:\+(?P[0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*))? diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 90ce0b5fbf..d6a864ded9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -85,6 +85,8 @@ jobs: run: tox -e py - name: Run slow tests run: tox -e integration + - name: Run doctests + run: tox -e doctests windows: if: "contains(github.event.head_commit.message, 'Trigger CI')" name: Windows diff --git a/.github/workflows/tests_master.yml b/.github/workflows/tests_master.yml index eb11b184bb..9719fa7a33 100644 --- a/.github/workflows/tests_master.yml +++ b/.github/workflows/tests_master.yml @@ -84,6 +84,8 @@ jobs: run: tox -e py - name: Run slow tests run: tox -e integration + - name: Run doctests + run: tox -e doctests windows: if: "!contains(github.event.head_commit.message, 'skip ci')" name: Windows diff --git a/.gitignore b/.gitignore index 7bf7dfe10a..a30a2480d9 100644 --- a/.gitignore +++ b/.gitignore @@ -117,3 +117,4 @@ docs/source/api/* scratch/* wandb/* mlruns +doctests/ diff --git a/AUTHORS.md b/AUTHORS.md index e8a1ae4295..b846ecf502 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -16,3 +16,5 @@ - [Michael Galkin](https://github.com/migalkin) - [Felix Hamann](https://github.com/kantholtz) - [Sankranti Joshi](https://github.com/sunny1401) + +See also: https://github.com/pykeen/pykeen/graphs/contributors diff --git a/LICENSE b/LICENSE index 81380cad04..f48adb8aad 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2019-2020 PyKEEN Project Team +Copyright (c) 2019-2021 PyKEEN Project Team Permission is hereby granted, free of charge, to any person obtaining a copy of this 
software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 6697258ba8..c58fd6fc27 100644 --- a/README.md +++ b/README.md @@ -300,7 +300,7 @@ See [CONTRIBUTING.md](/CONTRIBUTING.md) for more information on getting involved This project has been supported by several organizations (in alphabetical order): - [Bayer](https://www.bayer.com/) -- [Enveda Therapeutics](https://envedatherapeutics.com/) +- [Enveda Biosciences](https://www.envedabio.com/) - [Fraunhofer Institute for Algorithms and Scientific Computing](https://www.scai.fraunhofer.de) - [Fraunhofer Institute for Intelligent Analysis and Information Systems](https://www.iais.fraunhofer.de) - [Fraunhofer Center for Machine Learning](https://www.cit.fraunhofer.de/de/zentren/maschinelles-lernen.html) diff --git a/docs/source/conf.py b/docs/source/conf.py index 39106fb3be..0a7ca9f829 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -52,7 +52,7 @@ author = 'PyKEEN Project Team' # The full version, including alpha/beta/rc tags. -release = '1.2.0-dev' +release = '1.3.0-dev' # The short X.Y version. parsed_version = re.match( diff --git a/docs/source/reference/constants.rst b/docs/source/reference/constants.rst index 678d6016ec..3ca0facdba 100644 --- a/docs/source/reference/constants.rst +++ b/docs/source/reference/constants.rst @@ -2,3 +2,6 @@ Constants ========= .. automodule:: pykeen.constants :members: + +.. automodule:: pykeen.typing + :members: diff --git a/docs/source/tutorial/byod.rst b/docs/source/tutorial/byod.rst index 0947f8ab89..71bb07fe5b 100644 --- a/docs/source/tutorial/byod.rst +++ b/docs/source/tutorial/byod.rst @@ -1,7 +1,9 @@ Bring Your Own Data =================== As an alternative to using a pre-packaged dataset, the training and testing can be set explicitly -by file path or with instances of :class:`pykeen.triples.TriplesFactory`. +by file path or with instances of :class:`pykeen.triples.TriplesFactory`. 
Throughout this +tutorial, the paths to the training, testing, and validation sets for the built-in +:class:`pykeen.datasets.Nations` will be used as examples. Pre-stratified Dataset ---------------------- @@ -9,20 +11,16 @@ You've got a training and testing file as 3-column TSV files, all ready to go. Y any entities or relations appearing in the testing set that don't appear in the training set. Load them in the pipeline like this: -.. code-block:: python - - from pykeen.triples import TriplesFactory - from pykeen.pipeline import pipeline - - training_path: str = ... - testing_path: str = ... - - result = pipeline( - training_triples_factory=training_path, - testing_triples_factory=testing_path, - model='TransE', - ) - result.save_to_directory('test_pre_stratified_transe') +>>> from pykeen.triples import TriplesFactory +>>> from pykeen.pipeline import pipeline +>>> from pykeen.datasets.nations import NATIONS_TRAIN_PATH, NATIONS_TEST_PATH +>>> result = pipeline( +... training=NATIONS_TRAIN_PATH, +... testing=NATIONS_TEST_PATH, +... model='TransE', +... training_kwargs=dict(num_epochs=5), # short epochs for testing - you should go higher +... ) +>>> result.save_to_directory('doctests/test_pre_stratified_transe') PyKEEN will take care of making sure that the entities are mapped from their labels to appropriate integer (technically, 0-dimensional :class:`torch.LongTensor`) indexes and that the different sets of triples @@ -31,21 +29,17 @@ share the same mapping. This is equally applicable for the :func:`pykeen.hpo.hpo_pipeline`, which has a similar interface to the :func:`pykeen.pipeline.pipeline` as in: -.. code-block:: python - - from pykeen.triples import TriplesFactory - from pykeen.hpo import hpo_pipeline - - training_path: str = ... - testing_path: str = ... 
- - result = hpo_pipeline( - n_trials=30, - training_triples_factory=training_path, - testing_triples_factory=testing_path, - model='TransE', - ) - result.save_to_directory('test_hpo_pre_stratified_transe') +>>> from pykeen.hpo import hpo_pipeline +>>> from pykeen.datasets.nations import NATIONS_TRAIN_PATH, NATIONS_TEST_PATH, NATIONS_VALIDATE_PATH +>>> result = hpo_pipeline( +... n_trials=3, # you probably want more than this +... training=NATIONS_TRAIN_PATH, +... testing=NATIONS_TEST_PATH, +... validation=NATIONS_VALIDATE_PATH, +... model='TransE', +... training_kwargs=dict(num_epochs=5), # short epochs for testing - you should go higher +... ) +>>> result.save_to_directory('doctests/test_hpo_pre_stratified_transe') The remainder of the examples will be for :func:`pykeen.pipeline.pipeline`, but all work exactly the same for :func:`pykeen.hpo.hpo_pipeline`. @@ -53,46 +47,36 @@ for :func:`pykeen.hpo.hpo_pipeline`. If you want to add dataset-wide arguments, you can use the ``dataset_kwargs`` argument to the :class:`pykeen.pipeline.pipeline` to enable options like ``create_inverse_triples=True``. -.. code-block:: python - - from pykeen.triples import TriplesFactory - from pykeen.pipeline import pipeline - - training_path: str = ... - testing_path: str = ... - - result = pipeline( - training_triples_factory=training_path, - testing_triples_factory=testing_path, - dataset_kwargs={'create_inverse_triples': True}, - model='TransE', - ) - result.save_to_directory('test_pre_stratified_transe') +>>> from pykeen.pipeline import pipeline +>>> from pykeen.datasets.nations import NATIONS_TRAIN_PATH, NATIONS_TEST_PATH +>>> result = pipeline( +... training=NATIONS_TRAIN_PATH, +... testing=NATIONS_TEST_PATH, +... dataset_kwargs={'create_inverse_triples': True}, +... model='TransE', +... training_kwargs=dict(num_epochs=5), # short epochs for testing - you should go higher +... 
) +>>> result.save_to_directory('doctests/test_pre_stratified_transe') If you want finer control over how the triples are created, for example, if they are not all coming from TSV files, you can use the :class:`pykeen.triples.TriplesFactory` interface. -.. code-block:: python - - from pykeen.triples import TriplesFactory - from pykeen.pipeline import pipeline - - training_path: str = ... - testing_path: str = ... - - training = TriplesFactory(path=training_path) - testing = TriplesFactory( - path=testing_path, - entity_to_id=training.entity_to_id, - relation_to_id=training.relation_to_id, - ) - - result = pipeline( - training_triples_factory=training, - testing_triples_factory=testing, - model='TransE', - ) - pipeline_result.save_to_directory('test_pre_stratified_transe') +>>> from pykeen.triples import TriplesFactory +>>> from pykeen.pipeline import pipeline +>>> from pykeen.datasets.nations import NATIONS_TRAIN_PATH, NATIONS_TEST_PATH +>>> training = TriplesFactory.from_path(NATIONS_TRAIN_PATH) +>>> testing = TriplesFactory.from_path( +... NATIONS_TEST_PATH, +... entity_to_id=training.entity_to_id, +... relation_to_id=training.relation_to_id, +... ) +>>> result = pipeline( +... training=training, +... testing=testing, +... model='TransE', +... training_kwargs=dict(num_epochs=5), # short epochs for testing - you should go higher +... ) +>>> result.save_to_directory('doctests/test_pre_stratified_transe') .. warning:: @@ -106,31 +90,26 @@ The ``dataset_kwargs`` argument is ignored when passing your own :class:`pykeen. sure to include the ``create_inverse_triples=True`` in the instantiation of those classes if that's your desired behavior as in: -.. code-block:: python - - from pykeen.triples import TriplesFactory - from pykeen.pipeline import pipeline - - training_path: str = ... - testing_path: str = ... 
- - training = TriplesFactory( - path=training_path, - create_inverse_triples=True, - ) - testing = TriplesFactory( - path=testing_path, - entity_to_id=training.entity_to_id, - relation_to_id=training.relation_to_id, - create_inverse_triples=True, - ) - - result = pipeline( - training_triples_factory=training, - testing_triples_factory=testing, - model='TransE', - ) - result.save_to_directory('test_pre_stratified_transe') +>>> from pykeen.triples import TriplesFactory +>>> from pykeen.pipeline import pipeline +>>> from pykeen.datasets.nations import NATIONS_TRAIN_PATH, NATIONS_TEST_PATH +>>> training = TriplesFactory.from_path( +... NATIONS_TRAIN_PATH, +... create_inverse_triples=True, +... ) +>>> testing = TriplesFactory.from_path( +... NATIONS_TEST_PATH, +... entity_to_id=training.entity_to_id, +... relation_to_id=training.relation_to_id, +... create_inverse_triples=True, +... ) +>>> result = pipeline( +... training=training, +... testing=testing, +... model='TransE', +... training_kwargs=dict(num_epochs=5), # short epochs for testing - you should go higher +... ) +>>> result.save_to_directory('doctests/test_pre_stratified_transe') Triples factories can also be instantiated using the ``triples`` keyword argument instead of the ``path`` argument if you already have triples loaded in a :class:`numpy.ndarray`. @@ -141,37 +120,34 @@ It's more realistic your real-world dataset is not already stratified into train PyKEEN has you covered with :func:`pykeen.triples.TriplesFactory.split`, which will allow you to create a stratified dataset. -.. code-block:: python - - from pykeen.triples import TriplesFactory - from pykeen.pipeline import pipeline - - tf = TriplesFactory(path=...) 
- training, testing = tf.split() - - result = pipeline( - training_triples_factory=training, - testing_triples_factory=testing, - model='TransE', - ) - pipeline_result.save_to_directory('test_unstratified_transe') +>>> from pykeen.triples import TriplesFactory +>>> from pykeen.pipeline import pipeline +>>> from pykeen.datasets.nations import NATIONS_TRAIN_PATH +>>> tf = TriplesFactory.from_path(NATIONS_TRAIN_PATH) +>>> training, testing = tf.split() +>>> result = pipeline( +... training=training, +... testing=testing, +... model='TransE', +... training_kwargs=dict(num_epochs=5), # short epochs for testing - you should go higher +... ) +>>> result.save_to_directory('doctests/test_unstratified_transe') By default, this is an 80/20 split. If you want to use early stopping, you'll also need a validation set, so you should specify the splits: -.. code-block:: python - - from pykeen.triples import TriplesFactory - from pykeen.pipeline import pipeline - - tf = TriplesFactory(path=...) - training, testing, validation = tf.split([.8, .1, .1]) - - result = pipeline( - training_triples_factory=training, - testing_triples_factory=testing, - validation_triples_factory=validation, - model='TransE', - stopper='early', - ) - pipeline_result.save_to_directory('test_unstratified_stopped_transe') +>>> from pykeen.triples import TriplesFactory +>>> from pykeen.pipeline import pipeline +>>> from pykeen.datasets.nations import NATIONS_TRAIN_PATH +>>> tf = TriplesFactory.from_path(NATIONS_TRAIN_PATH) +>>> training, testing, validation = tf.split([.8, .1, .1]) +>>> result = pipeline( +... training=training, +... testing=testing, +... validation=validation, +... model='TransE', +... stopper='early', +... training_kwargs=dict(num_epochs=5), # short epochs for testing - you should go +... # higher, especially with early stopper enabled +... 
) +>>> result.save_to_directory('doctests/test_unstratified_stopped_transe') diff --git a/docs/source/tutorial/checkpoints.rst b/docs/source/tutorial/checkpoints.rst index 157e0bfde7..cfa550020e 100644 --- a/docs/source/tutorial/checkpoints.rst +++ b/docs/source/tutorial/checkpoints.rst @@ -17,55 +17,46 @@ Regular Checkpoints The tutorial :ref:`first_steps` showed how the :func:`pykeen.pipeline.pipeline` function can be used to set up an entire KGEM for training and evaluation in just two lines of code. A slightly extended example is shown below: -.. code-block:: python - - from pykeen.pipeline import pipeline - - pipeline_result = pipeline( - dataset='Nations', - model='TransE', - optimizer='Adam', - training_kwargs=dict( - num_epochs=1000, - ), - ) +>>> from pykeen.pipeline import pipeline +>>> pipeline_result = pipeline( +... dataset='Nations', +... model='TransE', +... optimizer='Adam', +... training_kwargs=dict( +... num_epochs=1000, +... ), +... ) To enable checkpoints, all you have to do is add a ``checkpoint_name`` argument to the ``training_kwargs``. This argument should have the name you would like the checkpoint files saved on your computer to be called. -.. code-block:: python - - from pykeen.pipeline import pipeline - - pipeline_result = pipeline( - dataset='Nations', - model='TransE', - optimizer='Adam', - training_kwargs=dict( - num_epochs=1000, - checkpoint_name='my_checkpoint.pt', - ), - ) +>>> from pykeen.pipeline import pipeline +>>> pipeline_result = pipeline( +... dataset='Nations', +... model='TransE', +... optimizer='Adam', +... training_kwargs=dict( +... num_epochs=1000, +... checkpoint_name='my_checkpoint.pt', +... ), +... ) Furthermore, you can set the checkpoint frequency, i.e. how often checkpoints should be saved given in minutes, by setting the argument ``checkpoint_frequency`` with an integer. The default frequency is 30 minutes and setting it to ``0`` will cause the training loop to save a checkpoint after each epoch. 
Let's look at an example. -.. code-block:: python - - from pykeen.pipeline import pipeline - - pipeline_result = pipeline( - dataset='Nations', - model='TransE', - optimizer='Adam', - training_kwargs=dict( - num_epochs=1000, - checkpoint_name='my_checkpoint.pt', - checkpoint_frequency=5, - ), - ) +>>> from pykeen.pipeline import pipeline +>>> pipeline_result = pipeline( +... dataset='Nations', +... model='TransE', +... optimizer='Adam', +... training_kwargs=dict( +... num_epochs=1000, +... checkpoint_name='my_checkpoint.pt', +... checkpoint_frequency=5, +... ), +... ) Here we have defined a pipeline that will save training loop checkpoints in the checkpoint file called ``my_checkpoint.pt`` every time an epoch finishes and at least `5` minutes have passed since saving previously. @@ -78,20 +69,17 @@ or the early stopper stops it. Assuming that you successfully trained the KGEM a that you would like to test the model with `2000` epochs, all you have to do is to change the number of epochs and execute the code like: -.. code-block:: python - - from pykeen.pipeline import pipeline - - pipeline_result = pipeline( - dataset='Nations', - model='TransE', - optimizer='Adam', - training_kwargs=dict( - num_epochs=2000, # more epochs than before - checkpoint_name='my_checkpoint.pt', - checkpoint_frequency=5, - ), - ) +>>> from pykeen.pipeline import pipeline +>>> pipeline_result = pipeline( +... dataset='Nations', +... model='TransE', +... optimizer='Adam', +... training_kwargs=dict( +... num_epochs=2000, # more epochs than before +... checkpoint_name='my_checkpoint.pt', +... checkpoint_frequency=5, +... ), +... ) The above code will load the saved state after finishing `1000` epochs and continue to train to `2000` epochs, giving the exact same results as if you would have run it for `2000` epochs in the first place. @@ -101,20 +89,17 @@ which is a subdirectory in your home directory, e.g. 
``~/.data/pykeen/checkpoint Optionally, you can set the path to where you want the checkpoints to be saved by setting the ``checkpoint_directory`` argument with a string or a :class:`pathlib.Path` object containing your desired root path, as shown in this example: -.. code-block:: python - - from pykeen.pipeline import pipeline - - pipeline_result = pipeline( - dataset='Nations', - model='TransE', - optimizer='Adam', - training_kwargs=dict( - num_epochs=2000, - checkpoint_name='my_checkpoint.pt', - checkpoint_directory='/my/secret/dir', - ), - ) +>>> from pykeen.pipeline import pipeline +>>> pipeline_result = pipeline( +... dataset='Nations', +... model='TransE', +... optimizer='Adam', +... training_kwargs=dict( +... num_epochs=2000, +... checkpoint_name='my_checkpoint.pt', +... checkpoint_directory='doctests/checkpoint_dir', +... ), +... ) .. _failure_checkpoints_how_to: @@ -123,16 +108,16 @@ Checkpoints on Failure In cases where you only would like to save checkpoints whenever the training loop might fail, you can use the argument ``checkpoint_on_failure=True``, like: -.. code-block:: python - - from pykeen.pipeline import pipeline - - pipeline_result = pipeline( - dataset='Nations', - model='TransE', - optimizer='Adam', - training_kwargs=dict(num_epochs=2000, checkpoint_on_failure=True), - ) +>>> from pykeen.pipeline import pipeline +>>> pipeline_result = pipeline( +... dataset='Nations', +... model='TransE', +... optimizer='Adam', +... training_kwargs=dict( +... num_epochs=2000, +... checkpoint_on_failure=True, +... ), +... ) This option differs from regular checkpoints, since regular checkpoints are only saved after a successful epoch. When saving checkpoints due to failure of the training loop there is no guarantee that all @@ -141,19 +126,17 @@ specific training loop. 
Therefore, these checkpoints are saved with a distinct c ``PyKEEN_just_saved_my_day_{datetime}.pt`` in the given ``checkpoint_directory``, even when you also opted to use regular checkpoints as defined above, e.g. with this code: -.. code-block:: python - - from pykeen.pipeline import pipeline - pipeline_result = pipeline( - dataset='Nations', - model='TransE', - optimizer='Adam', - training_kwargs=dict( - num_epochs=2000, - checkpoint_name='my_checkpoint.pt', - checkpoint_on_failure=True, - ), - ) +>>> from pykeen.pipeline import pipeline +>>> pipeline_result = pipeline( +... dataset='Nations', +... model='TransE', +... optimizer='Adam', +... training_kwargs=dict( +... num_epochs=2000, +... checkpoint_name='my_checkpoint.pt', +... checkpoint_on_failure=True, +... ), +... ) Note: Use this argument with caution, since every failed training loop will create a distinct checkpoint file. @@ -193,21 +176,17 @@ the same compared to running uninterrupted without checkpoints, also for the eva To show how to use the checkpoint functionality without the pipeline, we define a KGEM first: -.. code-block:: python - - from pykeen.models import TransE - from pykeen.training import SLCWATrainingLoop - from pykeen.triples import TriplesFactory - from torch.optim import Adam - - triples_factory = Nations().training - model = TransE( - triples_factory=triples_factory, - random_seed=123, - ) - - optimizer = Adam(params=model.get_grad_params()) - training_loop = SLCWATrainingLoop(model=model, optimizer=optimizer) +>>> from pykeen.models import TransE +>>> from pykeen.training import SLCWATrainingLoop +>>> from pykeen.datasets import Nations +>>> from torch.optim import Adam +>>> triples_factory = Nations().training +>>> model = TransE( +... triples_factory=triples_factory, +... random_seed=123, +... 
) +>>> optimizer = Adam(params=model.get_grad_params()) +>>> training_loop = SLCWATrainingLoop(model=model, optimizer=optimizer) At this point we have a model, dataset and optimizer all setup in a training loop and are ready to train the model with the ``training_loop``'s method :func:`pykeen.training.TrainingLoop.train`. To enable checkpoints all you have to do is @@ -222,13 +201,11 @@ argument with a string or a :class:`pathlib.Path` object containing your desired Here is an example: -.. code-block:: python - - losses = training_loop.train( - num_epochs=1000, - checkpoint_name='my_checkpoint.pt', - checkpoint_frequency=5, - ) +>>> losses = training_loop.train( +... num_epochs=1000, +... checkpoint_name='my_checkpoint.pt', +... checkpoint_frequency=5, +... ) With this code we have started the training loop with the above defined KGEM. The training loop will save a checkpoint in the ``my_checkpoint.pt`` file, which will be saved in the ``~/.data/pykeen/checkpoints/`` directory, since we haven't @@ -249,26 +226,22 @@ E.g. the above training loop finished successfully after 1000 epochs, but you wo train the same model from that state for 2000 epochs. All you have have to do is to change the argument ``num_epochs`` in the above code to: -.. code-block:: python - - losses = training_loop.train( - num_epochs=2000, - checkpoint_name='my_checkpoint.pt', - checkpoint_frequency=5, - ) +>>> losses = training_loop.train( +... num_epochs=2000, +... checkpoint_name='my_checkpoint.pt', +... checkpoint_frequency=5, +... ) and now the training loop will resume from the state at 1000 epochs and continue to train until 2000 epochs. As shown in :ref:`failure_checkpoints_how_to`, you can also save checkpoints only in cases where the training loop fails. To do this you just have to set the argument `checkpoint_on_failure=True`, like: -.. 
code-block:: python - - losses = training_loop.train( - num_epochs=2000, - checkpoint_directory='/my/secret/dir', - checkpoint_on_failure=True, - ) +>>> losses = training_loop.train( +... num_epochs=2000, +... checkpoint_directory='/my/secret/dir', +... checkpoint_on_failure=True, +... ) This code will save a checkpoint in case the training loop fails. Note how we also chose a new checkpoint directory by setting the `checkpoint_directory` argument to ``/my/secret/dir``. diff --git a/docs/source/tutorial/making_predictions.rst b/docs/source/tutorial/making_predictions.rst index 3e10337107..5744984674 100644 --- a/docs/source/tutorial/making_predictions.rst +++ b/docs/source/tutorial/making_predictions.rst @@ -26,30 +26,22 @@ This example shows using the :func:`pykeen.pipeline.pipeline` to train a model which will already be in memory. Each of the high-level interfaces are exposed through the model: -.. code-block:: python - - from pykeen.pipeline import pipeline - - pipeline_result = pipeline(dataset='Nations', model='RotatE') - model = pipeline_result.model - - # Predict tails - predicted_tails_df = model.get_tail_prediction_df('brazil', 'intergovorgs') - - # Predict relations - predicted_relations_df = model.get_relation_prediction_df('brazil', 'uk') - - # Predict heads - predicted_heads_df = model.get_head_prediction_df('conferences', 'brazil') - - # Score all triples (memory intensive) - predictions_df = model.get_all_prediction_df() - - # Score top K triples - predictions_df = model.get_all_prediction_df(k=150) - - # save the model - pipeline_result.save_to_directory('nations_rotate') +>>> from pykeen.pipeline import pipeline +>>> # Run the pipeline +>>> pipeline_result = pipeline(dataset='Nations', model='RotatE') +>>> model = pipeline_result.model +>>> # Predict tails +>>> predicted_tails_df = model.get_tail_prediction_df('brazil', 'intergovorgs') +>>> # Predict relations +>>> predicted_relations_df = model.get_relation_prediction_df('brazil', 'uk') +>>> # 
Predict heads +>>> predicted_heads_df = model.get_head_prediction_df('conferences', 'brazil') +>>> # Score all triples (memory intensive) +>>> predictions_df = model.get_all_prediction_df() +>>> # Score top K triples +>>> top_k_predictions_df = model.get_all_prediction_df(k=150) +>>> # save the model +>>> pipeline_result.save_to_directory('doctests/nations_rotate') Loading a Model ~~~~~~~~~~~~~~~ @@ -58,16 +50,11 @@ This example shows how to reload a previously trained model. The a file named ``trained_model.pkl``, so we will use the one from the previous example. -.. code-block:: python - - import torch - - model = torch.load('nations_rotate/trained_model.pkl') - - # Predict tails - predicted_tails_df = model.get_tail_prediction_df('brazil', 'intergovorgs') - - # everything else is the same as above +>>> import torch +>>> model = torch.load('doctests/nations_rotate/trained_model.pkl') +>>> # Predict tails +>>> predicted_tails_df = model.get_tail_prediction_df('brazil', 'intergovorgs') +>>> # everything else is the same as above There's an example model available at https://github.com/pykeen/pykeen/blob/master/notebooks/hello_world/nations_transe/trained_model.pkl diff --git a/src/pykeen/datasets/__init__.py b/src/pykeen/datasets/__init__.py index 842374448b..5019000c68 100644 --- a/src/pykeen/datasets/__init__.py +++ b/src/pykeen/datasets/__init__.py @@ -125,7 +125,7 @@ def get_dataset( raise TypeError(f'Dataset is invalid type: {type(dataset)}') if isinstance(training, str) and isinstance(testing, str): - if isinstance(validation, str): + if validation is None or isinstance(validation, str): return PathDataset( training_path=training, testing_path=testing, @@ -146,7 +146,12 @@ def get_dataset( validation=validation, ) - raise TypeError('Training and testing must both be given as strings or Triples Factories') + raise TypeError( + f'''Training and testing must both be given as strings or Triples Factories. 
+ - Training: {type(training)}: {training} + - Testing: {type(testing)}: {testing} + ''', + ) def has_dataset(key: str) -> bool: diff --git a/src/pykeen/datasets/base.py b/src/pykeen/datasets/base.py index d71cdaa9c1..ec368d09ed 100644 --- a/src/pykeen/datasets/base.py +++ b/src/pykeen/datasets/base.py @@ -177,13 +177,12 @@ def testing(self) -> TriplesFactory: # type:ignore # noqa: D401 return self._testing @property - def validation(self) -> TriplesFactory: # type:ignore # noqa: D401 + def validation(self) -> Optional[TriplesFactory]: # type:ignore # noqa: D401 """The validation triples factory that shares indices with the training triples factory.""" if not self._loaded: self._load() if not self._loaded_validation: self._load_validation() - assert self._validation is not None return self._validation @property @@ -224,7 +223,7 @@ def __init__( self, training_path: Union[str, TextIO], testing_path: Union[str, TextIO], - validation_path: Union[str, TextIO], + validation_path: Union[None, str, TextIO], eager: bool = False, create_inverse_triples: bool = False, load_triples_kwargs: Optional[Mapping[str, Any]] = None, @@ -269,14 +268,17 @@ def _load_validation(self) -> None: # don't call this function by itself. 
assumes called through the `validation` # property and the _training factory has already been loaded assert self._training is not None - self._validation = TriplesFactory.from_path( - path=self.validation_path, - entity_to_id=self._training.entity_to_id, # share entity index with training - relation_to_id=self._training.relation_to_id, # share relation index with training - # do not explicitly create inverse triples for testing; this is handled by the evaluation code - create_inverse_triples=False, - load_triples_kwargs=self.load_triples_kwargs, - ) + if self.validation_path is None: + self._validation = None + else: + self._validation = TriplesFactory.from_path( + path=self.validation_path, + entity_to_id=self._training.entity_to_id, # share entity index with training + relation_to_id=self._training.relation_to_id, # share relation index with training + # do not explicitly create inverse triples for testing; this is handled by the evaluation code + create_inverse_triples=False, + load_triples_kwargs=self.load_triples_kwargs, + ) def __repr__(self) -> str: # noqa: D105 return ( diff --git a/src/pykeen/datasets/dbpedia.py b/src/pykeen/datasets/dbpedia.py index d508d9154f..404620519d 100644 --- a/src/pykeen/datasets/dbpedia.py +++ b/src/pykeen/datasets/dbpedia.py @@ -45,8 +45,4 @@ def __init__(self, create_inverse_triples: bool = False, **kwargs): if __name__ == '__main__': - _d = DBpedia50() - _d.summarize() - print(_d.training.triples[:5]) - print(_d.testing.triples[:5]) - print(_d.validation.triples[:5]) + DBpedia50().summarize() diff --git a/src/pykeen/pipeline.py b/src/pykeen/pipeline.py index f5375ac95f..eab5371faf 100644 --- a/src/pykeen/pipeline.py +++ b/src/pykeen/pipeline.py @@ -174,6 +174,7 @@ import pickle import time from dataclasses import dataclass, field +from pathlib import Path from typing import Any, Collection, Dict, Iterable, List, Mapping, Optional, Set, Type, Union import pandas as pd @@ -423,7 +424,12 @@ def _get_results(self) -> Mapping[str, 
Any]: results['stopper'] = self.stopper.get_summary_dict() return results - def save_to_directory(self, directory: str, save_metadata: bool = True, save_replicates: bool = True) -> None: + def save_to_directory( + self, + directory: Union[str, Path], + save_metadata: bool = True, + save_replicates: bool = True, + ) -> None: """Save all artifacts in the given directory.""" os.makedirs(directory, exist_ok=True) diff --git a/src/pykeen/templates/README.md b/src/pykeen/templates/README.md index 42fdd76b4e..155b0820eb 100644 --- a/src/pykeen/templates/README.md +++ b/src/pykeen/templates/README.md @@ -202,7 +202,7 @@ See [CONTRIBUTING.md](/CONTRIBUTING.md) for more information on getting involved This project has been supported by several organizations (in alphabetical order): - [Bayer](https://www.bayer.com/) -- [Enveda Therapeutics](https://envedatherapeutics.com/) +- [Enveda Biosciences](https://www.envedabio.com/) - [Fraunhofer Institute for Algorithms and Scientific Computing](https://www.scai.fraunhofer.de) - [Fraunhofer Institute for Intelligent Analysis and Information Systems](https://www.iais.fraunhofer.de) - [Fraunhofer Center for Machine Learning](https://www.cit.fraunhofer.de/de/zentren/maschinelles-lernen.html) diff --git a/src/pykeen/typing.py b/src/pykeen/typing.py index 6ac0c4db22..d2d0b09159 100644 --- a/src/pykeen/typing.py +++ b/src/pykeen/typing.py @@ -12,18 +12,20 @@ 'Hint', 'Mutation', 'OneOrSequence', - # Others + # Triples 'LabeledTriples', 'MappedTriples', 'EntityMapping', 'RelationMapping', + # Others + 'DeviceHint', + 'TorchRandomHint', + # Tensor Functions 'Initializer', 'Normalizer', 'Constrainer', 'cast_constrainer', - 'InteractionFunction', - 'DeviceHint', - 'TorchRandomHint', + # Tensors 'HeadRepresentation', 'RelationRepresentation', 'TailRepresentation', @@ -34,6 +36,7 @@ X = TypeVar('X') Hint = Union[None, str, X] +#: A function that mutates the input and returns a new object of the same type as output Mutation = Callable[[X], X] 
OneOrSequence = Union[X, Sequence[X]] @@ -42,13 +45,12 @@ EntityMapping = Mapping[str, int] RelationMapping = Mapping[str, int] -# comment: TypeVar expects none, or at least two super-classes -TensorType = TypeVar("TensorType", torch.Tensor, torch.FloatTensor) -InteractionFunction = Callable[[TensorType, TensorType, TensorType], TensorType] - -Initializer = Mutation[TensorType] -Normalizer = Mutation[TensorType] -Constrainer = Mutation[TensorType] +#: A function that can be applied to a tensor to initialize it +Initializer = Mutation[torch.FloatTensor] +#: A function that can be applied to a tensor to normalize it +Normalizer = Mutation[torch.FloatTensor] +#: A function that can be applied to a tensor to constrain it +Constrainer = Mutation[torch.FloatTensor] def cast_constrainer(f) -> Constrainer: @@ -56,11 +58,19 @@ def cast_constrainer(f) -> Constrainer: return cast(Constrainer, f) +#: A hint for a :class:`torch.device` DeviceHint = Hint[torch.device] +#: A hint for a :class:`torch.Generator` TorchRandomHint = Hint[torch.Generator] +#: A type variable for head representations used in :class:`pykeen.models.Model`, +#: :class:`pykeen.nn.modules.Interaction`, etc. HeadRepresentation = TypeVar("HeadRepresentation", bound=OneOrSequence[torch.FloatTensor]) +#: A type variable for relation representations used in :class:`pykeen.models.Model`, +#: :class:`pykeen.nn.modules.Interaction`, etc. RelationRepresentation = TypeVar("RelationRepresentation", bound=OneOrSequence[torch.FloatTensor]) +#: A type variable for tail representations used in :class:`pykeen.models.Model`, +#: :class:`pykeen.nn.modules.Interaction`, etc. 
TailRepresentation = TypeVar("TailRepresentation", bound=OneOrSequence[torch.FloatTensor]) diff --git a/src/pykeen/version.py b/src/pykeen/version.py index 0a27740ef5..a6ffa3c798 100644 --- a/src/pykeen/version.py +++ b/src/pykeen/version.py @@ -11,7 +11,7 @@ 'get_git_hash', ] -VERSION = '1.2.0-dev' +VERSION = '1.3.0-dev' def get_git_hash() -> str: diff --git a/tox.ini b/tox.ini index 84b4400ad0..a1b7b76d99 100644 --- a/tox.ini +++ b/tox.ini @@ -17,6 +17,7 @@ envlist = doc8 docs # the actual tests + doctests py integration # always keep coverage-report last @@ -49,6 +50,15 @@ deps = extras = mlflow +[testenv:doctests] +commands = + # TODO make this automatic for all RST in a loop (but not using xargs since doctest uses multiprocessing) + python -m doctest docs/source/tutorial/first_steps.rst + python -m doctest docs/source/tutorial/byod.rst + python -m doctest docs/source/tutorial/making_predictions.rst + # python -m doctest src/pykeen/pipeline.py + # python -m doctest src/pykeen/hpo/__init__.py + [testenv:coverage-clean] deps = coverage skip_install = true