Skip to content

Commit

Permalink
avoid empty aliases and improve UX and docs (explosion#6840)
Browse files Browse the repository at this point in the history
svlandeg authored Jan 29, 2021
1 parent 837a4f5 commit 24a697a
Showing 4 changed files with 35 additions and 2 deletions.
4 changes: 4 additions & 0 deletions spacy/errors.py
Original file line number Diff line number Diff line change
@@ -470,6 +470,10 @@ class Errors:
"issue tracker: http://github.com/explosion/spaCy/issues")

# TODO: fix numbering after merging develop into master
E890 = ("Can not add the alias '{alias}' to the Knowledge base. "
"Each alias should be a meaningful string.")
E891 = ("Alias '{alias}' could not be added to the Knowledge base. "
"This is likely a bug in spaCy.")
E892 = ("Unknown function registry: '{name}'.\n\nAvailable names: {available}")
E893 = ("Could not find function '{name}' in function registry '{reg_name}'. "
"If you're using a custom function, make sure the code is available. "
6 changes: 6 additions & 0 deletions spacy/kb.pyx
Original file line number Diff line number Diff line change
@@ -187,6 +187,10 @@ cdef class KnowledgeBase:
For a given alias, add its potential entities and prior probabilies to the KB.
Return the alias_hash at the end
"""
if alias is None or len(alias) == 0:
raise ValueError(Errors.E890.format(alias=alias))

previous_alias_nr = self.get_size_aliases()
# Throw an error if the length of entities and probabilities are not the same
if not len(entities) == len(probabilities):
raise ValueError(Errors.E132.format(alias=alias,
@@ -220,6 +224,8 @@ cdef class KnowledgeBase:
new_index = self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs)
self._alias_index[alias_hash] = new_index

if previous_alias_nr + 1 != self.get_size_aliases():
raise RuntimeError(Errors.E891.format(alias=alias))
return alias_hash

def append_alias(self, unicode alias, unicode entity, float prior_prob, ignore_warnings=False):
23 changes: 23 additions & 0 deletions spacy/tests/regression/test_issue6730.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import pytest
from ..util import make_tempdir


def test_issue6730(en_vocab):
"""Ensure that the KB does not accept empty strings, but otherwise IO works fine."""
from spacy.kb import KnowledgeBase

kb = KnowledgeBase(en_vocab, entity_vector_length=3)
kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])

with pytest.raises(ValueError):
kb.add_alias(alias="", entities=["1"], probabilities=[0.4])
assert kb.contains_alias("") is False

kb.add_alias(alias="x", entities=["1"], probabilities=[0.2])
kb.add_alias(alias="y", entities=["1"], probabilities=[0.1])

with make_tempdir() as tmp_dir:
kb.to_disk(tmp_dir)
kb.from_disk(tmp_dir)
assert kb.get_size_aliases() == 2
assert set(kb.get_alias_strings()) == {"x", "y"}
4 changes: 2 additions & 2 deletions website/docs/api/kb.md
Original file line number Diff line number Diff line change
@@ -82,7 +82,7 @@ Add an alias or mention to the knowledge base, specifying its potential KB
identifiers and their prior probabilities. The entity identifiers should refer
to entities previously added with [`add_entity`](/api/kb#add_entity) or
[`set_entities`](/api/kb#set_entities). The sum of the prior probabilities
should not exceed 1.
should not exceed 1. Note that an empty string can not be used as alias.
> #### Example
>
@@ -92,7 +92,7 @@ should not exceed 1.
| Name | Description |
| --------------- | --------------------------------------------------------------------------------- |
| `alias` | The textual mention or alias. ~~str~~ |
| `alias` | The textual mention or alias. Can not be the empty string. ~~str~~ |
| `entities` | The potential entities that the alias may refer to. ~~Iterable[Union[str, int]]~~ |
| `probabilities` | The prior probabilities of each entity. ~~Iterable[float]~~ |

0 comments on commit 24a697a

Please sign in to comment.