-
Notifications
You must be signed in to change notification settings - Fork 42
/
Copy pathtest_backend_pav.py
192 lines (154 loc) · 5.86 KB
/
test_backend_pav.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
"""Unit tests for the PAV backend in Annif"""
import logging
from datetime import datetime, timedelta, timezone
import py.path
import pytest
import annif.backend
import annif.corpus
from annif.exception import NotSupportedException
def test_pav_default_params(document_corpus, app_project):
    """A PAV backend built from an empty config exposes the default params.

    Checks that ``min-docs`` is 10 and ``limit`` is 100 in ``pav.params``.
    """
    backend_cls = annif.backend.get_backend("pav")
    backend = backend_cls(backend_id="pav", config_params={}, project=app_project)

    defaults = {"min-docs": 10, "limit": 100}
    effective = backend.params
    for name, expected in defaults.items():
        assert name in effective and effective[name] == expected
def test_pav_is_not_trained(app_project):
    """The PAV backend must report ``is_trained`` as false at this point."""
    settings = {"limit": 50, "min-docs": 2, "sources": "dummy-fi"}
    backend_cls = annif.backend.get_backend("pav")
    backend = backend_cls(
        backend_id="pav",
        config_params=settings,
        project=app_project,
    )

    assert not backend.is_trained
def test_pav_train(tmpdir, app_project):
    """Training on a three-line TSV corpus produces a non-empty model file.

    The model is expected at ``pav-model-dummy-fi`` inside the project's
    data directory.
    """
    settings = {"limit": 50, "min-docs": 2, "sources": "dummy-fi"}
    backend_cls = annif.backend.get_backend("pav")
    backend = backend_cls(
        backend_id="pav",
        config_params=settings,
        project=app_project,
    )

    # Two documents share the "dummy" subject, one uses "none".
    corpus_path = tmpdir.join("document.tsv")
    corpus_path.write(
        "dummy\thttp://example.org/dummy\n"
        "another\thttp://example.org/dummy\n"
        "none\thttp://example.org/none"
    )
    corpus = annif.corpus.DocumentFile(str(corpus_path), app_project.subjects)

    backend.train(corpus)

    model_file = py.path.local(app_project.datadir).join("pav-model-dummy-fi")
    assert model_file.exists()
    assert model_file.size() > 0
def test_pav_train_cached(app_project):
    """Calling ``train("cached")`` raises ``NotSupportedException``."""
    settings = {"limit": 50, "min-docs": 2, "sources": "dummy-fi"}
    backend_cls = annif.backend.get_backend("pav")
    backend = backend_cls(
        backend_id="pav",
        config_params=settings,
        project=app_project,
    )

    with pytest.raises(NotSupportedException):
        backend.train("cached")
def test_pav_train_nodocuments(app_project, empty_corpus):
    """Training on an empty corpus raises ``NotSupportedException``.

    The exception message must mention that the backend was trained with
    no documents.
    """
    settings = {"limit": 50, "min-docs": 2, "sources": "dummy-fi"}
    backend_cls = annif.backend.get_backend("pav")
    backend = backend_cls(
        backend_id="pav",
        config_params=settings,
        project=app_project,
    )

    with pytest.raises(NotSupportedException) as excinfo:
        backend.train(empty_corpus)

    # Checked after the ``with`` block: code following the raising call
    # inside the block would never run.
    message = str(excinfo.value)
    assert "training backend pav with no documents" in message
def test_pav_initialize(app_project):
    """``initialize`` populates ``_models`` and can be called repeatedly."""
    settings = {"limit": 50, "min-docs": 2, "sources": "dummy-fi"}
    backend_cls = annif.backend.get_backend("pav")
    backend = backend_cls(
        backend_id="pav",
        config_params=settings,
        project=app_project,
    )

    # Nothing is loaded until initialize() is called.
    assert backend._models is None

    backend.initialize()
    assert backend._models is not None

    # A repeated call is expected to be a no-op; it must at least not fail.
    backend.initialize()
def test_pav_suggest(app_project):
    """Suggesting with one PAV model loaded yields the recalculated score.

    NOTE(review): appears to rely on a model trained by an earlier test in
    this module (min-docs=2) -- confirm the test ordering assumption.
    """
    settings = {"limit": 50, "min-docs": 2, "sources": "dummy-fi"}
    backend_cls = annif.backend.get_backend("pav")
    backend = backend_cls(
        backend_id="pav",
        config_params=settings,
        project=app_project,
    )

    text = """Arkeologiaa sanotaan joskus myös
muinaistutkimukseksi tai muinaistieteeksi. Se on humanistinen tiede
tai oikeammin joukko tieteitä, jotka tutkivat ihmisen menneisyyttä.
Tutkimusta tehdään analysoimalla muinaisjäännöksiä eli niitä jälkiä,
joita ihmisten toiminta on jättänyt maaperään tai vesistöjen
pohjaan."""
    batch = backend.suggest([text])
    results = batch[0]

    assert len(backend._models["dummy-fi"]) == 1
    assert len(results) > 0
    top = list(results)[0]
    assert top.score == pytest.approx(2 / 3)  # PAV recalculated score
def test_pav_train_params(tmpdir, app_project, caplog):
    """Parameters passed to ``train`` override the configured ``min-docs``.

    The backend is configured with ``min-docs`` 2, but ``train`` receives
    ``{"min-docs": 5}``; the captured log output must mention
    ``min_docs=5``.
    """
    logger = annif.logger
    # Let records propagate so that caplog can capture them, and restore
    # the previous setting afterwards so this test does not leak logger
    # state into the tests that run after it.
    previous_propagate = logger.propagate
    logger.propagate = True
    try:
        pav_type = annif.backend.get_backend("pav")
        pav = pav_type(
            backend_id="pav",
            config_params={"limit": 50, "min-docs": 2, "sources": "dummy-fi"},
            project=app_project,
        )

        tmpfile = tmpdir.join("document.tsv")
        tmpfile.write(
            "dummy\thttp://example.org/dummy\n"
            + "another\thttp://example.org/dummy\n"
            + "none\thttp://example.org/none"
        )
        document_corpus = annif.corpus.DocumentFile(
            str(tmpfile), app_project.subjects
        )

        params = {"min-docs": 5}
        with caplog.at_level(logging.DEBUG):
            pav.train(document_corpus, params)
    finally:
        logger.propagate = previous_propagate

    parameters_spec = "creating PAV model for source dummy-fi, min_docs=5"
    assert parameters_spec in caplog.text
def test_pav_suggest_after_min_docs(app_project):
    """With an empty PAV model for the source, the original score is kept.

    Asserts that the loaded ``dummy-fi`` model contains no entries and that
    the first suggestion has score 1.0.

    NOTE(review): appears to rely on an earlier test having retrained the
    model with min-docs=5 -- confirm the test ordering assumption.
    """
    pav_type = annif.backend.get_backend("pav")
    pav = pav_type(
        backend_id="pav",
        config_params={"limit": 50, "min-docs": 2, "sources": "dummy-fi"},
        project=app_project,
    )

    results = pav.suggest(
        [
            """Arkeologiaa sanotaan joskus myös
muinaistutkimukseksi tai muinaistieteeksi. Se on humanistinen tiede
tai oikeammin joukko tieteitä, jotka tutkivat ihmisen menneisyyttä.
Tutkimusta tehdään analysoimalla muinaisjäännöksiä eli niitä jälkiä,
joita ihmisten toiminta on jättänyt maaperään tai vesistöjen
pohjaan."""
        ]
    )[0]

    assert len(pav._models["dummy-fi"]) == 0
    assert len(results) > 0
    # Leftover debugging print() calls were removed; on failure pytest
    # already reports the values involved in the assertion.
    assert list(results)[0].score == 1.0  # original score from dummy-fi project
def test_pav_is_trained(app_project):
    """The PAV backend must report ``is_trained`` as true at this point.

    NOTE(review): presumably depends on a model written by an earlier
    training test in this module -- confirm.
    """
    settings = {"limit": 50, "min-docs": 2, "sources": "dummy-fi"}
    backend_cls = annif.backend.get_backend("pav")
    backend = backend_cls(
        backend_id="pav",
        config_params=settings,
        project=app_project,
    )

    assert backend.is_trained
def test_pav_modification_time(app_project):
    """The backend's modification time is less than one day in the past."""
    settings = {"limit": 50, "min-docs": 2, "sources": "dummy-fi"}
    backend_cls = annif.backend.get_backend("pav")
    backend = backend_cls(
        backend_id="pav",
        config_params=settings,
        project=app_project,
    )

    # Compared against an aware UTC "now".
    age = datetime.now(timezone.utc) - backend.modification_time
    assert age < timedelta(days=1)