-
Notifications
You must be signed in to change notification settings - Fork 0
/
cancer.py
336 lines (251 loc) · 10.2 KB
/
cancer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
import os, urllib.request, pronto
ABS_PATH = os.path.dirname(os.path.abspath(__file__))
import pandas as pd
import daproli as dp
from lxml import etree
def download_do(file_path=f"{ABS_PATH}/resources/do.obo"):
    '''
    Downloads and stores the disease ontology.

    The OBO file is fetched from the DiseaseOntology GitHub repository, decoded
    as UTF-8 and written to ``file_path`` as UTF-8, independent of the
    platform's default text encoding. An existing file is overwritten.

    Parameters
    -----------
    :param file_path: the file path at which the disease ontology should be stored.

    Examples
    -----------
    >>> download_do()
    '''
    url = "https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/src/ontology/doid.obo"

    with urllib.request.urlopen(url) as response:
        ontology_text = response.read().decode('utf-8')

    # The payload was decoded as UTF-8, so write it back as UTF-8 as well.
    # Without an explicit encoding, open() falls back to the locale default,
    # which may be unable to represent every character of the text.
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(ontology_text)
def download_mesh(file_path=f"{ABS_PATH}/resources/mesh.xml"):
    '''
    Downloads and stores the MeSH data.

    The descriptor XML (desc2022.xml) is fetched from the NLM server, decoded
    as UTF-8 and written to ``file_path`` as UTF-8, independent of the
    platform's default text encoding. An existing file is overwritten.

    Parameters
    -----------
    :param file_path: the file path at which the mesh data should be stored.

    Examples
    -----------
    >>> download_mesh()
    '''
    url = "https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/desc2022.xml"

    with urllib.request.urlopen(url) as response:
        mesh_text = response.read().decode('utf-8')

    # The payload was decoded as UTF-8, so write it back as UTF-8 as well.
    # Without an explicit encoding, open() falls back to the locale default,
    # which may be unable to represent every character of the text.
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(mesh_text)
def load_do_cancers(file_path=f"{ABS_PATH}/resources/do.obo", expand_doids=False):
    '''
    Reads the cancer terms (the subclasses of DOID:162) from the disease ontology.

    Every term contributes its name and its EXACT synonyms (duplicates removed),
    each paired with the doid of the term. DOID:162 itself is skipped.

    Parameters
    -----------
    :param file_path: the file path at which the disease ontology is stored.
    :param expand_doids: unless this is the literal ``False``, the names of a term
        are additionally paired with the doids yielded by the term's
        ``subclasses()`` and ``superclasses()``.
    :return: a tuple of two parallel lists: cancer types and associated doids.

    Examples
    -----------
    >>> cancer_types, doids = load_do_cancers()
    '''
    ontology = pronto.Ontology(file_path)
    root_id = "DOID:162"

    cancer_types, doids = [], []

    def pair_names_with(doid, labels):
        # one (name, doid) pair per label, kept in the two parallel lists
        cancer_types.extend(labels)
        doids.extend([doid] * len(labels))

    # collect the ids first, then look every term up again by id
    descendant_ids = [term.id for term in ontology[root_id].subclasses()]

    for descendant_id in descendant_ids:
        # the root term itself is not reported
        if descendant_id == root_id:
            continue

        term = ontology[descendant_id]

        # term name plus all synonyms with EXACT scope, de-duplicated
        labels = [term.name]
        labels.extend(
            synonym.description
            for synonym in term.synonyms
            if synonym.scope == "EXACT"
        )
        labels = set(labels)

        pair_names_with(term.id, labels)

        if expand_doids is False:
            continue

        # same names, paired with the doids of the subclasses ...
        for relative in term.subclasses():
            pair_names_with(relative.id, labels)

        # ... and with the doids of the superclasses
        for relative in term.superclasses():
            pair_names_with(relative.id, labels)

    return cancer_types, doids
def load_mesh_cancers(file_path=f"{ABS_PATH}/resources/mesh.xml"):
    '''
    Loads the mesh cancer descriptors.

    A descriptor record is kept when at least one of its tree numbers starts
    with "C04" (used here as the neoplasm marker).

    Parameters
    -----------
    :param file_path: the file path at which the mesh data is stored. The file
        is read as UTF-8 text.
    :return: a tuple of two parallel lists: descriptor names and associated
        mesh ids, each id prefixed with "MESH:".

    Examples
    -----------
    >>> disease_types, mesh_ids = load_mesh_cancers()
    '''
    # Decode explicitly as UTF-8 instead of relying on the locale default, so
    # the file is interpreted identically on every platform.
    with open(file_path, 'r', encoding='utf-8') as f:
        tree = etree.parse(f, etree.HTMLParser())

    disease_types, mesh_ids = [], []

    # NOTE(review): all element names are queried in lower case, presumably
    # because the HTML parser normalises tag names - confirm.
    for desc_record in tree.getroot().xpath("//descriptorrecord"):
        # check that record is a neoplasm
        tree_numbers = [entry.text for entry in desc_record.xpath("treenumberlist/treenumber")]
        if not any(number.startswith("C04") for number in tree_numbers):
            continue

        disease_types.append(desc_record.xpath("descriptorname/string")[0].text)
        mesh_ids.append("MESH:" + desc_record.xpath("descriptorui")[0].text)

    return disease_types, mesh_ids
def download_or_load_do_cancers(file_path=f"{ABS_PATH}/resources/do_cancers.obo", expand_doids=False):
    '''
    Loads the cancer types from the disease ontology, downloading the ontology
    first if nothing exists at the given path yet.

    Parameters
    -----------
    :param file_path: the file path at which the disease ontology is/should be stored.
    :param expand_doids: forwarded to ``load_do_cancers``; decides whether cancer sub
        and superclasses should be considered.
    :return: a tuple of cancer types with associated doids.

    Examples
    -----------
    >>> cancer_types, doids = download_or_load_do_cancers()
    '''
    already_stored = os.path.exists(file_path)

    if not already_stored:
        # nothing at the path yet: fetch the ontology to exactly this location
        download_do(file_path)

    return load_do_cancers(file_path=file_path, expand_doids=expand_doids)
def download_or_load_mesh_cancers(file_path=f"{ABS_PATH}/resources/mesh.xml"):
    '''
    Loads the mesh cancer descriptors, downloading the mesh data first if
    nothing exists at the given path yet.

    Parameters
    -----------
    :param file_path: the file path at which the mesh data is/should be stored.
    :return: a tuple of mesh descriptors with associated mesh ids.

    Examples
    -----------
    >>> disease_types, mesh_ids = download_or_load_mesh_cancers()
    '''
    already_stored = os.path.exists(file_path)

    if not already_stored:
        # nothing at the path yet: fetch the mesh data to exactly this location
        download_mesh(file_path)

    return load_mesh_cancers(file_path=file_path)
def load_do_flat_mapping(file_path=f"{ABS_PATH}/resources/do_cancers.obo"):
    '''
    Builds the lookup that reduces a doid to the top cancer layers of the
    disease ontology.

    The layer terms are the terms yielded by ``subclasses(distance=1)`` of
    DOID:0050687 and DOID:0050686. Every doid yielded by ``subclasses()`` of
    a layer term is mapped to the list of layer-term doids it was found under.

    Parameters
    -----------
    :param file_path: the file path at which the disease ontology is stored.
    :return: a dictionary that maps a doid to the list of layer-term doids
        it belongs to.

    Examples
    -----------
    >>> flat_mapping = load_do_flat_mapping()
    '''
    ontology = pronto.Ontology(file_path)
    flat_mapping = {}

    for root_id in ("DOID:0050687", "DOID:0050686"):
        for layer_term in ontology[root_id].subclasses(distance=1):
            layer_id = layer_term.id

            for descendant in layer_term.subclasses():
                # first sighting of a doid creates its list, later ones extend it
                flat_mapping.setdefault(descendant.id, []).append(layer_id)

    return flat_mapping
def apply_do_flat_mapping_to_ontology(cancer_types, doids, do_flat_mapping):
    '''
    Reduces the doids from the disease ontology to its first two cancer layers doids.

    Each (cancer type, doid) pair is replaced by one pair per entry that
    ``do_flat_mapping`` lists for the doid, so the result can be longer than
    the input.

    Parameters
    -----------
    :param cancer_types: cancer types from the disease ontology
    :param doids: associated doids (parallel to ``cancer_types``)
    :param do_flat_mapping: the flat mapping to reduce the doids; maps a doid
        to an iterable of reduced doids
    :return: a tuple of two parallel lists: cancer types and reduced doids.
    :raises KeyError: if a doid is not contained in ``do_flat_mapping``.

    Examples
    -----------
    >>> cancer_types, doids = download_or_load_do_cancers()
    >>> do_flat_mapping = load_do_flat_mapping()
    >>> cancer_types, doids = apply_do_flat_mapping_to_ontology(cancer_types, doids, do_flat_mapping)
    '''
    # Resolve every doid up front, so that an unmapped doid raises KeyError
    # before any output is assembled.
    reduced = [do_flat_mapping[doid] for doid in doids]

    # One output pair per reduced doid, keeping the input order.
    pairs = [
        (cancer_type, target)
        for cancer_type, targets in zip(cancer_types, reduced)
        for target in targets
    ]

    expanded_cancer_types = [cancer_type for cancer_type, _ in pairs]
    expanded_doids = [target for _, target in pairs]

    return expanded_cancer_types, expanded_doids
def apply_do_flat_mapping_to_goldstandard(cancer_types, doids, do_flat_mapping):
    '''
    Reduces the doids from the provided gold standards to the first two cancer layers
    from the disease ontology.

    Per gold standard entry, doids that are not contained in the flat mapping
    are dropped, the remaining ones are replaced by their mapped doids and the
    result is flattened into a single list. Entries whose list ends up empty
    are removed from the output.

    Parameters
    -----------
    :param cancer_types: cancer types from a gold standard
    :param doids: associated doids; one list of doids per cancer type, with
        [None] marking an entry without doid
    :param do_flat_mapping: the flat mapping to reduce the doids
    :return: a tuple of cancer types with associated lists of reduced doids.

    Examples
    -----------
    >>> cancer_types, doids, mesh_ids = load_database_cancer_goldstandard()
    >>> do_flat_mapping = load_do_flat_mapping()
    >>> cancer_types, doids = apply_do_flat_mapping_to_goldstandard(cancer_types, doids, do_flat_mapping)
    '''
    new_cancer_types, new_doids = [], []
    for cancer_type, entries in zip(cancer_types, doids):
        # [None] marks "no doid available"; such entries are not mapped
        if entries != [None]:
            # keep only doids known to the mapping, then replace each by its mapped doids
            entries = dp.filter(lambda doid: doid in do_flat_mapping, entries, ret_type=list)
            entries = dp.map(lambda doid: do_flat_mapping[doid], entries, ret_type=list)
            # the mapping yields one list per doid; merge them into one flat list
            entries = dp.flatten(entries, ret_type=list)
        # NOTE(review): this check is assumed to sit outside the guard above, so
        # [None] entries pass through unchanged while entries without any mapped
        # doid are dropped - confirm against the original indentation.
        if len(entries) > 0:
            new_cancer_types.append(cancer_type), new_doids.append(entries)
    return new_cancer_types, new_doids
def load_database_cancer_goldstandard(file_path=f"{ABS_PATH}/resources/database_cancer_goldstandard.csv", return_source=False):
    '''
    Reads the provided database gold standard from a semicolon-separated csv file.

    Each row is unpacked positionally as (cancer type, doid, source, mesh id).
    The doid and mesh id cells hold comma-separated ids and are turned into
    lists; an empty cell becomes [None].

    Parameters
    -----------
    :param file_path: the file path at which the gold standard is located.
    :param return_source: if exactly ``True``, the source of every entry is returned as well.
    :return: a tuple (cancer types, doids, mesh ids), or
        (cancer types, sources, doids, mesh ids) when ``return_source`` is ``True``.

    Examples
    -----------
    >>> cancer_types, doids, mesh_ids = load_database_cancer_goldstandard()
    '''
    def to_id_list(cell):
        # NaN is the only value that differs from itself: it marks an empty csv cell
        if cell != cell:
            return [None]
        return cell.split(',')

    table = pd.read_csv(file_path, sep=";")

    cancer_types, sources, doids, mesh_ids = [], [], [], []

    for _, row in table.iterrows():
        cancer_type, doid_cell, source, mesh_cell = row

        doid_list = to_id_list(doid_cell)
        mesh_list = to_id_list(mesh_cell)

        sources.append(source)
        cancer_types.append(cancer_type)
        doids.append(doid_list)
        mesh_ids.append(mesh_list)

    if return_source is True:
        return cancer_types, sources, doids, mesh_ids

    return cancer_types, doids, mesh_ids
def load_ncbi_cancer_goldstandard(file_path=f"{ABS_PATH}/resources/ncbi_cancer_goldstandard.csv"):
    '''
    Reads the provided ncbi gold standard from a semicolon-separated csv file.

    Rows are grouped by the "cancer" column. Per group, the unique values of
    the "doid" column are returned as found, and the unique non-empty values
    of the "mesh" column are split at commas and flattened into one list.
    A group whose only doid (or mesh) value is an empty cell yields [None]
    for that column.

    Parameters
    -----------
    :param file_path: the file path at which the gold standard is located.
    :return: a tuple of cancer types with associated doids and mesh ids.

    Examples
    -----------
    >>> cancer_types, doids, mesh_ids = load_ncbi_cancer_goldstandard()
    '''
    table = pd.read_csv(file_path, sep=";")

    cancer_types, dids, mids = [], [], []

    for cancer_name, rows in table.groupby("cancer"):
        group_doids = rows["doid"].unique().tolist()
        group_mesh = rows["mesh"].unique().tolist()

        # NaN is the only value that differs from itself: a single NaN means
        # the group has no id at all in that column
        doid_missing = len(group_doids) == 1 and group_doids[0] != group_doids[0]
        mesh_missing = len(group_mesh) == 1 and group_mesh[0] != group_mesh[0]

        cancer_types.append(cancer_name)
        dids.append([None] if doid_missing else group_doids)

        if mesh_missing:
            mids.append([None])
        else:
            # skip empty cells, split the remaining ones into single ids
            split_cells = [cell.split(',') for cell in group_mesh if cell == cell]
            mids.append(dp.flatten(split_cells))

    return cancer_types, dids, mids