-
Notifications
You must be signed in to change notification settings - Fork 0
/
drug.py
231 lines (169 loc) · 7.29 KB
/
drug.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
import os
import shutil
# Absolute path of the directory that contains this module; the loaders and
# store helpers below build their "resources" paths from it.
ABS_PATH = os.path.dirname(os.path.abspath(__file__))
# Columns read from the EBI compound export (checked before the file is stored).
EBI_COLS = ["Name", "Synonyms", "ChEMBL ID"]
# Columns that must be present in a DrugBank export before it is stored.
DB_COLS = ["Common name", "Synonyms", "DrugBank ID"]
import numpy as np
import pandas as pd
def _store_resource_file(file_path, resource_file_name):
    '''
    Internal helper function to copy a file into the resource directory.

    The resource directory (``resources/`` next to this module) is created
    if it does not exist yet. A resource file with the same name is
    overwritten by the copy.

    Parameters
    -----------
    :param file_path: the path to the local file which should be included in the preon instance
    :param resource_file_name: the file name in the resource folder

    Examples
    -----------
    >>> _store_resource_file("/Users/username/Downloads/compounds.csv", "ebi_drugs.csv")
    '''
    resource_path = f"{ABS_PATH}/resources/"
    # exist_ok=True replaces the separate "exists?" check, which could race
    # with another process creating the directory in between.
    os.makedirs(resource_path, exist_ok=True)
    shutil.copy(file_path, os.path.join(resource_path, resource_file_name))
def store_ebi_drugs(file_path):
    '''
    Validates an EBI compound export (downloaded from
    https://www.ebi.ac.uk/chembl/g/#search_results/compounds) and copies it
    into the local resources folder as "ebi_drugs.csv".

    Parameters
    -----------
    :param file_path: the path to the local, semicolon-separated file which should be stored

    Raises
    ------
    ValueError
        If one of the columns "Name", "Synonyms" or "ChEMBL ID" is missing
        from the file header; the file is not stored in that case.

    Examples
    -----------
    >>> store_ebi_drugs(file_path="/Users/Username/Downloads/compounds.csv")
    '''
    # nrows=0 parses nothing but the header line
    available_columns = pd.read_csv(file_path, delimiter=';', nrows=0).columns
    for required_column in EBI_COLS:
        if required_column not in available_columns:
            raise ValueError(f"EBI drug names file must include: {EBI_COLS}")
    _store_resource_file(file_path, "ebi_drugs.csv")
def load_ebi_drugs(file_path=f"{ABS_PATH}/resources/ebi_drugs.csv"):
    '''
    Loads and parses EBI drug names downloaded from https://www.ebi.ac.uk/chembl/g/#search_results/compounds.

    Rows without a name or without a ChEMBL id are dropped. For every
    remaining row the name and each "|"-separated synonym are emitted as
    separate entries, all paired with the ChEMBL id of that row.

    Parameters
    -----------
    :param file_path: the file path at which the EBI compound file is located
    :return: a tuple ``(drug_names, chembl_ids)`` of two equally long lists,
        where ``chembl_ids[i]`` is the ChEMBL id belonging to ``drug_names[i]``

    Examples
    -----------
    >>> drug_names, chembl_ids = load_ebi_drugs()
    '''
    df = pd.read_csv(file_path, delimiter=';', low_memory=False, usecols=EBI_COLS)
    # keep only rows that have both a name and an id
    df = df[df.Name.notna() & df["ChEMBL ID"].notna()]

    drug_names, chembl_ids = [], []
    # Iterate the three columns in lockstep instead of DataFrame.iterrows(),
    # which builds a Series per row and is very slow on the full export.
    for name, synonyms, chembl_id in zip(df["Name"], df["Synonyms"], df["ChEMBL ID"]):
        # without a usable id there is nothing to map the names to
        if not (isinstance(chembl_id, str) and chembl_id):
            continue
        names = []
        if isinstance(name, str) and name:
            names.append(name)
        # a missing synonym cell is NaN (a float), hence the type check
        if isinstance(synonyms, str) and synonyms:
            names.extend(synonyms.split("|"))
        drug_names.extend(names)
        chembl_ids.extend([chembl_id] * len(names))
    return drug_names, chembl_ids
def store_drugbank_drugs(file_path):
    '''
    Validates a DrugBank export (downloaded from
    https://go.drugbank.com/releases/latest#open-data) and copies it into
    the local resources folder as "drugbank_drugs.csv".

    Parameters
    -----------
    :param file_path: the path to the local, comma-separated file which should be stored

    Raises
    ------
    ValueError
        If one of the columns "Common name", "Synonyms" or "DrugBank ID" is
        missing from the file header; the file is not stored in that case.

    Examples
    -----------
    >>> store_drugbank_drugs(file_path="/Users/Username/Downloads/compounds.csv")
    '''
    # nrows=0 parses nothing but the header line
    available_columns = pd.read_csv(file_path, delimiter=',', nrows=0).columns
    for required_column in DB_COLS:
        if required_column not in available_columns:
            raise ValueError(f"DrugBank drug names file must include: {DB_COLS}")
    _store_resource_file(file_path, "drugbank_drugs.csv")
def load_drugbank_drugs(file_path=f"{ABS_PATH}/resources/drugbank_drugs.csv"):
    '''
    Loads and parses DrugBank drug names downloaded from https://go.drugbank.com/releases/latest#open-data.

    For every row the common name and each " | "-separated synonym are
    emitted as separate entries, all paired with the DrugBank id of that row.

    Parameters
    -----------
    :param file_path: the file path at which the DrugBank compound file is located
    :return: a tuple ``(drug_names, db_ids)`` of two equally long lists,
        where ``db_ids[i]`` is the DrugBank id belonging to ``drug_names[i]``

    Raises
    ------
    KeyError
        If the file lacks one of the columns "Common name", "DrugBank ID" or "Synonyms".

    Examples
    -----------
    >>> drug_names, db_ids = load_drugbank_drugs()
    '''
    df = pd.read_csv(file_path, delimiter=',', low_memory=False)

    drug_names, db_ids = [], []
    for name, db_id, synonyms in zip(df["Common name"], df["DrugBank ID"], df["Synonyms"]):
        drug_names.append(name)
        db_ids.append(db_id)
        # A missing synonym cell is a NaN float. Test the type rather than
        # identity with np.nan: not every NaN is the np.nan singleton, and a
        # miss would end in an AttributeError on .split().
        if not isinstance(synonyms, str):
            continue
        for synonym in synonyms.split(" | "):
            drug_names.append(synonym)
            db_ids.append(db_id)
    return drug_names, db_ids
def load_charite_drug_goldstandard(file_path=f"{ABS_PATH}/resources/charite_drug_goldstandard.csv"):
    '''
    Loads our provided charite gold standard.

    Only rows whose ``drug_class`` column equals "no" are used. The rows are
    grouped by ``treatment``; each group yields one drug name together with
    the list of its ChEMBL ids and the list of its DrugBank ids. If the first
    DrugBank id of a group is missing, the DrugBank list of that group is ``[None]``.

    Parameters
    -----------
    :param file_path: the file path at which the gold standard is located
    :return: a tuple ``(drug_names, chembl_ids, drugbank_ids)`` of three equally
        long lists; the id entries are themselves lists (one per drug name)

    Examples
    -----------
    >>> drug_names, chembl_ids, drugbank_ids = load_charite_drug_goldstandard()
    '''
    df = pd.read_csv(file_path, delimiter=';')
    df = df[df['drug_class'] == 'no']

    drug_names, chembl_ids, drugbank_ids = [], [], []
    for drug_name, df_group in df.groupby('treatment'):
        drug_names.append(drug_name)
        chembl_ids.append(df_group['chembl_id'].to_numpy().tolist())
        group_drugbank_ids = df_group['drugbank_id'].to_numpy().tolist()
        # pd.isna rather than `is np.nan`: an identity test only matches the
        # np.nan singleton and misses NaN floats created by tolist() on a
        # float column.
        if pd.isna(group_drugbank_ids[0]):
            drugbank_ids.append([None])
        else:
            drugbank_ids.append(group_drugbank_ids)
    return drug_names, chembl_ids, drugbank_ids
def load_database_drug_goldstandard(file_path=f"{ABS_PATH}/resources/database_drug_goldstandard.csv"):
    '''
    Loads our provided database gold standard.

    Only rows whose ``drug_class`` column equals "no" are used. Every row
    yields one drug name plus a one-element list for its ChEMBL id and a
    one-element list for its DrugBank id; NaN cells are mapped to None.

    Parameters
    -----------
    :param file_path: the file path at which the gold standard is located
    :return: a tuple of drug names with associated chembl ids and drugbank ids

    Examples
    -----------
    >>> drug_names, chembl_ids, drugbank_ids = load_database_drug_goldstandard()
    '''
    def _nan_to_none(value):
        # NaN is the only value that compares unequal to itself
        if value != value:
            return None
        return value

    table = pd.read_csv(file_path, delimiter=";")
    table = table[table['drug_class'] == 'no']

    drug_names = table['treatment'].to_numpy().tolist()
    chembl_column = table['chembl_id'].apply(_nan_to_none)
    chembl_ids = [[entry] for entry in chembl_column]
    drugbank_column = table['drugbank_id'].apply(_nan_to_none)
    drugbank_ids = [[entry] for entry in drugbank_column]
    return drug_names, chembl_ids, drugbank_ids
def load_ctg_drug_goldstandard(file_path=f"{ABS_PATH}/resources/ctg_drug_goldstandard.csv"):
    '''
    Loads our provided ctg gold standard.

    Every row yields one drug name plus a one-element list for its ChEMBL id
    and a one-element list for its DrugBank id; NaN cells are mapped to None.

    Parameters
    -----------
    :param file_path: the file path at which the gold standard is located
    :return: a tuple of drug names with associated chembl ids and drugbank ids

    Examples
    -----------
    >>> drug_names, chembl_ids, drugbank_ids = load_ctg_drug_goldstandard()
    '''
    def _nan_to_none(value):
        # NaN is the only value that compares unequal to itself
        if value != value:
            return None
        return value

    table = pd.read_csv(file_path, delimiter=";")

    drug_names = table['treatment'].to_numpy().tolist()
    chembl_column = table['chembl_id'].apply(_nan_to_none)
    chembl_ids = [[entry] for entry in chembl_column]
    drugbank_column = table['drugbank_id'].apply(_nan_to_none)
    drugbank_ids = [[entry] for entry in drugbank_column]
    return drug_names, chembl_ids, drugbank_ids