-
Notifications
You must be signed in to change notification settings - Fork 1
/
utils.py
89 lines (72 loc) · 2.85 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import logging
from typing import Sequence
import numpy as np
from rdkit.Chem import Descriptors, rdMolDescriptors
# Configure the root logger when this module is imported: every record is
# prefixed with a timestamp and a left-aligned, 8-character-wide level name,
# and records below INFO are dropped.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s %(levelname)-8s %(message)s',
)

# Shared logger, registered under the fixed name 'main'.
logger = logging.getLogger('main')
# Fixed vocabulary of symbols. The tuple is hardcoded (rather than derived
# from data) so that every symbol keeps the same integer index on every run.
# The order below defines those indices and must not be changed.
chars = (
    '9', '1', 'B', 'p', 'X', '3', 'i', '6', '/',
    'l', '7', 'S', '(', 'N', 'F', 'c', 'g', 'I',
    '5', '8', 'e', 't', 'Z', '#', 'r', '%', 'O',
    'o', 'n', 'C', '[', '4', 'M', '.', 'A', 'L',
    'T', 's', '+', '0', ')', '\n', 'H', '@', 'b',
    '\\', 'K', '=', 'a', 'R', 'P', '-', '2', ']',
)

# Index -> symbol lookup, in vocabulary order.
int2char = dict(enumerate(chars))
# Symbol -> index lookup (the inverse of int2char).
char2int = {symbol: index for index, symbol in enumerate(chars)}
# Registry of molecular descriptors: maps a short label to the RDKit callable
# that computes the value. Insertion order is kept as-is because callers may
# iterate over the dict.
descriptors = {
    # Functions from rdkit.Chem.Descriptors / rdMolDescriptors, by label.
    "HBA": Descriptors.NumHAcceptors,
    "HBD": Descriptors.NumHDonors,
    "MW": Descriptors.MolWt,
    "rotatable": rdMolDescriptors.CalcNumRotatableBonds,
    "TPSA": Descriptors.TPSA,
    "cLogP": Descriptors.MolLogP,
    "fsp3": rdMolDescriptors.CalcFractionCSP3,
    "stereo": rdMolDescriptors.CalcNumAtomStereoCenters,
}
def one_hot_encode(arr: Sequence[Sequence[int]], n_labels: int) -> np.ndarray:
    """Expand integer labels into a one-hot encoded float32 array.

    Args:
        arr: Integer labels, given as a NumPy array or a (nested) sequence of
            ints. Any rank is accepted. Every value is used as an index into
            the label axis, so it must lie in ``[-n_labels, n_labels)``.
        n_labels: Total number of possible labels, i.e. the length of the
            one-hot axis appended to the result.

    Returns:
        A float32 array of shape ``(*arr.shape, n_labels)`` that is 1.0 at the
        position of each label and 0.0 everywhere else.

    Raises:
        IndexError: If a label is outside the valid range or ``arr`` does not
            hold integers.
    """
    # Accept plain (nested) sequences as well as arrays.
    labels = np.asarray(arr)
    # Use .size rather than multiplying the two entries of .shape so that
    # inputs of any rank (not only 2-D) are supported.
    n_items = labels.size

    # One row per label, initially all zeros.
    one_hot = np.zeros((n_items, n_labels), dtype=np.float32)
    # Set the column selected by each label to one.
    one_hot[np.arange(n_items), labels.ravel()] = 1.
    # Restore the input's layout, with the one-hot axis appended last.
    return one_hot.reshape((*labels.shape, n_labels))
# Defining method to make mini-batches for training
def get_batches(arr: np.ndarray, batch_size: int, seq_length: int):
    """Generate (input, target) mini-batches from an encoded array.

    The array is trimmed to the largest length that fills whole batches,
    laid out as ``batch_size`` rows, and then walked left to right in
    windows of ``seq_length`` columns.

    Args:
        arr (np.array): Encoded data to cut into batches
        batch_size (int): Batch size, the number of sequences per batch
        seq_length (int): Number of encoded chars in a sequence

    Yields:
        Tuple ``(x, y)``, each with ``batch_size`` rows and ``seq_length``
        columns. ``y`` is ``x`` shifted one column to the left; its last
        column is the column that follows the window, wrapping around to the
        first column for the final window.
    """
    chars_per_batch = batch_size * seq_length
    # Number of complete batches the data can provide
    full_batches = len(arr) // chars_per_batch
    # Drop the tail that does not fill a complete batch, then lay the rest
    # out as batch_size rows
    usable = full_batches * chars_per_batch
    grid = arr[:usable].reshape((batch_size, -1))
    width = grid.shape[1]

    # Slide over the columns, one sequence-length window per step
    for start in range(0, width, seq_length):
        stop = start + seq_length
        # The features
        x = grid[:, start:stop]
        # The targets: the features shifted by one column
        y = np.zeros_like(x)
        y[:, :-1] = x[:, 1:]
        # The column right after the window; for the last window `stop`
        # equals `width`, so the modulo wraps around to column 0
        y[:, -1] = grid[:, stop % width]
        yield x, y