你可以在这里找到源代码:
https://github.com/maxhodak/keras-molecules
我一直在试用它,输入和输出结构都是 MxN 矩阵,其中 M 是 SMILES 字符串的最大长度(在本例中为 120),N 是字符集的大小。矩阵的每一行对应字符串中的一个位置 M_i,除了与该位置字符相匹配的字符集位置 N_j 处为 1 之外,这一行其余元素全为零(即 one-hot 编码)。要将输出矩阵解码为 SMILES,只需逐行把对应位置映射回字符集中的字符。
这种编码的一个问题是它占用了大量的内存。使用 keras 图像迭代器方法,您可以执行以下操作:
首先将所有 SMILES 编码为“稀疏”格式,即把数据集中的每个 SMILES 表示为其各个字符在字符集中的位置列表。
现在,您已经在所有 SMILES 上定义了一个字符集(charset),并且每个 SMILES 都是一个数字列表,表示每个字符在字符集中的位置。然后,您可以在使用 fit_generator 函数训练 keras 模型的同时,用迭代器动态地进行 one-hot 编码。
import numpy as np
import threading
import collections
class SmilesIterator(object):
    """Iterate over one-hot encoded batches of sparse-encoded SMILES strings.

    Each element of ``X`` is a sequence of integer positions into ``charset``.
    Batches are expanded to dense one-hot form on demand, so only one batch
    at a time is held in the memory-hungry dense representation.

    Every batch is returned as ``(batch_x, batch_x)`` (the same array is used
    as input and as target), the pair consumed by ``fit_generator``.

    Args:
        X: Indexable collection of sparse-encoded SMILES; ``X[j]`` is a
            sequence of integer indices into ``charset``.
        charset: Sequence of characters. Index 0 is used as the padding
            symbol, so the space character ``' '`` should come first.
        max_length: Number of rows of every one-hot matrix; shorter SMILES
            are padded up to this length.
        batch_size: Maximum number of samples per batch.
        shuffle: Whether to reshuffle the sample order at every epoch.
        seed: Optional base seed for the shuffling.
    """

    def __init__(self, X, charset, max_length, batch_size=256, shuffle=False, seed=None):
        self.X = X
        self.charset = charset
        self.max_length = max_length
        self.N = len(X)
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.batch_index = 0
        self.total_batches_seen = 0
        # Guards the shared index generator when batches are requested from
        # several threads.
        self.lock = threading.Lock()
        self.index_generator = self._flow_index(len(X), batch_size, shuffle, seed)

    def reset(self):
        """Restart batching from the beginning of an epoch."""
        self.batch_index = 0

    def __iter__(self):
        return self

    def _flow_index(self, N, batch_size, shuffle=False, seed=None):
        """Yield ``(index_array, current_index, current_batch_size)`` forever.

        ``index_array`` holds the sample indices of the batch,
        ``current_index`` is the offset of the batch inside the epoch and
        ``current_batch_size`` is the number of samples in the batch (the
        last batch of an epoch may be smaller than ``batch_size``).
        """
        self.reset()
        while True:
            if self.batch_index == 0:
                # Start of an epoch: rebuild (and optionally reshuffle) the
                # order in which samples are visited.
                index_array = np.arange(N)
                if shuffle:
                    if seed is not None:
                        # Vary the seed per epoch so successive epochs get
                        # different, yet reproducible, permutations.
                        np.random.seed(seed + self.total_batches_seen)
                    index_array = np.random.permutation(N)
            current_index = (self.batch_index * batch_size) % N
            # Strict comparison: when the batch ends exactly at N it is the
            # last batch of the epoch, so batch_index must wrap to 0 and
            # trigger a reshuffle on the next iteration.
            if N > current_index + batch_size:
                current_batch_size = batch_size
                self.batch_index += 1
            else:
                current_batch_size = N - current_index
                self.batch_index = 0
            self.total_batches_seen += 1
            yield (index_array[current_index: current_index + current_batch_size],
                   current_index, current_batch_size)

    def next(self):
        """Return the next batch as an ``(inputs, targets)`` tuple.

        Both elements are the same array of shape
        ``(current_batch_size, max_length, len(charset))``.
        """
        with self.lock:
            index_array, _, current_batch_size = next(self.index_generator)
        # One-hot encoding is not under the lock and can be done in parallel.
        # Reserve room for the batch: (batch, max_length, charset_length).
        batch_x = np.zeros((current_batch_size, self.max_length, len(self.charset)))
        for i, j in enumerate(index_array):
            batch_x[i] = self._one_hot(self.X[j])
        return (batch_x, batch_x)  # fit_generator expects input and target

    def __next__(self):
        """Python 3 iterator protocol; delegates to :meth:`next`."""
        return self.next()

    def _one_hot(self, sparse_smile):
        """One-hot encode one sparse SMILES into a ``(max_length, len(charset))`` array.

        Rows past the end of ``sparse_smile`` are filled with the padding
        symbol, i.e. column 0 of the charset (which should be ``' '``).

        Raises:
            ValueError: If ``sparse_smile`` is longer than ``max_length``.
        """
        indices = np.asarray(sparse_smile, dtype=int)
        length = len(indices)
        if length > self.max_length:
            raise ValueError(
                "sparse SMILES of length %d exceeds max_length %d"
                % (length, self.max_length))
        encoded = np.zeros((self.max_length, len(self.charset)), dtype=int)
        # Set a single 1 per row at the column given by the character index.
        encoded[np.arange(length), indices] = 1
        # Handle end of line: pad the remaining rows with charset[0].
        encoded[length:, 0] = 1
        return encoded