Source code for ocelot.task.confgen

"""
given a molecular specification (smiles) get distinct conformers via rdkit

the molecule should already have hydrogen explicitly added, i.e. from hsmiles or addhs

workflow:
    rdkit conf gen [prune_in_gen] --> [ff minimize] --> prune_by_energy --> prune_by_rmsd

ref
http://rdkit.blogspot.com/2014/08/optimizing-diversity-picking-in-rdkit.html
https://squonk.it/docs/cells/RDKit%20Diverse%20Subset%20Picker/
https://github.com/skearnes/rdkit-utils/blob/master/rdkit_utils/conformers.py
https://new.pharmacelera.com/scripts/rdkit-conformation-generation-script/
https://www.rdkit.org/UGM/2012/Ebejer_20110926_RDKit_1stUGM.pdf
http://rdkit.org/docs_temp/Cookbook.html#parallel-conformation-generation
"""
__author__ = "qianxiang ai, parker sornberger"

import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from ocelot.schema.rdfunc import RdFunc
import warnings


[docs]class ConfGenError(Exception): pass
[docs]class ConfGen:
[docs] def __init__(self, m: Chem.Mol, mol_name='rdmol', nconf=None, mingen=50, maxgen=300, nthread=0, rngseed=-1, prune_in_gen=None, minimize=True, ff='MMFF94s', prune_by_energy=100): self.mol_name = mol_name self.m = Chem.Mol(m) # get a new mol self.m.RemoveAllConformers() self.mingen = mingen self.maxgen = maxgen self.rngseed = rngseed try: self.nconf = int(nconf) except TypeError: self.nconf = self.ideal_conf_num(self.m, self.mingen, self.maxgen) try: self.prune_in_gen = float(prune_in_gen) # this is the threshold except TypeError: warnings.warn('prune_in_gen will not be carried out') self.prune_in_gen = -1.0 self.minimize = minimize if ff not in ['UFF', 'MMFF94', 'MMFF94s']: raise ConfGenError('force field: {} not understood!'.format(ff)) self.ff = ff self.nthread = nthread self.prune_by_energy = float(prune_by_energy)
[docs] @staticmethod def is_hydrogen_explicit(m: Chem.Mol): if not any('h' == a.GetSymbol().lower() for a in m.GetAtoms()): raise ConfGenError('no explicit hydrogen in this rdkit mol!')
[docs] @classmethod def from_smiles(cls, smiles: str, mingen=50, maxgen=300): if 'h' not in smiles.lower(): raise ConfGenError('no h in the smiles!') m = Chem.MolFromSmiles(smiles) n = Chem.AddHs(m) return cls(n, mingen=mingen, maxgen=maxgen)
[docs] @staticmethod def ideal_conf_num(some_molecule: Chem.Mol, min=50, max=300): """ return the number of conformers that need to be generated to sample the conf space of a molecule """ rotatable_bonds = int(AllChem.CalcNumRotatableBonds(some_molecule)) if rotatable_bonds < 3: conf_number = min elif rotatable_bonds > 6: conf_number = max else: conf_number = (rotatable_bonds ** 3) return conf_number
[docs] @staticmethod def genconfs_ETKDG(m: Chem.Mol, nconf, nthreads, rmsprune, rngseed): # confids = AllChem.EmbedMultipleConfs(m, numConfs=nconf, params=AllChem.ETKDG(), pruneRmsThresh=rmsprune, # numThreads=nthreads, randomSeed=rngseed) params = AllChem.ETKDG() params.numThreads = nthreads params.randomSeed = rngseed params.pruneRmsThresh = rmsprune confids = AllChem.EmbedMultipleConfs(m, numConfs=nconf, params=params, ) return m, list(confids)
[docs] @staticmethod def cal_energy(m: Chem.Mol, cid: int = 0, ff='MMFF94s'): if 'MMFF' in ff: molecule_properties = AllChem.MMFFGetMoleculeProperties(m, mmffVariant=ff) force_field = AllChem.MMFFGetMoleculeForceField(m, molecule_properties, confId=cid) else: force_field = AllChem.UFFGetMoleculeForceField(m, confId=cid) return float(force_field.CalcEnergy())
[docs] def genconfs_prune_by_energy(self): # energy in kcal/mol m, confids = self.genconfs_ETKDG(self.m, self.nconf, self.nthread, self.prune_in_gen, self.rngseed) if self.minimize: if 'MMFF' in self.ff: ff_results = AllChem.MMFFOptimizeMoleculeConfs(m, mmffVariant=self.ff, numThreads=self.nthread) elif 'UFF' in self.ff: ff_results = AllChem.UFFOptimizeMoleculeConfs(m, numThreads=self.nthread) else: raise ConfGenError('force field {} not understood!'.format(self.ff)) energies = [r[1] for r in ff_results] else: energies = [self.cal_energy(m, cid, self.ff) for cid in confids] energies = np.array(energies, dtype=float) minenergy = min(energies) new = Chem.Mol(m) new.RemoveAllConformers() new_energies = [] for i in confids: if energies[i] - minenergy < self.prune_by_energy: conf = m.GetConformer(confids[i]) new.AddConformer(conf, assignId=True) new_energies.append(energies[i]) return new, new_energies
[docs] @staticmethod def prune_by_rmsd(m, energies, max_conformers=None, rmsd_threshold=0.5): """ conformer in m should be assigned consecutive ids with delta=1 :param energies: :param m: :param max_conformers: :param rmsd_threshold: :return: """ rmsd_condensed = AllChem.GetConformerRMSMatrix(m) # rmsd_condensed = ConfGen.GetBestRMSMatrix(m) if max_conformers is None: max_conformers = len(m.GetConformers()) # Returns the RMS matrix of the conformers of a molecule. # As a side-effect, the conformers will be aligned to the first conformer (i.e. the reference) # and will left in the aligned state. from scipy.spatial.distance import squareform rmsd_mat = squareform(rmsd_condensed) # print(rmsd_mat[0,2]) # print(ConfGen.GetBestRMSConf(m, 0, 2)) # print(ConfGen.GetBestRMSConf(m, 2, 0)) atom_list = [a.GetSymbol() for a in m.GetAtoms()] # RdFunc.conf2xyz(m.GetConformer(2), '2.xyz', atom_list, '2') # RdFunc.conf2xyz(m.GetConformer(0), '0.xyz', atom_list, '0') keep = [] discard = [] confids = [] for c in m.GetConformers(): cid = c.GetId() confids.append(cid) # always keep lowest-energy conformer if len(keep) == 0: keep.append(cid) continue # discard conformers after max_conformers is reached if len(keep) >= max_conformers: discard.append(cid) continue # get RMSD to selected conformers this_rmsd = rmsd_mat[cid][np.asarray(keep, dtype=int)] # discard conformers within the RMSD threshold if np.all(this_rmsd >= rmsd_threshold): keep.append(cid) else: discard.append(cid) m_after_prune = Chem.Mol(m) m_after_prune.RemoveAllConformers() n_energies = [] for i in keep: conf = m.GetConformer(i) n_energies.append(energies[i]) m_after_prune.AddConformer(conf, assignId=True) return m_after_prune, n_energies
[docs] @staticmethod def GetBestRMSConf(mol, i, j): from rdkit.Chem.rdMolAlign import GetBestRMS # Chem.AssignStereochemistry(mol,cleanIt=True,force=True,flagPossibleStereoCenters=True) mol_prob = Chem.Mol(mol) # from collections import defaultdict # equivs = defaultdict(set) # matches = m.GetSubstructMatches(m,uniquify=False) # for match in matches: # for idx1,idx2 in enumerate(match): equivs[idx1].add(idx2) # print(equivs) # from copy import deepcopy # mol_prob = deepcopy(mol) # map = [[(a.GetIdx(), a.GetIdx()) for a in mol.GetAtoms()]] # [atom.GetProp('_CIPRank') for atom in m.GetAtoms()] # map = [] # for a in mol.GetAtoms(): # symm_eqs = [a.GetIdx()] # for b in mol.GetAtoms(): # if b.GetIdx() != a.GetIdx() and a.GetProp('_CIPRank') == b.GetProp('_CIPRank'): # symm_eqs.append(b.GetIdx()) # map.append(symm_eqs) # print(map) # combs = [] # for i in range(len(map)): # possible_ids = map[i] # for pid in possible_ids: # if pid not in combs: # combs.append(pid) # break # import itertools # map = list(itertools.product(*map)) # print(len(map)) # print([a.GetSymbol() for a in mol.GetAtoms()]) # print(map) return GetBestRMS(mol_prob, mol, prbId=j, refId=i,)# maxMatches=2000000)
[docs] @staticmethod def GetBestRMSMatrix(mol): """ the problem of AllChem.GetConformerRMSMatrix(m) is that it uses AlignMolConformers this assumes atom order is the same, which may not be the case if the molecule has symmetry """ from rdkit.Chem.rdMolAlign import GetBestRMS mol_prob = Chem.Mol(mol) rmsvals = [] confIds = [conf.GetId() for conf in mol.GetConformers()] for i in range(1, len(confIds)): rmsvals.append( GetBestRMS(mol_prob, mol, prbId=i, refId=0) ) # loop over the conformations (except the reference one) cmat = [] for i in range(1, len(confIds)): cmat.append(rmsvals[i - 1]) for j in range(1, i): cmat.append(GetBestRMS(mol_prob, mol, prbId=j, refId=i)) return np.array(cmat, dtype=float)
[docs] @staticmethod def cluster_by_rmsd(m: Chem.Mol, energies, rmsd_threshold: float): """ conformer in m should be assigned consecutive ids with delta=1 :param energies: :param m: :param rmsd_threshold: :return: """ rmsd_condensed = AllChem.GetConformerRMSMatrix(m) # rmsd_condensed = ConfGen.GetBestRMSMatrix(m) from rdkit.ML.Cluster.Butina import ClusterData clusters = ClusterData(rmsd_condensed, len(m.GetConformers()), distThresh=rmsd_threshold, isDistData=True) m_after_prune = Chem.Mol(m) m_after_prune.RemoveAllConformers() n_energies = [] for c in clusters: conf = m.GetConformer(c[0]) n_energies.append(energies[c[0]]) m_after_prune.AddConformer(conf, assignId=True) return m_after_prune, n_energies
[docs] def genconfs(self, flavor="prune", write_confs=False, prune_max_conformer=20, rmsd_threshold=0.5): m_prune_by_eng, energies = self.genconfs_prune_by_energy() if flavor == "prune": m_after, e_after = self.prune_by_rmsd(m_prune_by_eng, energies=energies, max_conformers=prune_max_conformer, rmsd_threshold=rmsd_threshold) elif flavor == 'cluster': m_after, e_after = self.cluster_by_rmsd(m_prune_by_eng, energies=energies, rmsd_threshold=rmsd_threshold) else: raise ConfGenError('flavor {} not understood'.format(flavor)) if write_confs: atom_list = [a.GetSymbol() for a in m_after.GetAtoms()] for c in m_after.GetConformers(): cid = c.GetId() energy = e_after[cid] comment = '{} energy: {:.4f} kcal/mol'.format(self.ff, energy) RdFunc.conf2xyz(c, "{}_{}.xyz".format(self.mol_name, cid), atom_list, comment) return m_after, e_after
[docs] @staticmethod def show_confs_diversity(m: Chem.Mol, energies: [float]): # TODO find a way to display the diversity in/clusters of the conformers generated # m should be a rdkit molecule with conformers generated and confomer ids assigned # s.t. energies[conformer_id] = energy # possible descriptor of diversity: # energies, # shape descriptors, e.g. AllChem.ShapeTanimotoDist # rmsd spread, e.g. https://www.rdkit.org/docs/Cookbook.html#rmsd-calculation-between-n-molecules # dimension reduction method for rmsd matrix, e.g. http://sketchmap.org/ # notice we can take the advantage that all conformers share the same molecule, i.e. atom order does not matter pass