import copy
import itertools
import re
import warnings
from collections import defaultdict
import networkx as nx
import numpy as np
from pymatgen.core.periodic_table import Element
from rdkit import Chem
from rdkit.Chem import AllChem
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
"""
modified based on Jan H. Jensen's implementation in [xyz2mol](https://github.com/jensengroup/xyz2mol)
# TODO this is based on atom connectivity, is also good to include bond length information so bond type can be
determined such that a radical fragment can have bond type of its parent, this is why i cannot use rdkit AddHs in
conformer_addh
"""
[docs]def chiral_stereo_check(mol):
Chem.SanitizeMol(mol)
Chem.DetectBondStereochemistry(mol, -1)
Chem.AssignStereochemistry(mol, flagPossibleStereoCenters=True, force=True)
Chem.AssignAtomChiralTagsFromStructure(mol, -1)
return mol
[docs]def clean_charges(mol):
Chem.SanitizeMol(mol)
rxn_smarts = ['[#6,#7:1]1=[#6,#7:2][#6,#7:3]=[#6,#7:4][CX3-,NX3-:5][#6,#7:6]1=[#6,#7:7]>>\
[#6,#7:1]1=[#6,#7:2][#6,#7:3]=[#6,#7:4][-0,-0:5]=[#6,#7:6]1[#6-,#7-:7]',
'[#6,#7:1]1=[#6,#7:2][#6,#7:3](=[#6,#7:4])[#6,#7:5]=[#6,#7:6][CX3-,NX3-:7]1>>\
[#6,#7:1]1=[#6,#7:2][#6,#7:3]([#6-,#7-:4])=[#6,#7:5][#6,#7:6]=[-0,-0:7]1']
fragments = Chem.GetMolFrags(mol, asMols=True, sanitizeFrags=False)
for i, fragment in enumerate(fragments):
for smarts in rxn_smarts:
patt = Chem.MolFromSmarts(smarts.split(">>")[0])
while fragment.HasSubstructMatch(patt):
rxn = AllChem.ReactionFromSmarts(smarts)
ps = rxn.RunReactants((fragment,))
fragment = ps[0][0]
Chem.SanitizeMol(fragment)
# print(Chem.MolToSmiles(fragment))
if i == 0:
mol = fragment
else:
mol = Chem.CombineMols(mol, fragment)
return mol
[docs]def valence_electron(element):
"""
count valence electrons based on electronic configuration
if a subshell has > 10 electrons, this subshell is ignored
"""
configuration = element.data["Electronic structure"]
list_split = configuration.split('.')
valence_electrons = 0
for i in range(len(list_split)):
if 'sup' in list_split[i]:
electrons = re.search('<sup>(.*)</sup>', list_split[i])
if not int(electrons.group(1)) >= 10:
valence_electrons += int(electrons.group(1))
return valence_electrons
[docs]def pmgmol_to_rdmol(pmg_mol):
m = pmg_mol
charge = m.charge
numsites = len(m)
conf = Chem.Conformer(numsites)
coordmat = m.cart_coords
for i in range(numsites):
conf.SetAtomPosition(i, (coordmat[i][0], coordmat[i][1], coordmat[i][2]))
mat = np.zeros((numsites, numsites))
distmat = squareform(pdist(coordmat))
for i in range(numsites):
istring = pmg_mol[i].species_string
irad = Element(istring).atomic_radius
if irad is None:
continue
for j in range(i + 1, numsites):
jstring = pmg_mol[j].species_string
jrad = Element(jstring).atomic_radius
if jrad is None:
continue
cutoff = (irad + jrad) * 1.30
if 1e-5 < distmat[i][j] < cutoff:
mat[i][j] = 1
mat[j][i] = 1
ap = ACParser(mat, charge, m.atomic_numbers, sani=True)
try:
rdmol, smiles = ap.parse(charged_fragments=False, force_single=False, expliciths=True)
except Chem.rdchem.AtomValenceException:
warnings.warn('AP parser cannot use radical scheme, trying to use charged frag')
rdmol, smiles = ap.parse(charged_fragments=True, force_single=False, expliciths=True)
rdmol.AddConformer(conf)
return rdmol, smiles
[docs]class ACParser:
[docs] def __init__(self, ac:np.ndarray, charge, atomnumberlist, sani=True, apriori_radicals=None):
"""
:var self.valences_list: a list of possible valence assignment, valences_list[i] is one possbile way to assign jth atom
valence based on valences_list[i][j].
:var self.atomic_valence_electrons: atomic_valence_electrons[i] is the #_of_ve of ith atom
:var self.apriori_radicals: a dict to mark the atoms that can will have a lower valence in generating BO
"""
self.apriori_radicals = apriori_radicals
self.AC = ac.astype(int)
self.sani = sani
self.atomic_numbers = atomnumberlist
self.natoms = len(self.atomic_numbers)
self.charge = charge
self.valences_list, self.atomic_valence_electrons, self.AC_valence = self.get_valence_info()
[docs] def init_rdmol(self):
mol = Chem.MolFromSmarts("[#" + str(self.atomic_numbers[0]) + "]")
rwmol = Chem.RWMol(mol)
for s in self.atomic_numbers[1:]:
rwmol.AddAtom(Chem.Atom(s))
mol = rwmol.GetMol()
return mol
[docs] def get_valence_info(self):
atomic_valence = defaultdict(list)
atomic_valence_electrons = {}
for z in list(set(self.atomic_numbers)):
e = Element.from_Z(z)
atomic_valence[z] = list(set([abs(v) for v in e.common_oxidation_states]))
atomic_valence_electrons[z] = valence_electron(e)
AC_valence = list(
self.AC.sum(axis=1)) # valence based on # of neighbors, can be considered as an element of valences_list
# make a list of valences, e.g. for CO: [[4],[2,1]]
valences_list_of_lists = []
iatom = 0
for atomicNum, valence in zip(self.atomic_numbers, AC_valence):
# valence can't be smaller number of neighbourgs
if self.apriori_radicals:
try:
nradicals = self.apriori_radicals[iatom]
except KeyError:
nradicals = 0
possible_valence = [x-nradicals for x in atomic_valence[atomicNum] if x >= valence]
else:
possible_valence = [x for x in atomic_valence[atomicNum] if x >= valence]
valences_list_of_lists.append(possible_valence)
iatom += 1
# from pprint import pprint
# pprint(valences_list_of_lists)
# convert [[4],[2,1]] to [[4,2],[4,1]]
valences_list = list(itertools.product(*valences_list_of_lists))
return valences_list, atomic_valence_electrons, AC_valence
[docs] @staticmethod
def getUADU(maxValence_list, valence_list):
"""
get unsaturated atoms (UA) and degree of unsaturation (DU) between two possible assignments
:param maxValence_list:
:param valence_list:
:return:
"""
UA = []
DU = []
for i, (maxValence, valence) in enumerate(zip(maxValence_list, valence_list)):
if maxValence - valence > 0:
UA.append(i)
DU.append(maxValence - valence)
return UA, DU
[docs] @staticmethod
def get_bonds(UA, AC):
"""
get a list of unique bond tuples (i, j) between UAs
:param UA:
:param AC:
:return:
"""
bonds = []
for k, i in enumerate(UA):
for j in UA[k + 1:]:
if AC[i, j] == 1:
bonds.append(tuple(sorted([i, j])))
return bonds
[docs] @staticmethod
def get_UA_pairs(UA, AC):
"""
find the largest list of bonds in which all atom appears at most once
:param UA:
:param AC:
:return:
"""
bonds = ACParser.get_bonds(UA, AC)
if len(bonds) == 0:
return [()]
G = nx.Graph()
G.add_edges_from(bonds)
UA_pairs = [list(nx.max_weight_matching(G))]
return UA_pairs
[docs] @staticmethod
def get_BO(AC, DU_init, valences, UA_pairs):
"""
for a valence assignment, get BO
BO[i][j] is the bond order between ith and jth
AC is a BO with all single bond
the algo is to increase bond order s.t. degree of unsaturation (DU) does not change
notice DU is calculated based on the given valences
:param DU_init:
:param AC:
:param valences:
:param UA_pairs:
:return:
"""
BO = AC.copy()
DU_save = []
DU = copy.deepcopy(DU_init)
while DU_save != DU:
for i, j in UA_pairs:
BO[i, j] += 1
BO[j, i] += 1
BO_valence = list(BO.sum(axis=1))
DU_save = copy.copy(DU)
UA, DU = ACParser.getUADU(valences, BO_valence)
UA_pairs = ACParser.get_UA_pairs(UA, AC)[0]
return BO, UA_pairs
[docs] @staticmethod
def get_atomic_charge(atomic_number, atomic_valence_electrons, BO_valence):
"""
atomic charge from #_valence_electrons - bond_order
#TODO test robustness
:param atomic_number:
:param atomic_valence_electrons:
:param BO_valence:
:return:
"""
if atomic_number == 1:
charge = 1 - BO_valence
elif atomic_number == 5:
charge = 3 - BO_valence
elif atomic_number == 15 and BO_valence == 5:
charge = 0
elif atomic_number == 16 and BO_valence == 6:
charge = 0
else:
charge = atomic_valence_electrons - 8 + BO_valence
return charge
[docs] @staticmethod
def valences_not_too_large(BO, vs):
number_of_bonds_list = BO.sum(axis=1)
for valence, number_of_bonds in zip(vs, number_of_bonds_list):
if number_of_bonds > valence:
return False
return True
[docs] def BO_is_OK(self, BO, DU_from_AC, atomicNumList, charged_fragments, valences):
"""
check bond order matrix based on
:param BO:
:param DU_from_AC: based on valences arg
:param atomicNumList:
:param charged_fragments:
:param valences: valence assignment related to current BO
:return:
"""
if not self.valences_not_too_large(BO, valences):
return False
Q = 0 # total charge
q_list = []
if charged_fragments:
BO_valences = list(BO.sum(axis=1))
for i, atom in enumerate(atomicNumList):
q = ACParser.get_atomic_charge(atom, self.atomic_valence_electrons[atom], BO_valences[i])
Q += q
if atom == 6:
number_of_single_bonds_to_C = list(BO[i, :]).count(1)
if number_of_single_bonds_to_C == 2 and BO_valences[i] == 2:
Q += 1
q = 2
if number_of_single_bonds_to_C == 3 and Q + 1 < self.charge:
Q += 2
q = 1
if q != 0:
q_list.append(q)
if (BO - self.AC).sum() == sum(DU_from_AC) and self.charge == Q: # and len(q_list) <= abs(charge):
return True
else:
return False
[docs] def parse_bonds(self, charged_fragments):
"""
find the best BO
:param charged_fragments:
:return:
"""
best_BO = self.AC.copy()
for valences in self.valences_list:
UA, DU_from_AC = self.getUADU(valences, self.AC_valence)
if len(UA) == 0 and self.BO_is_OK(self.AC, DU_from_AC, self.atomic_numbers, charged_fragments, valences):
return self.AC
UA_pairs_list = self.get_UA_pairs(UA, self.AC)
for UA_pairs in UA_pairs_list:
BO, fin_UA_pairs = self.get_BO(self.AC, DU_from_AC, valences, UA_pairs)
if self.BO_is_OK(BO, DU_from_AC, self.atomic_numbers, charged_fragments, valences):
return BO
elif BO.sum() >= best_BO.sum() and self.valences_not_too_large(BO, valences):
best_BO = BO.copy()
return best_BO
[docs] def addBO2mol(self, rdmol, BO_matrix, charged_fragments, force_single=False):
# based on code written by Paolo Toscani
l = len(BO_matrix)
l2 = len(self.atomic_numbers)
BO_valences = list(BO_matrix.sum(axis=1))
if l != l2:
raise RuntimeError('sizes of adjMat ({0:d}) and atomicNumList '
'{1:d} differ'.format(l, l2))
rwMol = Chem.RWMol(rdmol)
bondTypeDict = {
1: Chem.BondType.SINGLE,
2: Chem.BondType.DOUBLE,
3: Chem.BondType.TRIPLE
}
for i in range(l):
for j in range(i + 1, l):
bo = int(round(BO_matrix[i, j]))
if bo == 0:
continue
if force_single:
bt = Chem.BondType.SINGLE
else:
bt = bondTypeDict.get(bo, Chem.BondType.SINGLE)
rwMol.AddBond(i, j, bt)
mol = rwMol.GetMol()
if charged_fragments:
mol = self.set_atomic_charges(mol, BO_valences, BO_matrix)
else:
mol = self.set_atomic_radicals(mol, BO_valences)
return mol
[docs] def set_atomic_charges(self, mol, BO_valences, BO_matrix):
q = 0
for i, atom in enumerate(self.atomic_numbers):
a = mol.GetAtomWithIdx(i)
charge = self.get_atomic_charge(atom, self.atomic_valence_electrons[atom], BO_valences[i])
q += charge
if atom == 6:
number_of_single_bonds_to_C = list(BO_matrix[i, :]).count(1)
if number_of_single_bonds_to_C == 2 and BO_valences[i] == 2:
q += 1
charge = 0
if number_of_single_bonds_to_C == 3 and q + 1 < self.charge:
q += 2
charge = 1
if abs(charge) > 0:
a.SetFormalCharge(int(charge))
if self.sani:
mol = clean_charges(mol)
return mol
[docs] def set_atomic_radicals(self, mol, BO_valences):
# The number of radical electrons = absolute atomic charge
for i, atom in enumerate(self.atomic_numbers):
a = mol.GetAtomWithIdx(i)
charge = self.get_atomic_charge(atom, self.atomic_valence_electrons[atom], BO_valences[i])
if abs(charge) > 0:
a.SetNumRadicalElectrons(abs(int(charge)))
return mol
[docs] def parse(self, charged_fragments=False, force_single=False, expliciths=True):
BO = self.parse_bonds(charged_fragments)
mol = self.init_rdmol()
mol = self.addBO2mol(mol, BO, charged_fragments, force_single)
if self.sani:
mol = chiral_stereo_check(mol)
smiles = Chem.MolToSmiles(mol, allHsExplicit=expliciths, isomericSmiles=True)
m = Chem.MolFromSmiles(smiles, sanitize=self.sani)
smiles = Chem.MolToSmiles(m, isomericSmiles=True, allHsExplicit=expliciths)
return mol, smiles # m is just used to get canonical smiles