Source code for ocelot.task.readcif

import warnings
from collections import OrderedDict

from pymatgen.core.composition import CompositionError
from pymatgen.core.structure import Composition

from ocelot.routines.fileop import stringkey
from ocelot.routines.disparser import DisParser
from ocelot.schema.configuration import Config
from ocelot.schema.conformer import MolConformer

"""
ReadCif implements a set of checks/functions that form the first step of reading a CIF file:
1. Is there only one type of molecule?
2. Is the molecule legit (can it be parsed into an rdmol)?
3. Where is the disorder: on the backbone, on a side group, or nowhere?
4. Get all configurations (during which the molconformers of each configuration are obtained).
5. Are any hydrogens missing?
"""


class ReadCif:
    """
    Run a set of first-pass checks on a cif string and collect the outcome in ``self.results``.

    The cif string is handed to ``DisParser``; ``read`` then asks the parser for the
    configurations, builds a ``Config`` for each of them and records molecule counts,
    missing hydrogens, disorder location and composition consistency.
    """

    def __init__(self, cifstring, source, identifier=None):
        """
        :param cifstring: content of the cif file, passed as-is to ``DisParser``
        :param source: stored on the instance as ``self.source``; not used by this class otherwise
        :param identifier: label used in file names/warnings emitted by ``read``,
            falls back to ``DisParser.identifier`` when None
        """
        self.cifstring = cifstring
        self.source = source
        self.dp = DisParser(self.cifstring)
        if identifier is None:
            self.identifier = self.dp.identifier
        else:
            self.identifier = identifier
        self.lattice = self.dp.lattice
        self.was_fitted = self.dp.was_fitted
        self.disorder_class = self.dp.classification
        self.results = OrderedDict()

    def read(self):
        """
        Populate ``self.results``.

        Keys always written (provided the parser succeeds):
        ``disordered_pstructure``, ``disordered_pmgmols``, ``config_sturcutures`` (sic),
        ``config_occupancies``, ``configurations``, ``missingh``,
        ``cif_sum_composition``, ``sum_composition_match``,
        ``cif_moiety_composition``, ``moiety_composition_match``.

        Keys written on a best-effort basis (a warning is issued if any of them fails):
        ``n_unique_molecule``, ``n_molconformers``, ``all_molconformers_legit``,
        ``disorder_location``.

        Side effects: for every configuration with a conformer missing hydrogens a file
        ``<identifier>_mhconf_<i>.cif`` is written and a warning is issued; composition
        mismatches are printed to stdout.

        :raises IndexError: if the parser yields no configuration at all
        """
        # with write_files=True the parser would write conf_x.cif itself
        dis_pstructure, dis_unwrap_str, dis_mols, config_infos = self.dp.to_configs(
            write_files=False, vanilla=True
        )
        # to_configs may change the classification, so it is read again here
        self.disorder_class = self.dp.classification
        # NOTE(review): the unwrapped structure (2nd return value) is what gets stored under
        # 'disordered_pstructure'; dis_pstructure itself is unused -- confirm this is intended
        self.results['disordered_pstructure'] = dis_unwrap_str
        self.results['disordered_pmgmols'] = dis_mols

        # each item of config_infos is indexed as (structure, occupancy)
        config_structures = []
        config_occupancies = []
        for item in config_infos:
            config_structures.append(item[0])
            config_occupancies.append(item[1])
        # the misspelled key is kept on purpose: callers may already read it
        self.results['config_sturcutures'] = config_structures
        self.results['config_occupancies'] = config_occupancies

        configs = []
        missh = []
        for i, (structure, occu) in enumerate(zip(config_structures, config_occupancies)):
            conf = Config.from_labeled_clean_pstructure(structure, occu=occu)
            config_missingh = any(conformer.is_missing_hydrogen() for conformer in conf.molconformers)
            if config_missingh:
                mh_filename = '{}_mhconf_{}.cif'.format(self.identifier, i)
                conf.pstructure.to('cif', mh_filename)
                warnings.warn('missing hydrogens in {}'.format(mh_filename))
            configs.append(conf)
            missh.append(config_missingh)
        self.results['configurations'] = configs
        self.results['missingh'] = missh

        # everything below is checked against configs[0] only
        check_config = configs[0]
        try:
            self.results['n_unique_molecule'] = len(check_config.molgraph_set())
            self.results['n_molconformers'] = len(check_config.molconformers)
            self.results['all_molconformers_legit'] = check_config.molconformers_all_legit()
            self.results['disorder_location'] = self.where_is_disorder(check_config)
        except Exception:
            # best effort: a failure leaves the remaining fields unset, but -- unlike a
            # bare ``except:`` -- KeyboardInterrupt/SystemExit are no longer swallowed
            warnings.warn('there are problems in readcif.results, some fileds will be missing!')

        try:
            comp = Composition(self.dp.cifdata['_chemical_formula_sum'])
            self._check_composition(
                comp,
                check_config.molconformers,
                'cif_sum_composition',
                'sum_composition_match',
                'moiety sum composition does not match that specified in cif file!',
            )
        except (KeyError, CompositionError):
            # the cif field is missing or cannot be parsed as a composition
            self.results['cif_sum_composition'] = None
            self.results['sum_composition_match'] = None

        try:
            comp_str = self.dp.cifdata['_chemical_formula_moiety']
            comps = [Composition(s) for s in comp_str.split(',')]
            # largest moiety first; sorted() is stable so ties keep the order of the cif field
            comps = sorted(comps, key=len, reverse=True)
            if len(comps) > 1:
                warnings.warn('more than 1 moiety from cif file! only the largest one is checked!')
            self._check_composition(
                comps[0],
                check_config.molconformers,
                'cif_moiety_composition',
                'moiety_composition_match',
                'moiety composition does not match that specified in cif file!',
            )
        except (KeyError, CompositionError):
            self.results['cif_moiety_composition'] = None
            self.results['moiety_composition_match'] = None

    def _check_composition(self, reference, molconformers, comp_key, match_key, mismatch_warning):
        """
        Store ``reference`` under ``comp_key`` and record under ``match_key`` whether every
        conformer has exactly that composition; on mismatch print both sides and warn.

        :param reference: composition taken from the cif file
        :param molconformers: conformers whose ``composition`` is compared with ``reference``
        :param comp_key: key of ``self.results`` receiving ``reference``, also used as print label
        :param match_key: key of ``self.results`` receiving the boolean outcome
        :param mismatch_warning: warning message issued when at least one conformer differs
        """
        self.results[comp_key] = reference
        if all(reference == mc.composition for mc in molconformers):
            self.results[match_key] = True
            return
        self.results[match_key] = False
        print('{}: {}'.format(comp_key, reference))
        for mc in molconformers:
            print('mc composition: {}'.format(mc.composition))
        warnings.warn(mismatch_warning)

    # Serialization was drafted but never enabled; kept for reference.
    # def as_dict(self):
    #     d = OrderedDict()
    #     d['cifstring'] = self.cifstring
    #     d['clean_pstructures'] = [s.as_dict() for s in self.config_structures]
    #     d['occus'] = self.occus
    #     d['disordered_pmgmols'] = [m.as_dict() for m in self.disordered_pmgmols]
    #     d['disordered_pstructure'] = self.disordered_pstructure.as_dict()
    #     d['disparser'] = self.dp.as_dict()
    #     d['configs'] = [c.as_dict() for c in self.configs]
    #     d['properties'] = self.properties
    #     return d
    #
    # @classmethod
    # def from_dict(cls, d):
    #     cifstring = d['cifstring']
    #     return cls(cifstring)

    @classmethod
    def from_ciffile(cls, ciffile, source, identifier=None):
        """
        Alternate constructor reading the cif string from a file.

        :param ciffile: path of the cif file
        :param source: forwarded to ``__init__``
        :param identifier: forwarded to ``__init__``
        """
        with open(ciffile, 'r') as f:
            s = f.read()
        return cls(s, source, identifier)

    @staticmethod
    def where_is_disorder(c: "Config"):
        """
        data[imol] = disorder info in conformer_properties

        For each conformer of ``c`` (keyed by its index in ``c.molconformers``) one of

        - ``'no disorder'``: every site has ``occu`` within 1e-3 of 1
        - ``'bone disorder'``: disordered sites overlap with ``mc.backbone.siteids``
        - ``'sc disorder'``: there is disorder but no overlap, or no backbone at all
        - ``'not sure'``: a site lacks the ``occu`` property

        :return: the per-conformer dict after being passed through ``stringkey``
        """
        disorderinfo = {}
        mc: MolConformer
        for imol, mc in enumerate(c.molconformers):
            try:
                disordered_siteid = [s for s in mc if abs(s.properties['occu'] - 1) > 1e-3]
            except KeyError:
                warnings.warn('not all sites have occu field, cannot decide disorder location!')
                disorderinfo[imol] = 'not sure'
                continue
            if not disordered_siteid:
                disorderinfo[imol] = 'no disorder'
            # NOTE(review): disordered_siteid holds whatever iterating ``mc`` yields (the sites
            # themselves), not their ids; if ``backbone.siteids`` are plain ids this intersection
            # is always empty and 'bone disorder' is never reported -- confirm and fix upstream
            elif mc.backbone is not None and set(mc.backbone.siteids).intersection(set(disordered_siteid)):
                disorderinfo[imol] = 'bone disorder'
            else:
                disorderinfo[imol] = 'sc disorder'
        disorderinfo = stringkey(disorderinfo)
        return disorderinfo