Source code for pdb2sql.StructureSimilarity

import warnings
import numpy as np
from .pdb2sqlcore import pdb2sql
from .interface import interface
from .superpose import get_trans_vect, get_rotation_matrix, superpose_selection

from . import transform
import os
import pickle


[docs]class StructureSimilarity(object):

    def __init__(self, decoy, ref, verbose=False, enforce_residue_matching=True):
        """Compute structure similarity between two structures.

        This class allows to compute the i-RMSD, L-RMSD, Fnat and DockQ
        score of a given conformation.
        This can be a replacement for ProFIT.
        Note that the calculation of the zones are done by the class
        itself and does not require any extra input.

        Note:
            1. The decoy and pdb must have consistent residue numbering.
            2. The lzone files here are different with those from ProFit.
                lzone: here need only zone residues for fitting, no need
                of residue for rms calculation. RMS residues are
                automatically assumed as the other chain,
                Be careful with ProFit zone files that contain RZONE/RATOMS.
            3. Missing residues/atoms will be ignored.

        Args:
            decoy : pdb file or sql database of the decoy conformation
            ref : pdb file or sql database of the reference conformation
            verbose (bool) : verbosity option

        Examples:
            >>> from pdb2sql import StructureSimilarity
            >>> decoy = '1AK4_5w.pdb'
            >>> ref = '1AK4.pdb'
            >>> sim = StructureSimilarity(decoy,ref)
            >>> irmsd_fast = sim.compute_irmsd_fast(method='svd',
            ...     izone='1AK4.izone')
            >>> irmsd = sim.compute_irmsd_pdb2sql(method='svd',
            ...     izone='1AK4.izone')
            >>> lrmsd_fast = sim.compute_lrmsd_fast(method='svd',
            ...     lzone='1AK4.lzone',check=True)
            >>> lrmsd = sim.compute_lrmsd_pdb2sql(exportpath=None,
            ...     method='svd')
            >>> Fnat = sim.compute_fnat_pdb2sql()
            >>> Fnat_fast = sim.compute_fnat_fast(
            ...     ref_pairs='1AK4.ref_pairs')
            >>> dockQ = sim.compute_DockQScore(Fnat_fast,
            ...     lrmsd_fast,irmsd_fast)
        """

        self.decoy = decoy
        self.ref = ref
        self.verbose = verbose
        self.origin = [0., 0., 0.]
        self.enforce_residue_matching = enforce_residue_matching

    def __repr__(self):
        return f'{self.__module__}.{self.__class__.__name__}({self.decoy}, {self.ref}, {self.verbose})'

    def check_residues(self, **kwargs):
        """Check if the residue numbering matches."""

        sql_ref = pdb2sql(self.ref)
        sql_dec = pdb2sql(self.decoy)

        res_ref = sql_ref.get_residues(**kwargs)
        res_dec = sql_dec.get_residues(**kwargs)

        if res_ref != res_dec:
            print('Residues are different in the reference and decoy.')
            only_ref = set(res_ref).difference(set(res_dec))
            only_dec = set(res_dec).difference(set(res_ref))
            if only_ref:
                print('Residues found only in %s and not in %s:\n%s' %
                    (self.ref, self.decoy, only_ref))
            if only_dec:
                print('Residues found only in %s and not in %s:\n%s' %
                    (self.decoy, self.ref, only_dec))

            if self.enforce_residue_matching == True:
                raise ValueError(
                    'Residue numbering not identical in ref and decoy.\n Set enforce_residue_matching=False to bypass this error.')
            else:
                warnings.warn('Residue numbering not identical in ref and decoy.')

            return False
        
        for r_dec, r_ref in zip(res_dec, res_ref):
 
            at_ref = sql_ref.get('name', chainID=r_ref[0], resName=r_ref[1], resSeq=r_ref[2], **kwargs)
            at_dec = sql_dec.get('name', chainID=r_dec[0], resName=r_dec[1], resSeq=r_dec[2], **kwargs)
            
            if at_ref != at_dec:
                if self.enforce_residue_matching == True:
                    raise ValueError(
                        'Atoms not identical in ref and decoy.\n Set enforce_residue_matching=False to bypass this error.')
                else:
                    warnings.warn('Atoms not identical in ref and decoy.')

                return False
     

        return True
    ##########################################################################
    #
    #   FAST ROUTINE TO COMPUTE THE L-RMSD
    #   Require the precalculation of the lzone
    #   A dedicated routine is implemented to comoute the lzone
    #   if lzone is not given in argument the routine will compute them automatically
    #
    ##########################################################################

    # compute the L-RMSD
[docs]    def compute_lrmsd_fast(self, lzone=None, method='svd', check=True, name=['C', 'CA', 'N', 'O']):
        """Fast routine to compute the L-RMSD.

        L-RMSD is computed by aligning the longest chain of the decoy to
        the one of the reference and computing the RMSD of the shortest
        chain between decoy and reference. By default, both fitting and
         rms calculation use only backbone atoms. See reference:

        DockQ: A Quality Measure for Protein-Protein Docking Models
        https://doi.org/10.1371/journal.pone.0161879

        Args:
            lzone (None, optional): name of the file containing the zone
                definition. If None the file will be calculated first.
            method (str, optional): Method to align the fragments,
                'svd' or 'quaternion'.
            check (bool, optional): Check if the sequences are aligned
                and fix it if not. Defaults to True.
            name (list, optional): atom name to include in the zone.
                                   Defaults to ['C', 'CA', 'N', 'O']

        Returns:
            float: L-RMSD value of the conformation

        See also:
            :meth:`compute_lrmsd_pdb2sql`
        """

        # create/read the lzone file
        if lzone is None:
            resData = self.compute_lzone(save_file=False)
        elif not os.path.isfile(lzone):
            resData = self.compute_lzone(
                save_file=True, filename=lzone)
        else:
            resData = self.read_zone(lzone)

        if check or self.enforce_residue_matching:

            # Note:
            # 1. get_data_zone_backbone returns in_zone and not_in_zone
            #  here the in_zone defines the zone for fitting,
            #  and not_in_zone defines the zone for rms calculation.

            self.check_residues(name=name)

            data_decoy_long, data_decoy_short = self.get_data_zone_backbone(
                self.decoy, resData, return_not_in_zone=True, name=name)

            data_ref_long, data_ref_short = self.get_data_zone_backbone(
                self.ref, resData, return_not_in_zone=True, name=name)

            atom_long = data_ref_long.intersection(data_decoy_long)
            xyz_decoy_long = self._get_xyz(self.decoy, atom_long)
            xyz_ref_long = self._get_xyz(self.ref, atom_long)

            atom_short = data_ref_short.intersection(data_decoy_short)
            xyz_decoy_short = self._get_xyz(self.decoy, atom_short)
            xyz_ref_short = self._get_xyz(self.ref, atom_short)

        # extract the xyz
        else:

            xyz_decoy_long, xyz_decoy_short = self.get_xyz_zone_backbone(
                self.decoy, resData, return_not_in_zone=True, name=name)

            xyz_ref_long, xyz_ref_short = self.get_xyz_zone_backbone(
                self.ref, resData, return_not_in_zone=True, name=name)

        # print(xyz_decoy_long)

        xyz_decoy_short = superpose_selection(
            xyz_decoy_short, xyz_decoy_long, xyz_ref_long, method)

        # compute the RMSD
        return self.get_rmsd(xyz_decoy_short, xyz_ref_short)

    # compute the lzone file
[docs]    def compute_lzone(self, save_file=True, filename=None):
        """Compute the zone for L-RMSD calculation.

        Note:
            It only provides the zone of long chain(s) which is used for
            fitting. The zone used for calculating RMSD is defined in
            the function `compute_lrmsd_fast`.

        Args:
            save_file (bool, optional): save the zone file
            filename (str, optional): name of the file

        Returns:
            dict: definition of the zone.
        """
        sql_ref = pdb2sql(self.ref)
        chains = list(sql_ref.get_chains())
        if len(chains) != 2:
            raise ValueError(
                'exactly two chains are needed for lrmsd calculation but we found %d' % len(chains), chains)

        nA = len(sql_ref.get('x,y,z', chainID=chains[0]))
        nB = len(sql_ref.get('x,y,z', chainID=chains[1]))

        # detect which chain is the longest
        long_chain = chains[0]
        if nA < nB:
            long_chain = chains[1]

        # extract data about the residue
        data_test = [
            tuple(data) for data in sql_ref.get(
                'chainID,resSeq',
                chainID=long_chain)]

        data_test = sorted(set(data_test))

        # close the sql
        sql_ref._close()

        if save_file:
            if filename is None:
                f = open(self.ref.split('.')[0] + '.lzone', 'w')
            else:
                f = open(filename, 'w')
            for res in data_test:
                chain = res[0]
                num = res[1]
                f.write('zone %s%d-%s%d\n' % (chain, num, chain, num))
            f.close()

        resData = {}
        for res in data_test:
            chain = res[0]
            num = res[1]

            if chain not in resData.keys():
                resData[chain] = []
            resData[chain].append(num)

        return resData

    ##########################################################################
    #
    #   FAST ROUTINE TO COMPUTE THE I-RMSD
    #   Require the precalculation of the izone
    #   A dedicated routine is implemented to comoute the izone
    #   if izone is not given in argument the routine will compute them automatcally
    #
    ##########################################################################

[docs]    def compute_irmsd_fast(self, izone=None, method='svd',
                           cutoff=10, check=True):
        """Fast method to compute the i-rmsd.

        i-RMSD is computed by selecting the backbone atoms of reference
        interface that is defined as any pair of heavy atoms from two
        chains within 10Å of each other.
        Align these backbone atoms as best as possible with their
        coutner part in the decoy and compute the RMSD. See reference:

        DockQ: A Quality Measure for Protein-Protein Docking Models
        https://doi.org/10.1371/journal.pone.0161879

        Args:
            izone (None, optional): file name of the zone.
                if None the zones will be calculated automatically.
            method (str, optional): Method to align the fragments,
                'svd' or 'quaternion'.
            cutoff (float, optional): cutoff for the contact atoms
            check (bool, optional): Check if the sequences are aligned
                and fix it if not. Should be True.

        Returns:
            float: i-RMSD value of the conformation

        See also:
            :meth:`compute_irmsd_pdb2sql`
        """

        # read the izone file
        if izone is None:
            resData = self.compute_izone(cutoff, save_file=False)
        elif not os.path.isfile(izone):
            resData = self.compute_izone(
                cutoff, save_file=True, filename=izone)
        else:
            resData = self.read_zone(izone)

        if check or self.enforce_residue_matching:

            self.check_residues()

            data_decoy = self.get_data_zone_backbone(
                self.decoy, resData, return_not_in_zone=False)
            data_ref = self.get_data_zone_backbone(
                self.ref, resData, return_not_in_zone=False)

            atom_common = data_ref.intersection(data_decoy)
            xyz_contact_decoy = self._get_xyz(self.decoy, atom_common)
            xyz_contact_ref = self._get_xyz(self.ref, atom_common)

        # extract the xyz
        else:
            xyz_contact_decoy = self.get_xyz_zone_backbone(
                self.decoy, resData)
            xyz_contact_ref = self.get_xyz_zone_backbone(
                self.ref, resData)

        # superpose the fragments
        xyz_contact_decoy = superpose_selection(xyz_contact_decoy,
                                                xyz_contact_decoy,
                                                xyz_contact_ref, method)

        # return the RMSD
        return self.get_rmsd(xyz_contact_decoy, xyz_contact_ref)

[docs]    def compute_izone(self, cutoff=10.0, save_file=True, filename=None):
        """Compute the zones for i-rmsd calculationss.

        Args:
            cutoff (float, optional): cutoff for the contact atoms
            save_file (bool, optional): svae file containing the zone
            filename (str, optional): filename

        Returns:
            dict: i-zone definition
        """

        sql_ref = interface(self.ref)
        chains = list(sql_ref.get_chains())
        if len(chains) != 2:
            raise ValueError(
                'exactly two chains are needed for irmsd calculation but we found %d' % len(chains), chains)

        contact_ref = sql_ref.get_contact_atoms(
            cutoff=cutoff, extend_to_residue=True, chain1=chains[0], chain2=chains[1])

        index_contact_ref = []
        for _, v in contact_ref.items():
            index_contact_ref += v

        # get the xyz and atom identifier of the decoy contact atoms
        data_test = [tuple(data) for data in sql_ref.get(
            'chainID,resSeq',
            rowID=index_contact_ref,
            name=sql_ref.backbone_atoms)]

        data_test = sorted(set(data_test))

        # close the sql
        sql_ref._close()

        if save_file:

            if filename is None:
                f = open(self.ref.split('.')[0] + '.izone', 'w')
            else:
                f = open(filename, 'w')

            for res in data_test:
                chain = res[0]
                num = res[1]
                f.write('zone %s%d-%s%d\n' % (chain, num, chain, num))
            f.close()

        resData = {}
        for res in data_test:
            chain = res[0]
            num = res[1]

            if chain not in resData.keys():
                resData[chain] = []
            resData[chain].append(num)
        return resData

    ##########################################################################
    #
    #   ROUTINE TO COMPUTE THE fnat QUICKLY
    #
    ##########################################################################

[docs]    def compute_fnat_fast(self, cutoff=5):
        """Fast method to cmpute the FNAT of the conformation.

        Fnat is the fraction of reference interface contacts preserved
        in the interface of decoy. The interface is defined as any pair
        of heavy atoms from two chains within 5Å of each other.

        Args:
            cutoff (int, optional): cutoff for the contact atoms

        Returns:
            float: FNAT value

        Raises:
            ValueError: if the decoy file is not found

        See also:
            :meth:`compute_fnat_pdb2sql`
        """
        # compute ref residue pairs
        residue_pairs_ref = self.compute_residue_pairs_ref(
            cutoff, save_file=False)

        # create a dict of the decoy data
        data_decoy = pdb2sql.read_pdb(self.decoy)

        # read the decoy data
        residue_xyz = {}
        residue_name = {}
        for line in data_decoy:

            if line.startswith('ATOM'):

                chainID = line[21]
                if chainID == ' ':
                    chainID = line[72]

                resSeq = int(line[22:26])
                resName = line[17:20].strip()
                name = line[12:16].strip()

                x = float(line[30:38])
                y = float(line[38:46])
                z = float(line[46:54])

                key = (chainID, resSeq, resName)

                if key not in residue_xyz.keys():
                    residue_xyz[key] = []
                    residue_name[key] = []

                # if name in ['CA','C','N','O']
                # exclude Hydrogen
                if name[0] != 'H':
                    residue_xyz[key].append([x, y, z])
                    residue_name[key].append(name)

        # loop over the residue pairs of the ref
        nCommon, nTotal = 0, 0
        for resA, resB_list in residue_pairs_ref.items():
            if resA in residue_xyz.keys():
                xyzA = residue_xyz[resA]
                for resB in resB_list:
                    if resB in residue_xyz.keys():
                        xyzB = residue_xyz[resB]
                        dist_min = np.min(np.array(
                            [np.sqrt(np.sum((np.array(p1) - np.array(p2))**2))
                             for p1 in xyzA for p2 in xyzB]))
                        if dist_min <= cutoff:
                            nCommon += 1
                    nTotal += 1
            else:
                msg = f'\t FNAT: not find residue: {resA}'
                warnings.warn(msg)

        # normalize
        return round(nCommon / nTotal, 6)

    # compute the residue pair of the reference
[docs]    def compute_residue_pairs_ref(
            self,
            cutoff=5.0,
            save_file=True,
            filename=None):
        """Compute the residue pair on the reference conformation.

        Args:
            cutoff (float, optional): cutoff for the contact atoms
            save_file (bool, optional): save the file containing the
                residue pairs
            filename (None, optional): filename

        Returns:
            dict: defintition of the residue pairs
        """

        sql_ref = interface(self.ref)
        chains = list(sql_ref.get_chains())
        if len(chains) != 2:
            raise ValueError(
                'exactly two chains are needed for fnat calculation but we found %d' % len(chains), chains)
        residue_pairs_ref = sql_ref.get_contact_residues(
            cutoff=cutoff, return_contact_pairs=True, excludeH=True,
            chain1=chains[0], chain2=chains[1])
        sql_ref._close()

        if save_file:
            if filename is None:
                f = open(
                    self.ref.split('.')[0] +
                    'residue_contact_pairs.pckl',
                    'wb')
            else:
                f = open(filename, 'wb')

            # save as pickle
            pickle.dump(residue_pairs_ref, f)
            f.close()

        return residue_pairs_ref

    ##########################################################################
    #
    #   ROUTINE TO COMPUTE THE L-RMSD USING PDB2SQL
    #   DOES NOT REQUIRE THE PRECALCULATION OF ANYTHONG
    #   CAN OUTPUT THE SUPERIMPOSED STRUCTURES
    #   MUCH SLOWER THAN THE FAST ROUTINES BUT EASIER TO USE
    #
    ##########################################################################

    # compute the L-RMSD
[docs]    def compute_lrmsd_pdb2sql(self, exportpath=None, method='svd', **kwargs):
        """Slow routine to compute the L-RMSD.

        L-RMSD is computed by aligning the longest chain of the decoy to
        the one of the reference and computing the RMSD of the shortest
        chain between decoy and reference. Both fitting and rms calculation
        use only backbone atoms. See reference:

        DockQ: A Quality Measure for Protein-Protein Docking Models
        https://doi.org/10.1371/journal.pone.0161879

        Args:
            exportpath (str, optional): file name where the aligned pdbs
                are exported.
            method (str, optional): Method to align the fragments,
            'svd' or 'quaternion'.

        Kwargs: selection keywords used in the pdb2sql.get() method :
                'rowID', 'serial', 'name', 'altLoc',
                'resName', 'resSeq', 'iCode',
                'x', 'y', 'z', 'occ', 'temp', 'element', 'model'


        Returns:
            float: L-RMSD value of the conformation

        See also:
            :meth:`compute_lrmsd_fast`
        """

        backbone = ['CA', 'C', 'N', 'O']
        if 'name' not in kwargs:
            kwargs['name'] = backbone

        if 'chainID' in kwargs:
            raise ValueError(
                'do not specify chainID in compute_lrmsd_pdb2sql')

        # create the sql
        sql_decoy = pdb2sql(self.decoy, sqlfile='decoy.db')
        sql_ref = pdb2sql(self.ref, sqlfile='ref.db')


        # get the chains
        chains_decoy = sql_decoy.get_chains()
        chains_ref = sql_ref.get_chains()

        if chains_decoy != chains_ref:
            raise ValueError(
                'Chains are different in decoy and reference structure')

        chain1 = chains_decoy[0]
        chain2 = chains_decoy[1]

        # extract the pos of chains A
        xyz_decoy_A = np.array(
            sql_decoy.get('x,y,z', chainID=chain1, **kwargs))
        xyz_ref_A = np.array(sql_ref.get(
            'x,y,z', chainID=chain1, **kwargs))

        # extract the pos of chains B
        xyz_decoy_B = np.array(
            sql_decoy.get('x,y,z', chainID=chain2, **kwargs))
        xyz_ref_B = np.array(sql_ref.get(
            'x,y,z', chainID=chain2, **kwargs))

        # check the lengthes
        if self.check_residues(**kwargs) is False:
            xyz_decoy_A, xyz_ref_A = self.get_identical_atoms(
                sql_decoy, sql_ref, chain1, **kwargs)

            xyz_decoy_B, xyz_ref_B = self.get_identical_atoms(
                sql_decoy, sql_ref, chain2, **kwargs)

        # detect which chain is the longest
        nA, nB = len(xyz_decoy_A), len(xyz_decoy_B)
        if nA > nB:
            xyz_decoy_long = xyz_decoy_A
            xyz_ref_long = xyz_ref_A

            xyz_decoy_short = xyz_decoy_B
            xyz_ref_short = xyz_ref_B

        else:
            xyz_decoy_long = xyz_decoy_B
            xyz_ref_long = xyz_ref_B

            xyz_decoy_short = xyz_decoy_A
            xyz_ref_short = xyz_ref_A

        # get the translation so that both A chains are centered
        tr_decoy = get_trans_vect(xyz_decoy_long)
        tr_ref = get_trans_vect(xyz_ref_long)

        # translate everything for 1
        xyz_decoy_short += tr_decoy
        xyz_decoy_long += tr_decoy

        # translate everuthing for 2
        xyz_ref_short += tr_ref
        xyz_ref_long += tr_ref

        # get the ideal rotation matrix
        # to superimpose the A chains
        U = get_rotation_matrix(
            xyz_decoy_long, xyz_ref_long, method=method)

        # rotate the entire fragment
        xyz_decoy_short = transform.rotate(
            xyz_decoy_short, U, center=self.origin)

        # compute the RMSD
        lrmsd = self.get_rmsd(xyz_decoy_short, xyz_ref_short)

        # export the pdb for verifiactions
        if exportpath is not None:

            # extract the pos of the dimer
            xyz_decoy = np.array(sql_decoy.get('x,y,z'))
            xyz_ref = np.array(sql_ref.get('x,y,z'))

            # translate
            xyz_ref += tr_ref
            xyz_decoy += tr_decoy

            # rotate decoy
            xyz_decoy = transform.rotate(
                xyz_decoy, U, center=self.origin)

            # update the sql database
            sql_decoy.update_column('x', xyz_decoy[:, 0])
            sql_decoy.update_column('y', xyz_decoy[:, 1])
            sql_decoy.update_column('z', xyz_decoy[:, 2])

            sql_ref.update_column('x', xyz_ref[:, 0])
            sql_ref.update_column('y', xyz_ref[:, 1])
            sql_ref.update_column('z', xyz_ref[:, 2])

            # export
            sql_decoy.exportpdb(exportpath + '/lrmsd_decoy.pdb')
            sql_ref.exportpdb(exportpath + '/lrmsd_ref.pdb')

        # close the db
        sql_decoy._close()
        sql_ref._close()

        return lrmsd

    # RETURN THE ATOMS THAT ARE SHARED BY THE TWO DB
    # FOR A GIVEN CHAINID
    @staticmethod
    def get_identical_atoms(db1, db2, chain, **kwargs):
        """Return that atoms shared by both databse for a specific chain.

        Args:
            db1 (TYPE): pdb2sql database of the first conformation
            db2 (TYPE): pdb2sql database of the 2nd conformation
            chain (str): chain name

        Kwargs: selection keywords used in the pdb2sql.get() method :
                'rowID', 'serial', 'name', 'altLoc',
                'resName', 'chainID', 'resSeq', 'iCode',
                'x', 'y', 'z', 'occ', 'temp', 'element', 'model'

        Returns:
            list, list: list of xyz for both database
        """

        # get data
        data1 = db1.get('chainID,resSeq,name',
                        chainID=chain, **kwargs)
        data2 = db2.get('chainID,resSeq,name',
                        chainID=chain, **kwargs)

        # tuplify
        data1 = [tuple(d1) for d1 in data1]
        data2 = [tuple(d2) for d2 in data2]

        # get the intersection
        shared_data = list(set(data1).intersection(data2))
        
        # get the xyz
        xyz1, xyz2 = [], []
        for data in shared_data:
            query = 'SELECT x,y,z from ATOM WHERE chainID=? AND resSeq=? and name=?'
            xyz1.append(list(list(db1.c.execute(query, data))[0]))
            xyz2.append(list(list(db2.c.execute(query, data))[0]))

        return xyz1, xyz2

    ##########################################################################
    #
    #   ROUTINE TO COMPUTE THE I-RMSD USING PDB2SQL
    #   DOES NOT REQUIRE THE PRECALCULATION OF ANYTHiNG
    #   BUT CAN READ AN IZONE FILE AS WELL
    #   CAN OUTPUT THE SUPERIMPOSED STRUCTURES
    #   MUCH SLOWER THAN THE FAST ROUTINES BUT EASIER TO USE
    #
    ##########################################################################

[docs]    def compute_irmsd_pdb2sql(
            self,
            cutoff=10,
            method='svd',
            izone=None,
            exportpath=None):
        """Slow method to compute the i-rmsd.

        i-RMSD is computed by selecting the backbone atoms of reference
        interface that is defined as any pair of heavy atoms from two
        chains within 10Å of each other.
        Align these backbone atoms as best as possible with their
        coutner part in the decoy and compute the RMSD. See reference:

        DockQ: A Quality Measure for Protein-Protein Docking Models
        https://doi.org/10.1371/journal.pone.0161879

        Args:
            izone (None, optional): file name of the zone.
                if None the zones will be calculated first.
            method (str, optional): Method to align the fragments,
                'svd' or 'quaternion'.
            cutoff (float, optional): cutoff for the contact atoms
            exportpath (str, optional): file name where the aligned pdbs
                are exported.

        Returns:
            float: i-RMSD value of the conformation

        See also:
            :meth:`compute_irmsd_fast`
        """

        # create thes sql
        sql_decoy = interface(self.decoy)
        sql_ref = interface(self.ref)

        # get the chains
        chains_decoy = sql_decoy.get_chains()
        chains_ref = sql_ref.get_chains()

        if chains_decoy != chains_ref:
            raise ValueError(
                'Chains are different in decoy and reference structure')

        # get the contact atoms
        if izone is None:

            contact_ref = sql_ref.get_contact_atoms(
                cutoff=cutoff,
                extend_to_residue=True,
                chain1=chains_ref[0],
                chain2=chains_ref[1])

            index_contact_ref = []
            for v in contact_ref.values():
                index_contact_ref += v
            index_contact_ref = sql_ref.get(
                'rowID', rowID=index_contact_ref, name=sql_ref.backbone_atoms)
        else:
            index_contact_ref = self.get_izone_rowID(
                sql_ref, izone, return_only_backbone_atoms=True)

        # get the xyz and atom identifier of the decoy contact atoms
        xyz_contact_ref = sql_ref.get(
            'x,y,z', rowID=index_contact_ref)
        data_contact_ref = sql_ref.get(
            'chainID,resSeq,resName,name',
            rowID=index_contact_ref)

        # get the xyz and atom indeitifier of the reference
        xyz_decoy = sql_decoy.get('x,y,z')
        data_decoy = sql_decoy.get('chainID,resSeq,resName,name')

        # loop through the ref label
        # check if the atom is in the decoy
        # if yes -> add xyz to xyz_contact_decoy
        # if no  -> remove the corresponding to xyz_contact_ref
        xyz_contact_decoy = []
        index_contact_decoy = []
        clean_ref = False
        for iat, atom in enumerate(data_contact_ref):

            try:
                index = data_decoy.index(atom)
                index_contact_decoy.append(index)
                xyz_contact_decoy.append(xyz_decoy[index])
            except Exception:
                xyz_contact_ref[iat] = None
                index_contact_ref[iat] = None
                clean_ref = True

        # clean the xyz
        if clean_ref:
            xyz_contact_ref = [
                xyz for xyz in xyz_contact_ref if xyz is not None]
            index_contact_ref = [
                ind for ind in index_contact_ref if ind is not None]

        # check that we still have atoms in both chains
        chain_decoy = list(
            set(sql_decoy.get('chainID', rowID=index_contact_decoy)))
        chain_ref = list(
            set(sql_ref.get('chainID', rowID=index_contact_ref)))

        if len(chain_decoy) < 1 or len(chain_ref) < 1:
            raise ValueError(
                'Error in i-rmsd: only one chain represented in one chain')

        # get the translation so that both A chains are centered
        tr_decoy = get_trans_vect(xyz_contact_decoy)
        tr_ref = get_trans_vect(xyz_contact_ref)

        # translate everything
        xyz_contact_decoy += tr_decoy
        xyz_contact_ref += tr_ref

        # get the ideql rotation matrix
        # to superimpose the A chains
        rot_mat = get_rotation_matrix(
            xyz_contact_decoy,
            xyz_contact_ref,
            method=method)

        # rotate the entire fragment
        xyz_contact_decoy = transform.rotate(
            xyz_contact_decoy, rot_mat, center=self.origin)

        # compute the RMSD
        irmsd = self.get_rmsd(xyz_contact_decoy, xyz_contact_ref)

        # export the pdb for verifiactions
        if exportpath is not None:

            # update the sql database
            sql_decoy.update_xyz(
                xyz_contact_decoy, rowID=index_contact_decoy)
            sql_ref.update_xyz(
                xyz_contact_ref, rowID=index_contact_ref)

            sql_decoy.exportpdb(
                exportpath + '/irmsd_decoy.pdb',
                rowID=index_contact_decoy)
            sql_ref.exportpdb(
                exportpath + '/irmsd_ref.pdb',
                rowID=index_contact_ref)

        # close the db
        sql_decoy._close()
        sql_ref._close()

        return irmsd

    # get the rowID of all the atoms
    def get_izone_rowID(self, sql, izone, return_only_backbone_atoms=True):
        """Compute the index of the izone atoms.

        Args:
            sql (pdb2sql): database of the conformation
            izone (str): filename to store the zone
            return_only_backbone_atoms (bool, optional): Returns only
                the backbone atoms

        Returns:
            lis(int): index of the atoms in the zone

        Raises:
            FileNotFoundError: if the izone file is not found
        """
        # read the file
        if not os.path.isfile(izone):
            raise FileNotFoundError('i-zone file not found', izone)

        with open(izone, 'r') as f:
            data = f.readlines()

        # get the data out of it
        resData = {}
        for line in data:

            res = line.split()[1].split('-')[0]
            chainID, resSeq = res[0], int(res[1:])

            if chainID not in resData.keys():
                resData[chainID] = []

            resData[chainID].append(resSeq)

        # get the rowID
        index_contact = []

        for chainID, resSeq in resData.items():
            if return_only_backbone_atoms:
                index_contact += sql.get('rowID',
                                         chainID=chainID,
                                         resSeq=resSeq,
                                         name=['C',
                                               'CA',
                                               'N',
                                               'O'])
            else:
                index_contact += sql.get('rowID',
                                         chainID=chainID, resSeq=resSeq)

        return index_contact

    ##########################################################################
    #
    #   ROUTINE TO COMPUTE THE fnat USING PDB2SQL
    #
    ##########################################################################

[docs]    def compute_fnat_pdb2sql(self, cutoff=5.0):
        """Slow method to compute the FNAT of the conformation.

        Fnat is the fraction of reference interface contacts preserved
        in the interface of decoy. The interface is defined as any pair
        of heavy atoms from two chains within 5Å of each other.

        Args:
            cutoff (int, optional): cutoff for the contact atoms

        Returns:
            float: FNAT value

        See also:
            :meth:`compute_fnat_fast`
        """

        # create the sql
        sql_decoy = interface(self.decoy, fix_chainID=True)
        sql_ref = interface(self.ref, fix_chainID=True)
        chains = list(sql_ref.get_chains())
        if len(chains) != 2:
            raise ValueError(
                'exactly two chains are needed for irmsd calculation but we found %d' % len(chains), chains)

        # get the contact atoms
        residue_pairs_decoy = sql_decoy.get_contact_residues(
            cutoff=cutoff, return_contact_pairs=True, excludeH=True,
            chain1=chains[0], chain2=chains[1])
        residue_pairs_ref = sql_ref.get_contact_residues(
            cutoff=cutoff, return_contact_pairs=True, excludeH=True,
            chain1=chains[0], chain2=chains[1])

        # form the pair data
        data_pair_decoy = []
        for resA, resB_list in residue_pairs_decoy.items():
            data_pair_decoy += [(resA, resB) for resB in resB_list]

        # form the pair data
        data_pair_ref = []
        for resA, resB_list in residue_pairs_ref.items():
            data_pair_ref += [(resA, resB) for resB in resB_list]

        # find the umber of residue that ref and decoys hace in common
        nCommon = len(
            set(data_pair_ref).intersection(data_pair_decoy))

        # normalize
        fnat = nCommon / len(data_pair_ref)

        sql_decoy._close()
        sql_ref._close()

        return round(fnat, 6)

    ##########################################################################
    #
    #   HELPER ROUTINES TO HANDLE THE ZONE FILES
    #
    ##########################################################################
    @staticmethod
    def get_xyz_zone_backbone(pdb_file, resData, return_not_in_zone=False, name=['C', 'CA', 'N', 'O']):
        """Get the xyz of zone backbone atoms.

        Args:
            pdb_file (str): filename containing the pdb of the molecule
            resData (dict): information about the zone residues
            return_not_in_zone (bool, optional): Do we return the
                backbone atoms not in the zone and the chains used
                in the zone.

        Returns:
            list(float): XYZ of of backbone atoms in the zone.
        """

        # read the ref file
        data = pdb2sql.read_pdb(pdb_file)

        # get the xyz of the
        xyz_in_zone = []
        xyz_not_in_zone = []

        for line in data:
            if line.startswith('ATOM'):
                chainID = line[21]
                if chainID == ' ':
                    chainID = line[72]

                resSeq = int(line[22:26])
                atname = line[12:16].strip()

                x = float(line[30:38])
                y = float(line[38:46])
                z = float(line[46:54])

                if atname in name:
                    if chainID in resData.keys():
                        if resSeq in resData[chainID]:
                            xyz_in_zone.append([x, y, z])
                    else:
                        xyz_not_in_zone.append([x, y, z])

        if return_not_in_zone:
            return xyz_in_zone, xyz_not_in_zone

        else:
            return xyz_in_zone

    @staticmethod
    def get_data_zone_backbone(pdb_file, resData, return_not_in_zone=False, name=['C', 'CA', 'N', 'O']):
        """Get the data (chainID, resSeq, name) of backbone atoms in the zone.

        Args:
            pdb_file (str): filename containing the pdb of the molecule
            resData (dict): information about the zone residues
            return_not_in_zone (bool, optional): Do we return the atoms
                not in the zone and the chains used in the zone

        Returns:
            set(float): data of the backbone atoms in the zone
        """
        # read the ref file
        data = pdb2sql.read_pdb(pdb_file)

        # get the xyz of the
        data_in_zone = []
        data_not_in_zone = []

        for line in data:

            if line.startswith('ATOM'):

                chainID = line[21]
                if chainID == ' ':
                    chainID = line[72]

                resSeq = int(line[22:26])
                atname = line[12:16].strip()

                if atname in name:
                    if chainID in resData.keys():
                        if resSeq in resData[chainID]:
                            data_in_zone.append((chainID, resSeq, atname))
                    else:
                        data_not_in_zone.append((chainID, resSeq, atname))

        if return_not_in_zone:
            return set(data_in_zone), set(data_not_in_zone)

        else:
            return set(data_in_zone)

    @staticmethod
    def read_zone(zone_file):
        """Read the zone file.

        Args:
            zone_file (str): name of the file

        Returns:
            dict: Info about the residues in the zone

        Raises:
            FileNotFoundError: if the zone file is not found
        """
        # read the izone file
        if not os.path.isfile(zone_file):
            raise FileNotFoundError('zone file not found', zone_file)

        with open(zone_file, 'r') as f:
            data = f.readlines()

        # get the data out of it
        resData = {}
        for line in data:
            # line = zone A4-A4   for positive resNum
            # or line = zone A-4-A-4 for negative resNum
            # that happens for example in 2OUL

            # split the line
            res = line.split()[1].split('-')

            # if the resnum was positive
            # we have e.g res = [A4,A4]
            if len(res) == 2:
                res = res[0]
                chainID, resSeq = res[0], int(res[1:])

            # if the resnum was negative was negtive
            # we have e.g res = [A,4,A,4]
            elif len(res) == 4:
                chainID, resSeq = res[0], -int(res[1])

            if chainID not in resData.keys():
                resData[chainID] = []

            resData[chainID].append(resSeq)

        return resData

    @staticmethod
    def _get_xyz(pdb_file, index):
        """Get xyz using (chainID, resSeq, name) index.

        Args:
            pdb_file(file): pdb file or data
            index(set): set of index represeneted with (chainID, resSeq, name)

        Returns:
            list: list of xyz
        """
        data = pdb2sql.read_pdb(pdb_file)
        xyz = []

        for line in data:
            if line.startswith('ATOM'):
                chainID = line[21]
                if chainID == ' ':
                    chainID = line[72]

                resSeq = int(line[22:26])
                name = line[12:16].strip()

                x = float(line[30:38])
                y = float(line[38:46])
                z = float(line[46:54])

                if (chainID, resSeq, name) in index:
                    xyz.append([x, y, z])

        return xyz

    ##########################################################################
    #
    #   CAPRI categories and DockQ score
    #
    ##########################################################################
[docs]    @staticmethod
    def compute_CapriClass(fnat, lrmsd, irmsd, system='protein-protein'):
        """Compute CAPRI ranking classes.

        Note:
            Criteria of CAPRI classes:
            https://doi.org/10.1371/journal.pone.0161879
            https://doi.org/10.1002/prot.21804
            The protocol for classifying predicted model into the four CAPRI
            categories should start with those defining incorrect predictions.

        Args:
            fnat(float): fnat
            lrmsd(float): ligand rmsd
            irmsd(float ): interface rmsd
            system (str): the type of complex system.
                Defaults to 'protein-protein'.

        Returns:
            str: CAPRI rank class, i.e. high, medium, acceptable or incorrect.
        """

        if system == 'protein-protein':
            if fnat < 0.1 or (lrmsd > 10.0 and irmsd > 4.0):
                label = 'incorrect'
            elif 0.1 <= fnat < 0.3 and (lrmsd <= 10.0 or irmsd <= 4.0) or \
                (fnat >= 0.3 and lrmsd > 5.0 and irmsd > 2.0):
                label = 'acceptable'
            elif 0.3 <= fnat < 0.5 and (lrmsd <= 5.0 or irmsd <= 2.0) or \
                (fnat >= 0.5 and lrmsd > 1.0 and irmsd > 1.0):
                label = 'medium'
            elif fnat >= 0.5 and (lrmsd <= 1.0 or irmsd <= 1.0):
                label = 'high'
        else:
            warnings.warn(
                f'Invalid complex type {system} for CAPRI class calculation')

        return label

    # compute the DockQ score from the different elements
[docs]    @staticmethod
    def compute_DockQScore(fnat, lrmsd, irmsd, d1=8.5, d2=1.5):
        """Compute the DockQ Score.

        Args:
            Fnat (float): Fnat value
            lrmsd (float): lrmsd value
            irmsd (float): irmsd value
            d1 (float, optional): first coefficient for the DockQ
                calculations
            d2 (float, optional): second coefficient for the DockQ
                calculations

        Returns:
            float: dockQ value
        """

        def scale_rms(rms, d):
            return(1. / (1 + (rms / d)**2))

        dockq = 1. / 3 * \
            (fnat + scale_rms(lrmsd, d1) + scale_rms(irmsd, d2))
        return round(dockq, 6)

    ##########################################################################
    #
    #   clahses
    #
    ##########################################################################

[docs]    @staticmethod
    def compute_clashes(pdb, chain1='A', chain2='B'):
        """Compute number of atomic clashes.

        Note:
            Clashes were defined as contacts between nonhydrogen atoms
            separated by <3.0Å. Structural models where number of clashes
            was 2 SD away from the average are excluded for assessment in
            CAPRI. see ref:  https://doi.org/10.1002/prot.10393

        Args:
            pdb(file): pdb file or data
            chain1 (str): first chain ID. Defaults to 'A'.
            chain2 (str): second chain ID. Defaults to 'B'.

        Returns:
            int: number of atomic clashes.
        """
        db = interface(pdb)
        atom_contact_pairs = db.get_contact_atoms(
            cutoff=3.0, excludeH=True,
            return_contact_pairs = True,
            chain1=chain1, chain2=chain2)
        db._close()
        nclash = 0
        for v in atom_contact_pairs.values():
            nclash += len(v)
        return nclash

    ##########################################################################
    #
    #   ROUTINES TO ACTUALY SUPERPOSE THE MOLECULES
    #
    ##########################################################################

    # compute the RMSD of two sets of points
    @staticmethod
    def get_rmsd(P, Q):
        """compute the RMSD.

        Args:
            P (np.array(nx3)): position of the points in the first
                molecule
            Q (np.array(nx3)): position of the points in the second
                molecule

        Returns:
            float: RMSD value
        """
        n = len(P)
        return round(np.sqrt(1. / n * np.sum((P - Q)**2)), 3)