Source code for cfoldseeker.classes

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import operator
import itertools as it
import logging
import shutil
import polars as pl
import networkx as nx
from abc import ABC, abstractmethod
from pathlib import Path
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
from cblaster.classes import Session
from cblaster.plot import plot_session
from cblaster.plot_clusters import plot_clusters


LOG = logging.getLogger(__name__)


[docs] class Hit: """ Represents a single protein structure hit from a FoldSeek search. This class encapsulates information about a homologous protein structure match, including its database identifiers, sequence similarity metrics, genomic location, and taxonomic data. Attributes: query (str): ID of the homologous query protein. db_id (str): ID of the hit in its structure database. db (str): Structure database the hit was found in. crossref_id (list): ID used for cross-referencing (either KEGG or GenPept ID). crossref_method (str): Method used for cross-referencing (either KEGG, GenPept, WGS-GenPept, or local). name (str): Annotation or description of the hit. taxon_name (str): Name of the taxon in which this hit was found. taxon_id (int): Identifier of the taxon in which this hit was found. evalue (float): E-value of the FoldSeek hit. score (int): FoldSeek alignment score. seqid (float): Sequence identity percentage with the query protein. qcov (float): Query coverage percentage. tcov (float): Target coverage percentage. scaff (str): RefSeq or GenBank ID of the scaffold encoding the hit. coords (list): Genomic coordinates of the encoding gene's exons. strand (str): DNA strand the encoding gene is located on ('+' or '-'). """ def __init__(self, db_id, query, crossref_id = [], crossref_method = '', name = '', taxon_name = '', taxon_id = 0, db = "", evalue = 1, score = 0, seqid = 0, qcov = 0, tcov = 0, scaff = '', coords = [], strand = ''): """ Initialise a Hit object with FoldSeek search results and genomic information. Args: db_id (str): ID of the hit in the structure database. query (str): ID of the homologous query protein. crossref_id (list, optional): Cross-reference ID. Defaults to []. crossref_method (str, optional): Cross-referencing method. Defaults to ''. name (str, optional): Hit annotation. Defaults to ''. taxon_name (str, optional): Taxon name. Defaults to ''. taxon_id (int, optional): Taxonomic ID. Defaults to 0. db (str, optional): Structure database name. Defaults to "". evalue (float, optional): E-value threshold. Defaults to 1. score (int, optional): FoldSeek score. Defaults to 0. seqid (float, optional): Sequence identity percentage. Defaults to 0. qcov (float, optional): Query coverage percentage. Defaults to 0. tcov (float, optional): Target coverage percentage. Defaults to 0. scaff (str, optional): Scaffold ID. Defaults to ''. coords (list, optional): Exon coordinates. Defaults to []. strand (str, optional): DNA strand. Defaults to ''. """ self.query: str = query #ID of the homologous query protein # ID attributes self.db_id: str = db_id #ID of the hit in its DB self.db: str = db #Structure database the hit was found in self.crossref_id: list = crossref_id #ID used for crossreffing (either ID from KEGG or GenPept) self.crossref_method: str = crossref_method #Method used for crossreffing (either KEGG or GenPept) # FoldSeek hit properties self.name: list = name #annotation self.taxon_name: str = taxon_name #name of the taxon in which this hit was found self.taxon_id: int = taxon_id self.evalue: float = evalue #evalue of the FoldSeek hit self.score: int = score #FoldSeek score self.seqid: float = seqid #Sequence identity with the query protein self.qcov: float = qcov #Query coverage self.tcov: float = tcov #Target coverage # Genomic properties self.scaff: str = scaff #RefSeq or GenBank ID of the scaffold encoding the hit self.coords: list = coords #list of genomic coordinates of the encoding gene's exons self.strand: str = strand #DNA strand the encoding gene is part from def __repr__(self) -> str: return f"{self.query} Hit {self.db_id}\t {self.scaff} {self.start()}-{self.end()} ({self.strand})"
[docs] def as_dict(self) -> dict: """ Convert the Hit object to a dictionary. Returns: dict: Dictionary with all Hit attributes; coordinates are formatted as double-dot-separated exon pairs joined by commas. """ return {'query': self.query, 'db_id': self.db_id, 'db': self.db, 'crossref_id': self.crossref_id, 'crossref_method': self.crossref_method, 'name': self.name, 'taxon_name': self.taxon_name, 'taxon_id': self.taxon_id, 'evalue': self.evalue, 'score': self.score, 'seqid': self.seqid, 'qcov': self.qcov, 'tcov': self.tcov, 'scaff': self.scaff, 'coords': ','.join(['..'.join([str(i) for i in seq]) for seq in self.coords]), 'strand': self.strand}
# Returns start coordinate of the first exon
[docs] def start(self) -> int | None: """ Return the start coordinate of the first exon. Returns: int | None: Minimum genomic coordinate across all exons, or None if no coordinates are defined. """ try: return min(it.chain(*self.coords)) except ValueError: return None
# Returns end coordinate of the last exon
[docs] def end(self) -> int | None: """ Return the end coordinate of the last exon. Returns: int | None: Maximum genomic coordinate across all exons, or None if no coordinates are defined. """ try: return max(it.chain(*self.coords)) except ValueError: return None
# Returns the sum of the exon lengths
[docs] def length(self) -> int: """ Return the total length in base pairs of all exons. Returns: int: Sum of lengths across all exons, calculated as (end - start + 1) for each exon. """ return sum([abs(c[1] - c[0] + 1) for c in self.coords])
# Returns the intergenic distance between two genes. Negative if they overlap
[docs] def intergenic_distance(self, other_hit: 'Hit') -> int: """ Calculate the intergenic distance between this hit and another hit. For genes on the same scaffold, computes the distance between the end of the upstream gene and the start of the downstream gene. If genes overlap, returns the negative of the length of the overlapping gene. Args: other_hit (Hit): The other Hit object to measure distance to. Returns: int: Intergenic distance in base pairs (positive for gaps, negative for overlaps). Returns the negative of the length of the smaller gene in case of a full overlap. """ first = min([self, other_hit], key = operator.methodcaller('start')) first_start = first.start() first_end = first.end() last = max([self, other_hit], key = operator.methodcaller('start')) last_start = last.start() last_end = last.end() # In case of full overlap, return the negative of the length of the smaller gene if last_start >= first_start and last_end <= first_end: return -last.length() # In case of at most a partial overlap, return the intergenic distance. else: return last_start - first_end
# Checks whether two hits are at exactly the same genomic coordinates
[docs] def same_location(self, other_hit: 'Hit') -> bool: """ Check if two hits are at exactly the same genomic coordinates. Args: other_hit (Hit): The other Hit object to compare. Returns: bool: True if both hits are on the same scaffold and their genomic coordinates completely overlap, False otherwise. """ first = min([self, other_hit], key = operator.methodcaller('start')) last = max([self, other_hit], key = operator.methodcaller('start')) return last.start() == first.start() and last.end() == first.end() and first.scaff == last.scaff
[docs] class Cluster: """ Represents a gene cluster containing one or more protein hits. A cluster groups proximal hits on the same genomic scaffold that meet specified clustering criteria. All hits in a cluster are expected to share the same scaffold and taxon. Attributes: hits (list[Hit]): List of Hit objects in the cluster. number (int): Cluster identifier/rank number. score (int): Cumulative score of all hits in the cluster. start (int): Minimum genomic coordinate across all hits. end (int): Maximum genomic coordinate across all hits. length (int): Total length in base pairs of all exons across hits. scaff (str): Scaffold/contig ID (taken from first hit). taxon_id (str): Taxonomic ID (taken from first hit). taxon_name (str): Taxonomic name (taken from first hit). """ def __init__(self, hits, number = 0): """ Initialise a Cluster object with a list of hits. Args: hits (list[Hit]): List of Hit objects to cluster together. number (int, optional): Cluster identifier number. Defaults to 0. Raises: ValueError: If hits have non-unique scaffold, taxon ID, or taxon name attributes. """ self.hits: list[Hit] = hits self.number: int = number # Calculate cluster scores by summing hit scores self.score: int = sum([h.score for h in self.hits]) # Cluster coordinates are defined as the most extreme coordinates self.start: int = min([h.start() for h in self.hits]) self.end: int = max([h.end() for h in self.hits]) # Cluster length is defined by the sum of hits' exons self.length: int = sum([h.length() for h in self.hits]) # Take over the shared scaffold ID if it's unique among the hits common_scaff = {h.scaff for h in self.hits} if len(common_scaff) > 1: msg = f"Different scaffolds found among the gene hits in cluster {' '.join([h.db_id for h in self.hits])}" LOG.error(msg) raise ValueError(msg) else: self.scaff: str = self.hits[0].scaff # Same for taxon ID common_taxon_id = {h.taxon_id for h in self.hits} if len(common_taxon_id) > 1: msg = f"Different taxon IDs found among the gene hits in cluster {' '.join([h.db_id for h in self.hits])}." LOG.error(msg) raise ValueError(msg) else: self.taxon_id: str = self.hits[0].taxon_id common_taxon_name = {h.taxon_name for h in self.hits} if len(common_taxon_name) > 1: msg = f"Different taxon names found ammong the gene hits in cluster {' '.join([h.db_id for h in self.hits])}." LOG.error(msg) raise ValueError(msg) else: self.taxon_name: str = self.hits[0].taxon_name return None def __repr__(self) -> str: return f"Cluster {self.number}: {len(self.hits)} proteins from {self.scaff} ({self.start} - {self.end}), ({self.strand})\tScore: {self.score}"
[docs] def as_dict(self) -> dict: """ Convert the Cluster object to a dictionary. Returns: dict: Dictionary with cluster attributes including comma-separated hit IDs and all genomic coordinates. """ return {'hits': ','.join([h.db_id for h in self.hits]), 'number': self.number, 'score': self.score, 'start': self.start, 'end': self.end, 'length': self.length, 'scaff': self.scaff, 'taxon_id': self.taxon_id, 'taxon_name': self.taxon_name}