Source code for cfoldseeker.remote

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re
import json
import logging
import warnings
import polars as pl
import itertools as it
from pathlib import Path
from copy import deepcopy
from tqdm.contrib.concurrent import thread_map
from tqdm.contrib.logging import logging_redirect_tqdm
from kegg_pull.pull import MultiProcessMultiplePull
from Bio import Entrez

from cfoldseeker.classes import Hit, Search
from cfoldseeker.communication import (submit_foldseek_query, retrieve_foldseek_results, 
                                       pull_dict_from_unisave, pull_from_ena)
from cfoldseeker.remote_parsers import (extract_genomic_information_ena, 
                                        extract_genomic_information_kegg, 
                                        extract_scaffold_mapping_kegg)


LOG = logging.getLogger(__name__)



[docs]
class RemoteSearch(Search):
    """
    Subclass executing the workflow for gene cluster identification from remote protein searches.
    
    Extends the Search base class to perform FoldSeek-based searches against remote
    databases, parse results, and cross-reference hits to genomic data.
    Uses a local copy of the UniProt mapping table to retrieve genomic coordinates for each protein
    with fallback strategies using KEGG and ENA.
    
    New attributes:
        mapping_table: Polars LazyFrame containing UniProt ID cross-references to
            various databases (KEGG, EMBL-CDS, etc.).
            
    Inherits from:
        Search: Base class providing the cluster identification and output generation capabilities.
        
    See also:
        Local: Sister class providing the search and parsing capabilities for local database searches
    """
    
    def __init__(self, query, mapping_table_path, params = {}, hits = [], clusters = [], 
                 output_flags = {}, output_folder = Path('.'), temp_folder = Path('.')):
        """
        Initialise a RemoteSearch instance.
        
        Sets up the RemoteSearch object by calling the parent Search class
        initialiser and scanning the UniProt ID mapping table from a TSV file.
        
        Args:
            query: Dictionary mapping query protein names to file paths.
            mapping_table_path: Path to a TSV file containing UniProt cross-references.
                Expected format: three columns (Uniprot, DB, ID) without header.
            params: Dictionary of search parameters and configuration options.
                Defaults to an empty dictionary.
            hits: List of Hit objects from previous searches. Defaults to empty list.
            clusters: List of identified gene clusters. Defaults to empty list.
            output_flags (dict, optional): Output parameters dictionary. Defaults to {}.
            output_folder: Path object for final output directory. Defaults to current
                directory ('.').
            temp_folder: Path object for temporary file directory. Defaults to current
                directory ('.').
        
        Returns:
            None
        """
        
        super().__init__(query, params, hits, clusters, output_flags, output_folder, temp_folder)
        
        LOG.debug(f'Scanning ID mapping table from {str(mapping_table_path)}')
        self.mapping_table: pl.LazyFrame = pl.scan_csv(mapping_table_path, has_header = False, separator = "\t",
                                                       new_columns = ['Uniprot', 'DB', 'ID'])
                
        return None
    
    
    def __repr__(self) -> str:
        return f"Remote Search of {','.join(list(self.query.keys()))} in {self.params['db']} with {len(self.clusters)} clusters identified"
    
    
    def _sanitise_hit_attr(self, hits: list[Hit], attr: str) -> list[Hit]:
        """
        Sanitise the hit list for the provided attribute.
        
        Sanitising means standardising the hit table. In case of multiple values for a given attribute,
        it splits a hit instance into multiple hit instances that are identical except for that exact
        crossref attribute. The split hits are first introduced in a sublist replacing the attribute value
        in the old hit instance. Then the full hit list is flattened. In case of an absent attribute, the hit is discarded.
        
        Args:
            hits (list[Hit]): hits that must be sanitised.
            attr (str): name of the attribute that must be sanitised in each hit
            
        Mutates:
            Hit objects in list: split or unpack attribute values in case of multiple ones
            hits: empty attribute values make an instance be discarded from the list
        
        Returns:
            hits (list[Hit]): sanitised hits
        """
        sanitised_hits = []
        split_counter = 0
        discard_counter = 0
        untouched_counter = 0
        
        # Loop over all hits, but keep track of the index
        for h in hits:
            # Determine whether we should split a record (# attribute values > 1)
            nb_copies = len(getattr(h, attr))
            # If we need to split a record, make as much copies as there are attribute values 
            # and distribute these over each split instance's attribute value
            if nb_copies > 1:
                split_counter += 1
                split_records = []
                copy_counter = 0
                while copy_counter < nb_copies:
                    new_record = deepcopy(h)
                    setattr(new_record, attr, getattr(h, attr)[copy_counter])
                    split_records.append(new_record)
                    copy_counter += 1
                    
                # Insert the list of the split hit instances at the index of the old record
                sanitised_hits.append(split_records)
                
            # If we don't need to split the record, just unpack the singleton list
            elif nb_copies == 1:
                untouched_counter += 1
                setattr(h, attr, getattr(h, attr)[0])
                
                # Insert the unpacked list at the index of the old record
                sanitised_hits.append([h])
                
            # If the attribute is empty, don't include it in the new hit set
            else:
                discard_counter += 1
                continue
        
        # Flatten the full hit list again
        hits = list(it.chain(*sanitised_hits))
        
        LOG.debug(f"{split_counter} hit records have been exploded while sanitising.")
        LOG.debug(f"{discard_counter} hit records have been discarded while sanitising.")
        LOG.debug(f"{untouched_counter} hit records have been left untouched while sanitising.")
        
        return hits
    
    

[docs]
    def run_foldseek(self) -> None:
        """
        Submit protein queries to FoldSeek and retrieve the results.
        
        Sends all query structures to the FoldSeek webserver in parallel, collects
        submission tickets, monitors job status, and downloads completed results.
        Stores raw JSON results in the temporary folder.
        
        Returns:
            None
        """

        # First submit all query proteins to FoldSeek
        LOG.info('Submitting queries to FoldSeek webserver')
        query_foldseek = lambda x: submit_foldseek_query(x, self.params['db'], self.params['taxfilters'])
        query_paths = list(self.query.values())
        query_labels = list(self.query.keys())
        
        with logging_redirect_tqdm(loggers = [LOG]):
            runs = thread_map(query_foldseek, query_paths, 
                              max_workers = self.params['max_workers'],
                              leave = False,
                              disable = self.params['no_progress'])
            tickets = dict(zip(query_labels, runs))
        all_job_ids = [ticket['id'] for ticket in tickets.values()]
                
        # Then wait for the results and retrieve them automatically when completed
        LOG.info('Checking status and retrieving results when ready')
        with logging_redirect_tqdm(loggers = [LOG]):
            runs_results = thread_map(retrieve_foldseek_results, all_job_ids,
                                      max_workers = self.params['max_workers'],
                                      leave = False,
                                      disable = self.params['no_progress'])
            all_results = dict(zip(self.query.keys(), runs_results))
        
        LOG.info('All queries have been processed and downloaded!')
        self.hits = all_results
        
        LOG.info('Saving results in temporary folder')
        for query,result in all_results.items():
            with open((self.TEMP_DIR / f"foldseek_result_{query}").with_suffix('.json'), "w") as handle:
                json.dump(result, handle)
        
        return None

    
    

[docs]
    def passes_criteria(self, hit: Hit):
        """
        Check if a hit passes the criteria set for this search.
        
        Returns:
            (bool): Dit the hit pass all criteria?
        """
        criteria_passes = (hit.evalue <= self.params['max_eval'],
                           hit.score >= self.params['min_score'],
                           hit.seqid >= self.params['min_seqid'],
                           hit.qcov >= self.params['min_qcov'],
                           hit.tcov >= self.params['min_tcov'])
        
        return all(criteria_passes)

    
    

[docs]
    def identify_hits(self) -> None:
        """
        Parse FoldSeek results and create Hit objects for hits passing the predefined criteria.
        
        Extracts hit information from the raw FoldSeek results, and applies filtering
        thresholds based on e-value, bit score, sequence identity, and coverage.
        Creates Hit objects for hits meeting all criteria and removes redundant
        hits found in multiple databases, keeping only the first instance per
        UniProt ID.
        
        Returns:
            None
            
        Raises:
            RuntimeError: If the hit list is empty after applying the criteria
            
        Note:
            Logs a warning when there are no hits for a certain query and DB pair.
        """
        
        LOG.debug('Applying the following hit filtering thresholds:')
        LOG.debug(f'bitscore >= {self.params["min_score"]}')
        LOG.debug(f'query coverage >= {self.params["min_qcov"]}')
        LOG.debug(f'target coverage >= {self.params["min_tcov"]}')
        LOG.debug(f'minimum sequence identity threshold >= {self.params["min_seqid"]}')
        LOG.debug(f'maximum evalue threshold <= {self.params["max_eval"]}')
        
        ## Parse the FoldSeek results
        all_hits = []
        # Each query requires parsing a separate result
        for query, task in self.hits.items():
            LOG.debug(f"Parsing FoldSeek results for {query}")
            # In each result, multiple databases may have been queried
            for results_by_db in task['results']:
                db = results_by_db['db']
                # Since every query has been submitted separately there is only one set of alignments to parse per result set
                alignments = results_by_db['alignments']
                
                # Warn if no hit has been found in this DB
                if len(alignments) == 0:
                    LOG.warning(f'No hit has been found for query {query} in database {db}')
                    continue
                
                for hit_entry in alignments[0]:
                    target = hit_entry['target'] # uniprot ID and name
                    if 'afdb' in db:
                        db_id = target.split('-')[1]
                    elif db == 'pdb100':
                        db_id = target[:4].upper() + '.' + target[22]
                    else:
                        LOG.error(f"Unsupported DB ({db}) for ID {target}")
                        continue
                    
                    # Gather the information for each hit
                    name = ' '.join(target.split(' ')[1:])
                    taxon_name = hit_entry['taxName'] # taxon name
                    taxon_id = hit_entry['taxId'] # taxon ID
                    evalue = float(hit_entry['eval']) # FoldSeek e-value
                    score = int(hit_entry['score']) # FoldSeek hit score
                    seqid = float(hit_entry['seqId']) # Sequence identity with the query protein
                    qcov = (int(hit_entry['qEndPos']) - int(hit_entry['qStartPos'])) / int(hit_entry['qLen']) * 100 # Query coverage (in %)
                    tcov = (int(hit_entry['dbEndPos']) - int(hit_entry['dbStartPos'])) / int(hit_entry['dbLen']) * 100 # Target coverage (in %)
                    
                    # Create Hit object and add it if it passes all criteria
                    hit = Hit(db_id, query, name = name, taxon_name = taxon_name, taxon_id = taxon_id, db = db,
                                  evalue = evalue, score = score, seqid = seqid, qcov = qcov, tcov = tcov)
                    if self.passes_criteria(hit):
                        all_hits.append(hit)
        
        ## Stop if hit list is empty
        if len(all_hits) == 0:
            msg = "No hits pass the criteria!"
            LOG.error(msg)
            raise RuntimeError(msg)
            
        ## Filter out redundant hit instances of hits found in multiple databases
        # Group by Uniprot ID and DB to find potentially redundant hits
        LOG.debug('Removing redundant hits found in multiple DBs')
        hit_db_groups = {}
        for hit in all_hits:
            if (hit.db_id, hit.db) in hit_db_groups.keys():
                hit_db_groups[(hit.db_id, hit.db)].append(hit)
            else:
                hit_db_groups[(hit.db_id, hit.db)] = [hit]
                
        # Keep the first DB hit instance for every Uniprot-DB group
        filtered_hits = []
        all_hit_ids = {h.db_id for h in all_hits}
        for hit_id in all_hit_ids:
            hits_this_hit_id = [g for g in hit_db_groups.keys() if g[0] == hit_id] # all hits with this hit ID
            db_hits_to_keep = hits_this_hit_id[0] # Keep the first one
            filtered_hits.append(hit_db_groups[db_hits_to_keep])
                
        # Flatten out and save
        filtered_hits = list(it.chain(*filtered_hits))
        self.hits = filtered_hits
        
        LOG.info(f'Found {len(filtered_hits)} gene hits.')
        
        return None

    
    

[docs]
    def prepare_mapping_dict(self, all_uniprot_ids: list, db: str) -> dict:
        """
        Extract a mapping dictionary from the local UniProt mapping table.
        
        Filters the full UniProt cross-reference LazyFrame to extract IDs
        for a specific target database, grouping multiple cross-references
        per UniProt ID into a dictionary of lists.
        
        Args:
            all_uniprot_ids: List of UniProt accession numbers to extract.
            db: Target database name (e.g., 'KEGG', 'EMBL-CDS').
        
        Returns:
            A dictionary mapping UniProt IDs (keys) to lists of cross-reference
            IDs in the target database (values).
        """
        LOG.debug(f"Preparing a mapping dictionary from UniProt to {db}")
        
        mapping = self.mapping_table.filter(pl.col('Uniprot').is_in(all_uniprot_ids)
                                        ).filter(pl.col('DB') == db).drop('DB')
        mapping = mapping.group_by('Uniprot').all()
        mapping = mapping.collect() # Materialise the LazyFrame into a DataFrame
        mapping = dict(zip(mapping['Uniprot'], mapping['ID'])) # Create a mapping dictionary
        
        return mapping

    
    

[docs]
    def crossref_afdb_via_kegg(self, afdb_hits: list[Hit]) -> list[Hit]:
        """
        Cross-reference AFDB hits to KEGG IDs using the UniProt mapping table
        
        Retrieves the KEGG IDs for all hits and updates the the crossref ID and
        method if any.
        
        Args:
            afdb_hits: List of Hit objects without genomic information and cross-references
            
        Mutates:
            Hit objects in afdb_hits: Fills cross-reference ID and method attributes
        
        Returns:
            hits_failed_legg (list[Hit]): list of Hits that have no cross-reference to KEGG in 
            the UniProt mapping table. These need to be processed by a different cross-referencing
            method.
        """
        all_uniprot_ids = list({h.db_id for h in afdb_hits})
        hits_failed_kegg = []
        
        # Extract a mapping table for KEGG IDs from the Uniprot crossref mapping table
        all_uniprot_kegg = self.prepare_mapping_dict(all_uniprot_ids, 'KEGG')
        
        # Fill all KEGG IDs
        for h in afdb_hits:
            if h.db_id in all_uniprot_kegg.keys():
                h.crossref_id = all_uniprot_kegg[h.db_id]
                h.crossref_method = "KEGG"
            else:
                hits_failed_kegg.append(h)
        
        return hits_failed_kegg

    
    

[docs]
    def crossref_afdb_via_genpept(self, afdb_hits: list[Hit]) -> list[Hit]:
        """
        Cross-reference AFDB hits to GenPept IDs in ENA using the UniProt mapping table
        
        Retrieves the GenPept IDs in ENA for all hits and updates the the crossref ID and
        method if any.
        
        Args:
            afdb_hits: List of Hit objects without genomic information and cross-references
            
        Mutates:
            Hit objects in afdb_hits: Fills cross-reference ID and method attributes
        
        Returns:
            hits_failed_legg (list[Hit]): list of Hits that have no cross-reference to GenPept in 
            the UniProt mapping table. These need to be processed by a different cross-referencing
            method.
        """
        all_uniprot_ids = list({h.db_id for h in afdb_hits})
        hits_failed_genpept = []
        
        # Extract a mapping table for GenPept IDs from the Uniprot crossref mapping table
        all_uniprot_genpept = self.prepare_mapping_dict(all_uniprot_ids, 'EMBL-CDS')
        
        # Fill the GenPept IDs
        for h in afdb_hits:
            if h.db_id in all_uniprot_genpept.keys():
                genpept_id = all_uniprot_genpept[h.db_id]
                
                # Filter out empty crossrefs (often mRNA records)
                genpept_id = [i for i in genpept_id if i != '-']
                if len(genpept_id) == 0:
                    hits_failed_genpept.append(h)
                    continue
                
                h.crossref_id = genpept_id
                h.crossref_method = "GenPept"
            else:
                hits_failed_genpept.append(h)
                
        return hits_failed_genpept

    
    

[docs]
    def crossref_afdb_via_wgs_genpept(self, afdb_hits: list[Hit]) -> list[Hit]:
        """
        Cross-reference AFDB hits to WGS-GenPept IDs in UniSave using the UniSave API.
        
        Retrieves the WGS-GenPept IDs in UniSave for all hits and updates the the crossref ID and
        method if any.
        
        Args:
            afdb_hits: List of Hit objects without genomic information and cross-references
            
        Mutates:
            Hit objects in afdb_hits: Fills cross-reference ID and method attributes
        
        Returns:
            hits_failed_legg (list[Hit]): list of Hits that have no cross-reference to WGS-GenPept in UniSave.
            These need to be processed by a different cross-referencing method.
        """
        all_uniprot_ids = list({h.db_id for h in afdb_hits})
        hits_failed_wgs_genpept = []
        
        # Pull all UniSave records
        all_unisaves = pull_dict_from_unisave(all_uniprot_ids,
                                              max_workers = self.params['max_workers'],
                                              no_progress = self.params['no_progress'])
        
        # Fill the WGS-GenPept IDs
        for h in afdb_hits:
            unisave_record = all_unisaves[h.db_id]
            wgs_genpept_lines = [l for l in unisave_record.split('\n') 
                                 if 'DR   EMBL' in l 
                                 and 'NOT_ANNOTATED_CDS' not in l]
            
            # Skip if there is no cross-reference information
            if len(wgs_genpept_lines) == 0:
                hits_failed_wgs_genpept.append(h)
                continue
            
            # Extract what should be the WGS-GenPept ID
            wgs_genpept_id = wgs_genpept_lines[0].split(';')[2].strip()
            
            # Check if it has the right format
            if len(re.findall(r'[A-Z0-9]+\.[1-9]', wgs_genpept_id)) == 0:
                hits_failed_wgs_genpept.append(h)
                continue
            
            h.crossref_id = [wgs_genpept_id]
            h.crossref_method = "WGS-GenPept"
            
        return hits_failed_wgs_genpept

    
    

[docs]
    def pull_and_parse_kegg_records(self, afdb_hits: list[Hit]) -> list[Hit]:
        """
        Pull and parse the KEGG records associated with an AFDB hit. Update the hit attributes.
        
        Pulls the KEGG records associated with an AFDB hit. It immediately extracts the scaffold and strand
        information, and the genomic coordinates from the record and updates the hit's attributes accordingly.
        
        Args:
            afdb_hits: List of Hit objects with cross-references
        
        Mutates:
            Hit objects in afdb_hits: Fills scaffold, strand and coordinates attributes
            
        Returns:
            processed_hits (list[Hit]): Hit objects with updated genomic location attributes retrieved from
            this cross-referencing method.
        """
        processed_hits = []
        pull = MultiProcessMultiplePull(n_workers = self.params['max_workers'])
        
        # KEGG Gene IDs
        all_kegg_gene_ids = list({h.crossref_id for h in afdb_hits if h.crossref_method == 'KEGG'})
        LOG.info(f'Going to pull {len(all_kegg_gene_ids)} KEGG Gene entries')
        _, gene_records = pull.pull_dict(all_kegg_gene_ids)
            
        # KEGG Genome IDs
        all_kegg_genome_ids = list({'genome:' + i.split(':')[0] for i in all_kegg_gene_ids})
        LOG.info(f'Going to pull {len(all_kegg_genome_ids)} KEGG Genome entries')
        _, genome_records = pull.pull_dict(all_kegg_genome_ids)
        genome_records = {k.split(':')[-1]: v for k,v in genome_records.items()}
        
        # Extract scaffold and coordinate information from the pulled KEGG records
        LOG.debug('Parsing downloaded KEGG entries')
        genomic_info = {k: extract_genomic_information_kegg(v) for k,v in gene_records.items()}
        scaffold_mappings = {k: extract_scaffold_mapping_kegg(v) for k,v in genome_records.items()}
        
        # Pass the extracted information on to the Hit objects
        LOG.debug('Extract CDS coordinates')
        for h in afdb_hits:
            try:
                position_info = genomic_info[h.crossref_id]
                scaffold_info = scaffold_mappings[h.crossref_id.split(':')[0]]
            except KeyError:
                continue
            if len(position_info) == 0 or len(scaffold_info) == 0:
                continue
            h.coords = position_info['coords']
            h.strand = position_info['strand']
            h.scaff = scaffold_info[position_info['scaffold']]
            
            processed_hits.append(h)
            
        return processed_hits

    
    

[docs]
    def pull_and_parse_genpept_records(self, afdb_hits: list[Hit]) -> list[Hit]:
        """
        Pull and parse the (WGS-)GenPept records associated with an AFDB hit. Update the hit attributes.
        
        Pulls the (WGS-)GenPept records associated with an AFDB hit. It immediately extracts the scaffold and strand
        information, and the genomic coordinates from the record and updates the hit's attributes accordingly.
        
        Args:
            afdb_hits: List of Hit objects with cross-references
        
        Mutates:
            Hit objects in afdb_hits: Fills scaffold, strand and coordinates attributes
            
        Returns:
            processed_hits (list[Hit]): Hit objects with updated genomic location attributes retrieved from
            this cross-referencing method.
        """
        processed_hits = []
        
        all_genpept_gene_ids = list({h.crossref_id for h in afdb_hits if "GenPept" in h.crossref_method})
        
        # Pull GenPept ENA records
        LOG.info(f'Going to pull {len(all_genpept_gene_ids)} ENA GenPept entries')
        ena_records_pulled = thread_map(pull_from_ena, all_genpept_gene_ids,
                                        max_workers = self.params['max_workers'],
                                        leave = False,
                                        disable = self.params['no_progress'])
        ena_records = dict(zip(all_genpept_gene_ids, ena_records_pulled))
            
        # Extract scaffold and coordinate information from the pulled ENA records
        LOG.debug("Parsing downloaded GenPept entries")
        genomic_info = {k: extract_genomic_information_ena(v) for k,v in ena_records.items()}
        # Pass the extracted information on the Hit objects
        LOG.debug('Extract CDS coordinates')
        for h in afdb_hits:
            try:
                position_info = genomic_info[h.crossref_id]
                if position_info == None:
                    continue
                h.scaff = position_info['scaffold']
                h.coords = position_info['coords']
                h.strand = position_info['strand']
            except KeyError:
                continue
            
            processed_hits.append(h)
            
        return processed_hits

    
    

[docs]
    def update_version_digits(self, processed_hits: list[Hit], max_attempts: int = 3) -> list[Hit]:
        """
        Update the version digits of the scaffold IDs of every hit.
        
        Adds or updates the version digit of the scaffold ID of every hit. Retrieves the
        latest scaffold ID from the NCBI Entrez API with retry logic, and updates the hit's scaffold
        IDs accordingly.
        
        Args:
            processed_hits (list[Hit]): hits with filled cross-reference and genomic location attributes
            max_attempts (int): Maximum number of times to try getting the most recent version digits from Entrez.
        
        Mutates:
            Hit objects in processed_hits: Updates the scaffold attribute with the most recent version digit.
        
        Returns:
            processed_hits (list[Hit]): hits with an updated version digit in the scaffold attribute of every hit.            
        """
        # Get scaffold IDs with version digits using the NCBI Entrez API.
        all_old_scaffold_ids = [h.scaff for h in processed_hits]
        with warnings.catch_warnings(action = "ignore"):
            for attempt in range(max_attempts):
                try:
                    with Entrez.efetch(db = 'nucleotide',
                                       id = all_old_scaffold_ids,
                                       rettype = 'acc',
                                       retmode = 'text') as handle:
                        all_new_scaffold_ids = [l.rstrip() for l in handle.readlines()]
                    break
                except:
                    if attempt < max_attempts:
                        LOG.warning(f'Failed getting version digits from NCBI in attempt {attempt+1}. Max. attempts: {max_attempts}')
                    else:
                        msg = f'Failed getting version digits from NCBI in {max_attempts} attempts!'
                        LOG.error(msg)
                        raise RuntimeError(msg)
        
        ## Make mapping between old scaffold IDs and new ones
        version_mappings = {}
        for id_without in all_old_scaffold_ids:
            mapped_id = next((id_with for id_with in all_new_scaffold_ids if id_with.startswith(id_without)), None)
            if mapped_id:
                version_mappings[id_without] = mapped_id
        
        ## Replace the scaffold IDs
        for h in processed_hits:
            if h.scaff in version_mappings.keys():
                h.scaff = version_mappings[h.scaff]
                
        return processed_hits

    
                

[docs]
    def crossref_afdb(self) -> list:
        """
        Cross-reference AFDB hits and retrieve genomic neighbourhood information.
        
        Systematically cross-references AlphaFold DB (AFDB) hits to genomic data
        through three methods: KEGG IDs from KEGG, GenPept IDs from ENA, and WGS-GenPept IDs
        from UniSave. For each hit, extracts scaffold IDs, CDS coordinates, and strand information.
        Updates scaffold IDs to their latest versions using NCBI Entrez.
        
        Returns:
            A list of Hit objects with populated genomic information (scaffold,
            coordinates, and strand data).
        """
        
        ### Identify the AFDB hits
        all_afdb_hits = [h for h in self.hits if 'afdb' in h.db]
            
        ### Get the crossreffing IDs            
        ## Method 1: try to get KEGG IDs
        LOG.info('Crossref method 1: Searching crossref IDs in KEGG')
        hits_failed_kegg = self.crossref_afdb_via_kegg(all_afdb_hits)
        LOG.info(f'{len(hits_failed_kegg)} hits have no crossref in KEGG.')
        
        ## Method 2: try to get GenPept IDs for the hits that did not get a KEGG ID.
        LOG.info('Crossref method 2: Searching crossref IDs in GenPept')
        hits_failed_genpept = self.crossref_afdb_via_genpept(hits_failed_kegg)
        LOG.info(f'..., of which {len(hits_failed_genpept)} hits have no crossref in GenPept.')
        
        ## Method 3: try to get WGS GenPept IDs for the hits that failed the previous two methods
        LOG.info('Crossref method 3: Searching crossref IDs in WGS-GenPept')
        hits_failed_wgs_genpept = self.crossref_afdb_via_wgs_genpept(hits_failed_genpept)
        LOG.info(f'..., of which {len(hits_failed_wgs_genpept)} hits have no crossref in WGS-GenPept.')
            
        ## Split records with multiple crossrefs and discard the ones without crossref
        LOG.debug("Sanitising the hit list")
        all_afdb_hits = self._sanitise_hit_attr(all_afdb_hits, 'crossref_id')
        
        ### Pulling the crossreffed records
        ## Get the scaffold and genome positions for the KEGG crossreffing method
        ## Pull all KEGG Gene and Genome records
        LOG.info("Crossref method 1: Pulling and parsing crossreffed KEGG records")
        processed_hits_kegg = self.pull_and_parse_kegg_records(all_afdb_hits)
                
        ## Get the scaffold and genome positions for the GenPept crossreffing method
        ## Pull all GenPept records from ENA in EMBL format
        LOG.info("Crossref method 2 & 3: Pulling and parsing crossreffed GenPept crossrefs from ENA")
        processed_hits_genpept = self.pull_and_parse_genpept_records(all_afdb_hits)
        
        ## Gather everything
        processed_hits = processed_hits_kegg + processed_hits_genpept
        
        ### Get the latest version digit of all accession codes
        LOG.info('Adding latest version digits to accession codes')
        processed_hits = self.update_version_digits(processed_hits)
        
        ### Final update of the hit set
        all_afdb_hits = processed_hits
        LOG.info(f'{len(all_afdb_hits)} hits have been processed.')
        
        return all_afdb_hits

    


[docs]
    def run(self) -> None:
        """
        Execute the complete remote search workflow.
        
        Orchestrates all processing steps in sequence: running FoldSeek remotely via the webserver,
        parsing the results, cross-referencing using different sources, pulling the cross-referenced
        records, parsing the genomic neighbourhood coordinates from these records, and identifying 
        the gene clusters.
        
        Returns:
            None
        """
        
        LOG.info("STARTING PART 1: Executing FoldSeek search")
        self.run_foldseek()
        LOG.info('FINISHED PART 1')
        
        LOG.info("STARTING PART 2: Identifying hits in FoldSeek results")
        self.identify_hits()
        LOG.info('FINISHED PART 2')
        
        LOG.info('STARTING PART 2B: Fetching CDS coordinates via crossreffing')
        afdb_hits = self.crossref_afdb()
        self.hits = afdb_hits
        LOG.info("FINISHED PART 2B")
        
        LOG.info('STARTING PART 3: Identifying gene clusters')
        self.identify_clusters()
        LOG.info('FINISHED PART 3')
        
        return None