Source code for cfoldseeker.remote_parsers

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re
import io
import logging
from Bio import SeqIO

LOG = logging.getLogger(__name__)


def extract_genomic_information_kegg(gene_entry: str) -> dict:
    """
    Extracts the genomic information from a pulled KEGG Gene record.

    Only the first line that *starts with* ``POSITION`` is inspected. Its
    payload is expected at column 12 onwards (the slice below relies on that
    fixed-width layout).

    Args:
        gene_entry: Flat-file text of a KEGG Gene record.

    Returns:
        An empty dict when there is no POSITION line or when that line holds
        no ``start..end`` coordinate pair. Otherwise a dict with the keys:

        * ``'scaffold'``: text between column 12 and the last ``:`` of the
          POSITION line, or ``''`` when the line has no ``:``.
        * ``'coords'``: list of ``[start, end]`` integer pairs, one per
          ``start..end`` group found on the line.
        * ``'strand'``: ``'-'`` if the location contains ``complement``,
          else ``'+'``.
    """
    position_info = {}

    # Genomic positions are at the POSITION line. Anchor on the field name at
    # the start of the line so free text elsewhere in the record that merely
    # contains "POSITION" cannot be picked up by mistake.
    position_line = next(
        (line for line in gene_entry.split('\n') if line.startswith('POSITION')),
        None,
    )
    if position_line is None:
        return position_info

    # If there is a scaffold mentioned, it precedes the location and is
    # separated from it by ':'. Splitting on the LAST colon keeps scaffold IDs
    # that themselves contain a colon intact instead of failing to unpack.
    # Without a colon the scaffold is left empty; the downstream processing
    # will handle this.
    head, separator, coords = position_line.rpartition(':')
    internal_scaffold_id = head[12:] if separator else ''

    # Extract the coordinates of all exons, ignoring indefinite boundaries
    # (the '<' and '>' markers are simply dropped).
    coords = coords.translate(str.maketrans('', '', '<>'))
    coord_groups = [
        [int(start), int(end)]
        for start, end in re.findall(r'(\d+)\.\.(\d+)', coords)
    ]

    # If no coordinate information, return the response dictionary empty
    if not coord_groups:
        return position_info

    # Gather the results
    position_info['scaffold'] = internal_scaffold_id
    position_info['coords'] = coord_groups
    position_info['strand'] = '-' if 'complement' in coords else '+'

    return position_info
def _expand_kegg_field(lines: list, keyword: str) -> list:
    """
    Collects the payload lines of the first field whose line contains *keyword*.

    The field consists of the matching line plus every directly following line
    that starts with a space (continuation lines). The first 12 characters of
    each line (the field-name column) are stripped.

    Returns an empty list when no line contains *keyword*.
    """
    start = next((idx for idx, line in enumerate(lines) if keyword in line), None)
    if start is None:
        return []

    field = [lines[start][12:]]
    index = start + 1
    # Bounds are checked on the line about to be read, so a field that runs up
    # to the very last line of the entry no longer raises IndexError.
    while index < len(lines) and lines[index].startswith(' '):
        field.append(lines[index][12:])
        index += 1
    return field


def _parse_kegg_field(field_lines: list) -> list:
    """
    Turns field payload lines into ``(internal_id, external_id)`` pairs.

    The internal ID is the text before the first ``;`` or whitespace. The
    external ID is the text after the first ``:`` with its final character
    dropped (presumably a closing bracket -- confirm against the KEGG format).
    A payload line without ``:`` raises ``IndexError``.
    """
    return [
        (re.split(r'[;\s]', entry)[0], entry.split(':')[1][:-1])
        for entry in field_lines
    ]


def extract_scaffold_mapping_kegg(genome_entry: str) -> dict:
    """
    Maps all KEGG scaffold IDs for a Genome entry to the associated
    GenBank/RefSeq IDs.

    Args:
        genome_entry: Flat-file text of a KEGG Genome record.

    Returns:
        A dict from KEGG-internal scaffold/plasmid ID to the external sequence
        ID parsed from the CHROMOSOME and PLASMID fields. A chromosome whose
        internal ID reads ``Circular`` is stored under the empty-string key.
        Plasmid entries override chromosome entries that share a key. Fields
        that are absent contribute nothing.

    Raises:
        IndexError: if a line of either field contains no ``:``.
    """
    lines = genome_entry.split('\n')

    ## First the CHROMOSOME field
    # An unnamed chromosome is listed as 'Circular'; it gets the empty ID.
    mapping_scaffolds = {
        ('' if internal_id == 'Circular' else internal_id): external_id
        for internal_id, external_id
        in _parse_kegg_field(_expand_kegg_field(lines, 'CHROMOSOME'))
    }

    ## Then the PLASMIDS field
    mapping_plasmids = dict(_parse_kegg_field(_expand_kegg_field(lines, 'PLASMID')))

    ## Wrap it in a dictionary
    return mapping_scaffolds | mapping_plasmids
def extract_genomic_information_ena(record: str) -> dict:
    """
    Extracts the genomic information from a pulled ENA GenPept record.

    Only the first sequence record in the text, and the first ``CDS`` feature
    of that record, are used.

    Args:
        record: EMBL-formatted text of the record, or ``None``.

    Returns:
        ``None`` when *record* is ``None`` or empty, when it contains no
        parsable sequence record, or when the first record has no CDS feature.
        Otherwise a dict with the keys:

        * ``'coords'``: list of ``[start, end]`` integer pairs, one per part
          of the CDS location, with the start shifted by +1.
        * ``'strand'``: ``'+'`` for strand 1, ``'-'`` for strand -1, and the
          raw strand value as reported by the parser in any other case.
        * ``'scaffold'``: the ``ref`` attribute of the first location part.
    """
    # catch for empty or bad results
    if not record:
        return None

    # Only the first record / first CDS is needed, so pull them lazily
    # instead of materialising every record and every feature.
    seq_record = next(SeqIO.parse(io.StringIO(record), format='embl'), None)
    if seq_record is None:
        return None

    cds = next((f for f in seq_record.features if f.type == 'CDS'), None)
    if cds is None:
        return None

    # Genome coordinates
    parts = cds.location.parts
    # start coordinate is one off in BioPython parsing
    coord_groups = [[int(p.start) + 1, int(p.end)] for p in parts]

    # Scaffold: taken from the first part so the result is deterministic even
    # if the parts were to reference different sequences.
    scaffold = parts[0].ref

    # Strand: translate the numeric encoding, pass anything else through as is
    raw_strand = cds.location.strand
    strand = {1: '+', -1: '-'}.get(raw_strand, raw_strand)

    # collect
    position_info = {}
    position_info['coords'] = coord_groups
    position_info['strand'] = strand
    position_info['scaffold'] = scaffold

    return position_info