Get one representative protein sequence for all human ACE2 orthologs

Each ortholog gene may have many transcripts. Here we describe how you can filter the resulting metadata to only the longest transcript.

Get one representative protein sequence for all human ACE2 orthologs

Each ortholog gene may have many transcripts. Here we describe how you can filter the resulting metadata to only the longest transcript.

Advanced filtering of metadata should be done in a programming language, such as python.

# [START ortholog_longest_transcript]
from typing import Iterator, List

from ncbi.datasets.openapi import ApiClient as DatasetsApiClient
from ncbi.datasets.openapi import ApiException as DatasetsApiException
from ncbi.datasets.openapi import GeneApi as DatasetsGeneApi

input_symbol = "ACE2"
input_taxon = "human"


def find_longest_transcript(gene):
    if not gene.transcripts:
        return None
    num_trs = len(gene.transcripts)
    if num_trs == 0:
        return None
    if num_trs == 1:
        return gene.transcripts[0]

    def get_tr_length(tr):
        return tr.length

    tr_by_len = sorted(gene.transcripts, key=get_tr_length, reverse=True)
    maxlen = get_tr_length(tr_by_len[0])
    longest = []
    for tr in tr_by_len:
        trlen = get_tr_length(tr)
        if trlen < maxlen:
            break
        longest.append(tr)
    if len(longest) == 1:
        return longest[0]
    # select ealiest accession.version (oldest sequence)
    return sorted(longest, key=lambda tr: tr.accession_version)[0]


def _genes_for_ortholog(ortholog_set) -> Iterator:
    for gene_match in ortholog_set.genes.genes:
        if gene_match.messages:
            for m in gene_match.messages:
                if m.error:
                    print(f"error: [{m.error.reason}] {m.error.message}")
                if m.warning:
                    print(f"warning: [{m.warning.reason}] {m.warning.message}")
        if gene_match.warnings:
            for m in gene_match.warnings:
                print(f"warning: [{m.reason}] {m.message}")
        if gene_match.gene:
            yield gene_match.gene


def retrieve_orthologs_by_gene_id(gene_api: DatasetsGeneApi, gene_id: int):
    ortholog_set = gene_api.gene_orthologs_by_id(gene_id)
    for gene in _genes_for_ortholog(ortholog_set):
        longest_tr = find_longest_transcript(gene)
        print(
            {
                "gene_id": gene.gene_id,
                "symbol": gene.symbol,
                "tr-accession": longest_tr.accession_version,
                "tr-name": longest_tr.name,
                "tr-length": longest_tr.length,
                "prot-accession": longest_tr.protein.accession_version,
                "prot-name": longest_tr.protein.name,
                "prot-length": longest_tr.protein.length,
            }
        )


def gene_ids_for_symbol_and_taxon(gene_api: DatasetsGeneApi, symbols: List[str], taxon: str) -> List[int]:
    """Return a list of gene-ids for a set of symbols and taxon"""
    gene_ids_for_symbols: List[int] = []
    try:
        gene_reply = gene_api.gene_metadata_by_tax_and_symbol(
            symbols=symbols,
            taxon=taxon,
        )
        for gene in gene_reply.genes:
            gene_ids_for_symbols.append(gene.gene.gene_id)
    except DatasetsApiException as e:
        print(f"Exception when calling GeneApi: {e}\n")
    return gene_ids_for_symbols


def retrieve_orthologs_for_symbol(symbol: str, taxon: str):
    with DatasetsApiClient() as api_client:
        gene_api = DatasetsGeneApi(api_client)
        try:
            gene_ids = gene_ids_for_symbol_and_taxon(gene_api, [symbol], taxon)
            if len(gene_ids) != 1:
                raise Exception(f"Unexpected number of gene-ids: {gene_ids}")

            retrieve_orthologs_by_gene_id(gene_api, gene_ids[0])
        except DatasetsApiException as e:
            print(f"Exception when calling GeneApi: {e}\n")


# [END ortholog_longest_transcript]


if __name__ == "__main__":
    retrieve_orthologs_for_symbol(symbol=input_symbol, taxon=input_taxon)
Generated October 22, 2021