Get gene metadata
Retrieve gene metadata by NCBI Gene ID, gene symbol or RefSeq accession.
Get gene metadata
Gene metadata is available through the command line tool or by using Python or R.
For an overview of what metadata is available, see Datasets gene data report schemas.
Using NCBI Gene IDs
Get gene metadata by NCBI Gene ID.
datasets summary gene gene-id 1,2,3,9,10,11,12,13,14,15,16
To get started with the Python library, see the Datasets Python API reference documentation.
For more information on the API call, see the
gene_metadata_by_id()
method in the Datasets Python API reference documentation.
To see the fields in the returned V1GeneMatch
object, see the response object schema
here.
from typing import List
from ncbi.datasets.openapi import ApiClient as DatasetsApiClient
from ncbi.datasets.openapi import ApiException as DatasetsApiException
from ncbi.datasets import GeneApi as DatasetsGeneApi
from ncbi.datasets.openapi.models import V1GeneMatch
def print_gene_information(gene_record: V1GeneMatch):
    """Print a short report for one gene match.

    Prints the query value, then any warnings and errors attached to the
    match, then (when a gene is present) its taxname and symbol.

    Args:
        gene_record: One entry of the ``genes`` list in the API reply.
    """
    # print query value that returned this specific gene
    print("query", gene_record.query)

    # print any warnings or errors that occurred while executing the above
    # query, e.g. invalid identifier
    if gene_record.warnings:
        print(gene_record.warnings)
    if gene_record.errors:
        print(gene_record.errors)

    # print gene metadata fields
    if gene_record.gene:
        # use returned gene structure
        print(gene_record.gene.taxname)
        # another option is to convert the gene to a python dictionary and
        # get fields that way:
        gene_dictionary = gene_record.gene.to_dict()
        print(gene_dictionary["symbol"])
# Provide gene ids as a list of integers
gene_ids: List[int] = [1, 2, 3, 9, 10, 11, 12, 13, 14, 15, 16]

with DatasetsApiClient() as api_client:
    gene_api = DatasetsGeneApi(api_client)
    try:
        # Retrieve gene metadata for the list of gene ids
        gene_reply = gene_api.gene_metadata_by_id(gene_ids)
        # `or []` guards against a reply whose genes field is unset
        # (presumably None when nothing matched -- confirm against the API)
        for gene in gene_reply.genes or []:
            print_gene_information(gene)
    except DatasetsApiException as e:
        print(f"Exception when calling GeneApi: {e}\n")
# Create the gene API client object
api.gene_instance <- GeneApi$new()

# Look up the comma-separated gene IDs, passing the COMPLETE content option
# and the gene-id sort field
result_gene <- api.gene_instance$GeneMetadataById(
  '1,2,3,9,10,11,12,13,14,15,16',
  returned.content = 'COMPLETE',
  sort.schema.field = 'SORT_FIELD_GENE_ID'
)

# Show the whole reply as formatted JSON
prettify(result_gene$toJSONString())

# Print the gene id and symbol of every returned match
for (gene_match in result_gene$genes) {
  gene <- gene_match$gene
  cat(gene$gene_id, " - ", gene$symbol, "\n")
}
Using gene symbols
Get gene metadata by gene symbol.
datasets summary gene symbol ACRV1 A2M --taxon human
To get started with the Python library, see the Datasets Python API reference documentation.
For more information on the API call, see the
gene_metadata_by_tax_and_symbol()
method in the Datasets Python API reference documentation.
To see the fields in the returned V1GeneMatch
object, see the response object schema
here.
from typing import List
from ncbi.datasets.openapi import ApiClient as DatasetsApiClient
from ncbi.datasets.openapi import ApiException as DatasetsApiException
from ncbi.datasets import GeneApi as DatasetsGeneApi
from ncbi.datasets.openapi.models import V1GeneMatch
def print_gene_information(gene_record: V1GeneMatch):
    """Print a short report for one gene match, including its annotation release.

    Prints the query value, then any warnings and errors attached to the
    match, then (when a gene is present) its organism and, if available,
    the release name of its first annotation.

    Args:
        gene_record: One entry of the ``genes`` list in the API reply.
    """
    # print query value that returned this specific gene
    print("query", gene_record.query)

    # print any warnings or errors that occurred while executing the above
    # query, e.g. invalid identifier
    if gene_record.warnings:
        print(gene_record.warnings)
    if gene_record.errors:
        print(gene_record.errors)

    # print gene metadata fields
    if gene_record.gene:
        # get organism from returned gene structure
        print("organism: ", gene_record.gene.taxname)
        # another option is to convert the gene to a python dictionary and
        # get fields that way:
        gene_dictionary = gene_record.gene.to_dict()
        # print the annotation release name, if available; the key may be
        # absent, None or an empty list, so test truthiness before indexing
        annotations = gene_dictionary.get("annotations")
        if annotations:
            release_name = annotations[0].get("release_name")
            if release_name is not None:
                print(release_name)
# Provide 1 taxon - may be an NCBI taxid, scientific name or common name
taxon = "human"
# Provide gene identifiers as a list of gene symbols
gene_symbols: List[str] = ["A2M", "ACRV1"]
with DatasetsApiClient() as api_client:
    gene_api = DatasetsGeneApi(api_client)
    try:
        # For a single species retrieve gene metadata using a list of gene symbols
        gene_reply = gene_api.gene_metadata_by_tax_and_symbol(gene_symbols, taxon)
        # `or []` guards against a reply whose genes field is unset
        # (presumably None when nothing matched -- confirm against the API)
        for gene in gene_reply.genes or []:
            print_gene_information(gene)
    except DatasetsApiException as e:
        print(f"Exception when calling GeneApi: {e}\n")
Using transcript or protein accessions
Get gene metadata by RefSeq transcript or protein accession.
datasets summary gene accession NM_020107.5 NP_001334352.2
To get started with the Python library, see the Datasets Python API reference documentation.
For more information on the API call, see the
gene_metadata_by_accession()
method in the Datasets Python API reference documentation.
To see the fields in the returned V1GeneMatch
object, see the response object schema
here.
from typing import List
from ncbi.datasets.openapi import ApiClient as DatasetsApiClient
from ncbi.datasets.openapi import ApiException as DatasetsApiException
from ncbi.datasets import GeneApi as DatasetsGeneApi
from ncbi.datasets.openapi.models import V1GeneMatch
def print_gene_information(gene_record: V1GeneMatch):
    """Print a short report for one gene match, including its transcripts.

    Prints the query value, then any warnings and errors attached to the
    match, then (when a gene is present) its organism and the accession and
    length of every transcript listed for it.

    Args:
        gene_record: One entry of the ``genes`` list in the API reply.
    """
    # print query value that returned this specific gene
    print("query", gene_record.query)

    # print any warnings or errors that occurred while executing the above
    # query, e.g. invalid identifier
    if gene_record.warnings:
        print(gene_record.warnings)
    if gene_record.errors:
        print(gene_record.errors)

    # print gene metadata fields
    if gene_record.gene:
        # get organism from returned gene structure
        print("organism: ", gene_record.gene.taxname)
        # another option is to convert the gene to a python dictionary and
        # get fields that way
        gene_dictionary = gene_record.gene.to_dict()
        # Print the accession and length for all transcripts; the key may be
        # absent or None, so fall back to an empty list before iterating
        for transcript in gene_dictionary.get("transcripts") or []:
            print(
                "transcript accession: ",
                transcript.get("accession_version"),
                "length: ",
                transcript.get("length"),
            )
# Provide accessions as a list of strings
accessions: List[str] = ["NM_020107.5", "NP_001334352.2"]

with DatasetsApiClient() as api_client:
    gene_api = DatasetsGeneApi(api_client)
    try:
        # Retrieve metadata for genes with the given transcript and protein accessions
        gene_reply = gene_api.gene_metadata_by_accession(accessions)
        # `or []` guards against a reply whose genes field is unset
        # (presumably None when nothing matched -- confirm against the API)
        for gene in gene_reply.genes or []:
            print_gene_information(gene)
    except DatasetsApiException as e:
        print(f"Exception when calling GeneApi: {e}\n")