Download a genome data package

Download an NCBI Datasets Genome Data Package via the command line tool or programming languages

Download a genome data package

Download an NCBI Datasets Genome Data Package via the command line tool or programming languages

Download an NCBI Datasets Genome Data Package, including sequences, annotation and a detailed data report.

Genome data packages can be downloaded by NCBI Taxonomy ID or taxonomic name, NCBI Assembly accession, or NCBI BioProject accession.

This How-to guide works best for smaller downloads (< 5 animal genomes or < 500 prokaryote genomes). For larger downloads, try our How-to for large downloads.

Using a taxonomic name

Run the following command to download a zip archive containing genome data:

datasets download genome taxon human --filename human_dataset.zip

To get started with the Python library, see the Datasets Python API reference documentation.

To download genomes for a taxonomic name, first retrieve all NCBI Assembly accessions for the selected organism using the get_assembly_metadata_by_taxon method from ncbi-datasets-pylib. Next, download the data package for those accessions using the download_assembly_package method. Lastly, open the zip file and extract the names of protein fasta files and some genome metadata using the AssemblyDataset class.

import sys
from typing import List

from ncbi.datasets.openapi import ApiClient as DatasetsApiClient
from ncbi.datasets.openapi import ApiException as DatasetsApiException
from ncbi.datasets import GenomeApi as DatasetsGenomeApi
from ncbi.datasets.metadata.genome import get_assembly_metadata_by_taxon

from ncbi.datasets.package import dataset


# Taxonomic name to search for, and the local path the data package is saved to.
taxname = "pacific white shrimp"
zipfile_name = "shrimp.zip"

# The download call takes NCBI genome assembly accessions, so look up the
# accessions for taxname first.
accessions: List[str] = [
    asm_rec.assembly.assembly_accession
    for asm_rec in get_assembly_metadata_by_taxon(taxname, returned_content="ASSM_ACC")
]
if not accessions:
    # Nothing to download: say why and exit with a non-zero status rather than
    # stopping silently with a success status.
    sys.exit(f"No genomes found for {taxname}")

print(f"found {len(accessions)} genomes for {taxname}: ", accessions)

# download an NCBI Datasets Genome Data Package given a list of NCBI accessions
with DatasetsApiClient() as api_client:
    genome_api = DatasetsGenomeApi(api_client)
    try:
        print("Begin download of genome data package ...")
        genome_ds_download = genome_api.download_assembly_package(
            accessions,
            include_annotation_type=["RNA_FASTA", "PROT_FASTA"],
            _preload_content=False,
        )

        # The bytes exposed by the response's .data attribute are written
        # unchanged to the local zip file.
        with open(zipfile_name, "wb") as f:
            f.write(genome_ds_download.data)
        print(f"Download completed -- see {zipfile_name}")
    except DatasetsApiException as e:
        sys.exit(f"Exception when calling download_assembly_package: {e}\n")

# open the package zip archive so we can retrieve files from it
package = dataset.AssemblyDataset(zipfile_name)
# print the names and types of all files in the downloaded zip file
print(package.get_catalog())

# search by file type to get the names of all the protein fasta files in the package
for file_name in package.get_file_names_by_type("PROTEIN_FASTA"):
    print(file_name)

# get the data report and print the organism name and assembly level for each genome
for report in package.get_data_reports():
    print(f"{report.organism_name}\t{report.assembly_info.assembly_level}")

Using an Assembly accession

Get the genome data package using an NCBI Assembly accession, for example for the human reference assembly, GRCh38.

Run the following command to download a zip archive containing genome data:

datasets download genome accession GCF_000001405.40 --filename human_GRCh38_dataset.zip

To get started with the Python library, see the Datasets Python API reference documentation.

First download the data package for the selected NCBI Assembly accessions using the download_assembly_package method. Next, open the zip file and print the genomic fasta file names and some genome metadata using the AssemblyDataset class.

import sys
from typing import List

from ncbi.datasets.openapi import ApiClient as DatasetsApiClient
from ncbi.datasets.openapi import ApiException as DatasetsApiException
from ncbi.datasets import GenomeApi as DatasetsGenomeApi

from ncbi.datasets.package import dataset


# Assembly accession(s) to fetch and the local path the data package is saved to.
accessions: List[str] = ["GCF_000001405.40"]
zipfile_name = "human_reference.zip"

# Request the genome data package for the listed NCBI Assembly accessions and
# save the response bytes to zipfile_name.
with DatasetsApiClient() as api_client:
    genome_api = DatasetsGenomeApi(api_client)
    try:
        print("Begin download of genome data package ...")
        response = genome_api.download_assembly_package(
            accessions,
            include_annotation_type=["RNA_FASTA", "PROT_FASTA"],
            _preload_content=False,
        )
        with open(zipfile_name, "wb") as zip_out:
            zip_out.write(response.data)
        print(f"Download completed -- see {zipfile_name}")
    except DatasetsApiException as api_error:
        # Any API failure ends the script with the error text as the exit message.
        sys.exit(f"Exception when calling download_assembly_package: {api_error}\n")

# Open the saved zip archive so its contents can be inspected.
package = dataset.AssemblyDataset(zipfile_name)

# Show the catalog of the package (names and types of the files in the zip).
print(package.get_catalog())

# List every genomic FASTA file in the package, one name per line.
for fasta_name in package.get_file_names_by_type("GENOMIC_NUCLEOTIDE_FASTA"):
    print(fasta_name)

# For each genome's data report, print organism name and assembly level, tab-separated.
for data_report in package.get_data_reports():
    organism = data_report.organism_name
    level = data_report.assembly_info.assembly_level
    print(f"{organism}\t{level}")
For more information, see the Datasets R API reference documentation.
# Create the genome API client object.
genome_client <- GenomeApi$new()
# Request the data package for assembly GCF_000001405.40.
# NOTE(review): filename is presumably the local path the zip is written to -- confirm
# against the R API reference.
result_genome <- genome_client$DownloadAssemblyPackage(
  accessions = 'GCF_000001405.40',
  filename = 'grch38.zip'
)

Using BioProject accession

Get data for genome assemblies belonging to an NCBI BioProject, for example, the Sanger 25 Genomes Project, PRJEB33226.
datasets download genome accession PRJEB33226 --filename sanger_bioproject_dataset.zip

To get started with the Python library, see the Datasets Python API reference documentation.

To download genomes for one or more NCBI BioProjects, first retrieve all NCBI Assembly accessions for the selected BioProjects using the get_assembly_metadata_by_bioproject_accessions method from ncbi-datasets-pylib. Next, download the data package for those accessions using the download_assembly_package method. Lastly, open the zip file and print the genomic fasta file names and some genome metadata using the AssemblyDataset class.

import sys
from typing import List

from ncbi.datasets.openapi import ApiClient as DatasetsApiClient
from ncbi.datasets.openapi import ApiException as DatasetsApiException
from ncbi.datasets import GenomeApi as DatasetsGenomeApi
from ncbi.datasets.metadata.genome import get_assembly_metadata_by_bioproject_accessions

from ncbi.datasets.package import dataset


# BioProject accession(s) to search, and the local path the data package is saved to.
bioprojects: List[str] = ["PRJEB33226"]
zipfile_name = "PRJEB33226.zip"

# The download call takes NCBI genome assembly accessions, so look up the
# accessions belonging to the selected BioProjects first.
accessions: List[str] = [
    asm_rec.assembly.assembly_accession
    for asm_rec in get_assembly_metadata_by_bioproject_accessions(bioprojects, returned_content="ASSM_ACC")
]
if not accessions:
    # Nothing to download: say why and exit with a non-zero status rather than
    # stopping silently with a success status.
    sys.exit(f"No genomes found for bioprojects {bioprojects}")

print(f"found {len(accessions)} genomes for bioprojects {bioprojects}: ", accessions)

# download an NCBI Datasets Genome Data Package given a list of NCBI accessions
with DatasetsApiClient() as api_client:
    genome_api = DatasetsGenomeApi(api_client)
    try:
        print("Begin download of genome data package ...")
        genome_ds_download = genome_api.download_assembly_package(
            accessions,
            include_annotation_type=["RNA_FASTA", "PROT_FASTA"],
            _preload_content=False,
        )

        # The bytes exposed by the response's .data attribute are written
        # unchanged to the local zip file.
        with open(zipfile_name, "wb") as f:
            f.write(genome_ds_download.data)
        print(f"Download completed -- see {zipfile_name}")
    except DatasetsApiException as e:
        sys.exit(f"Exception when calling download_assembly_package: {e}\n")

# open the package zip archive so we can retrieve files from it
package = dataset.AssemblyDataset(zipfile_name)
# print the names and types of all files in the downloaded zip file
print(package.get_catalog())

# search by file type to get the names of all the genomic fasta files in the package
# (the accompanying text promises this listing; the other examples print it the same way)
for file_name in package.get_file_names_by_type("GENOMIC_NUCLEOTIDE_FASTA"):
    print(file_name)

# get the data report and print the organism name and assembly level for each genome
for report in package.get_data_reports():
    print(f"{report.organism_name}\t{report.assembly_info.assembly_level}")

Filtering by genome assembly properties

When downloading a genome data package by taxon, Assembly accession, or BioProject accession, you can filter the results by different genome assembly properties, including the following:

  • reference status
  • annotation status
  • assembly level
  • year released
  • infraspecies name
  • assembly name
  • submitter name

Get data for the human reference genome:

datasets download genome taxon human --reference
Get data for annotated human genomes:
datasets download genome taxon human --annotated
Get data for human genomes with the Assembly level of "complete genome" (all chromosomes are gapless):
datasets download genome taxon human --assembly-level complete_genome
Get data for human genomes released after January 1, 2020:
datasets download genome taxon human --released-since 01/01/2020
Get data for human genomes submitted by the T2T Consortium:
datasets download genome taxon human --search 'T2T Consortium'

To get started with the Python library, see the Datasets Python API reference documentation.

All of the genome metadata retrieval functions support filtering, but for this example use the get_assembly_metadata_by_taxon method from ncbi-datasets-pylib to get filtered NCBI Assembly accessions. Next, download the data package for the selected accessions using the download_assembly_package method. Lastly, open the zip file and print the genomic fasta file names and some genome metadata using the AssemblyDataset class.

import sys
from typing import List

from ncbi.datasets.openapi import ApiClient as DatasetsApiClient
from ncbi.datasets.openapi import ApiException as DatasetsApiException
from ncbi.datasets import GenomeApi as DatasetsGenomeApi

from ncbi.datasets.package import dataset

from ncbi.datasets.metadata.genome import get_assembly_metadata_by_taxon

# Taxonomic name to search for, and the local path the data package is saved to.
taxon_name = "Zea mays"
zipfile_name = "zea_mays.zip"


# Look up assembly accessions for taxon_name, passing search-text filters so
# that only a filtered set of accessions is returned.
accessions: List[str] = [
    asm_rec.assembly.assembly_accession
    for asm_rec in get_assembly_metadata_by_taxon(
        taxon_name,
        filters_search_text=["B73", "Maize Genome Sequencing Project"],
        returned_content="ASSM_ACC",
    )
]
print(accessions)

if not accessions:
    # Nothing matched the filters: say why and exit with a non-zero status
    # rather than stopping silently with a success status.
    sys.exit(f"No genomes found for {taxon_name} with the given filters")


# download an NCBI Datasets Genome Data Package given a list of NCBI Assembly accessions
with DatasetsApiClient() as api_client:
    genome_api = DatasetsGenomeApi(api_client)
    try:
        print("Begin download of genome data package ...")
        genome_ds_download = genome_api.download_assembly_package(
            accessions,
            include_annotation_type=["RNA_FASTA", "PROT_FASTA"],
            _preload_content=False,
        )

        # The bytes exposed by the response's .data attribute are written
        # unchanged to the local zip file.
        with open(zipfile_name, "wb") as f:
            f.write(genome_ds_download.data)
        print(f"Download completed -- see {zipfile_name}")
    except DatasetsApiException as e:
        sys.exit(f"Exception when calling download_assembly_package: {e}\n")

# open the package zip archive so we can retrieve files from it
package = dataset.AssemblyDataset(zipfile_name)
# print the names and types of all files in the downloaded zip file
print(package.get_catalog())

# search by file type to get the names of all the genomic fasta files in the package
for file_name in package.get_file_names_by_type("GENOMIC_NUCLEOTIDE_FASTA"):
    print(file_name)

# get the data report and print the organism name and assembly level for each genome
for report in package.get_data_reports():
    print(f"{report.organism_name}\t{report.assembly_info.assembly_level}")
Generated December 6, 2022