Download a genome data package

Download an NCBI Datasets Genome Data Package via the website, command line tool or programming languages

Download a genome data package

Download an NCBI Datasets Genome Data Package via the website, command line tool or programming languages

Download an NCBI Datasets Genome Data Package, including sequences, annotation and a detailed data report.

Genome data packages can be downloaded by NCBI Taxonomy ID or taxonomic name, NCBI Assembly accession, or NCBI BioProject accession.

This How-to guide works best for smaller downloads (< 5 animal genomes or < 500 prokaryote genomes). For larger downloads, try our How-to for large downloads.

Using a taxonomic name

  1. Start at the NCBI Datasets Genome page
  2. Click the name Homo sapiens in the list of popular species or type homo sapiens in the Taxonomic Name search box
  3. Select one or more genome assemblies
  4. Click Download
  5. Select the sequence and annotation files you want to download and name your file
  6. Click Download again

Run the following command to download a zip archive containing genome data:

datasets download genome taxon human --filename human_dataset.zip

To get started with the Python library, see the Datasets Python API reference documentation.

To download genomes for a taxonomic name, first retrieve all NCBI Assembly accessions for the selected organism using the get_assembly_metadata_by_taxon method from ncbi-datasets-pylib. Next, download the data package for those accessions using the download_assembly_package method. Lastly, open the zip file and extract the names of protein fasta files and some genome metadata using the AssemblyDataset class.

import sys
from typing import List

from ncbi.datasets.openapi import ApiClient as DatasetsApiClient
from ncbi.datasets.openapi import ApiException as DatasetsApiException
from ncbi.datasets.openapi import GenomeApi as DatasetsGenomeApi
from ncbi.datasets.metadata.genome import get_assembly_metadata_by_taxon

from ncbi.datasets.package import dataset


# Organism to look up and the local file that will receive the data package.
taxname = "pacific white shrimp"
zipfile_name = "shrimp.zip"

# The download call takes NCBI Assembly accessions, so collect the accessions
# reported for taxname first.
accessions: List[str] = [
    asm_rec.assembly.assembly_accession
    for asm_rec in get_assembly_metadata_by_taxon(taxname, returned_content="ASSM_ACC")
]
if not accessions:
    # Nothing to download: say so and exit with a non-zero status instead of
    # ending silently, so callers can tell "no genomes" from a finished run.
    sys.exit(f"No genomes found for {taxname}")

print(f"found {len(accessions)} genomes for {taxname}: ", accessions)

# Download an NCBI Datasets Genome Data Package given a list of NCBI accessions.
with DatasetsApiClient() as api_client:
    genome_api = DatasetsGenomeApi(api_client)
    try:
        print("Begin download of genome data package ...")
        # _preload_content=False hands back the raw response so that its bytes
        # can be written to disk unchanged as a zip archive.
        genome_ds_download = genome_api.download_assembly_package(
            accessions, include_annotation_type=["RNA_FASTA", "PROT_FASTA"], _preload_content=False
        )

        with open(zipfile_name, "wb") as f:
            f.write(genome_ds_download.data)
        print(f"Download completed -- see {zipfile_name}")
    except DatasetsApiException as e:
        # Report the API failure on stderr and stop with a non-zero status.
        sys.exit(f"Exception when calling download_assembly_package: {e}\n")

# Open the package zip archive so we can retrieve files from it.
package = dataset.AssemblyDataset(zipfile_name)
# Print the names and types of all files in the downloaded zip file.
print(package.get_catalog())

# Search by file type to get the names of all the protein fasta files in the package.
for file_name in package.get_file_names_by_type("PROTEIN_FASTA"):
    print(file_name)

# Get the data report and print the organism name and assembly level for each genome.
for report in package.get_data_reports():
    print(f"{report.organism_name}\t{report.assembly_info.assembly_level}")

Using an Assembly accession

Get the genome data package using an NCBI Assembly accession, for example for the human reference assembly, GRCh38.

  1. Visit the NCBI Assembly page
  2. Paste the NCBI Assembly Accession GCF_000001405.39 into the search box at the top of the page
  3. Click Search
  4. Find the desired assembly in the search results and click the Assembly name underlined in blue to go to the Assembly record
  5. In the column on the right side of the page, under Access the data, click NCBI Datasets

Next, download the selected genome.

  1. Select the genome assembly by clicking the checkbox in the row with the Assembly name
  2. Click Download
  3. Select the sequence and annotation files you want to download and name your file
  4. Click Download again

Run the following command to download a zip archive containing genome data:

datasets download genome accession GCF_000001405.39 --filename human_GRCh38_dataset.zip

To get started with the Python library, see the Datasets Python API reference documentation.

First download the data package for the selected NCBI Assembly accessions using the download_assembly_package method. Next, open the zip file and print the genomic fasta file names and some genome metadata using the AssemblyDataset class.

import sys
from typing import List

from ncbi.datasets.openapi import ApiClient as DatasetsApiClient
from ncbi.datasets.openapi import ApiException as DatasetsApiException
from ncbi.datasets.openapi import GenomeApi as DatasetsGenomeApi

from ncbi.datasets.package import dataset


# Assembly accession(s) to fetch and the local path for the downloaded archive.
accessions: List[str] = ["GCF_000001405.39"]
zipfile_name = "human_reference.zip"

# Request the genome data package for the accessions and save it as a zip file.
with DatasetsApiClient() as api_client:
    genome_api = DatasetsGenomeApi(api_client)
    print("Begin download of genome data package ...")
    try:
        response = genome_api.download_assembly_package(
            accessions,
            include_annotation_type=["RNA_FASTA", "PROT_FASTA"],
            _preload_content=False,
        )
        with open(zipfile_name, "wb") as zip_out:
            zip_out.write(response.data)
        print(f"Download completed -- see {zipfile_name}")
    except DatasetsApiException as err:
        sys.exit(f"Exception when calling download_assembly_package: {err}\n")

# Wrap the saved archive so its contents can be queried.
package = dataset.AssemblyDataset(zipfile_name)
# Show the catalog of files (names and types) contained in the archive.
print(package.get_catalog())

# List every genomic nucleotide fasta file found in the package.
for fasta_name in package.get_file_names_by_type("GENOMIC_NUCLEOTIDE_FASTA"):
    print(fasta_name)

# One line per genome: organism name and assembly level, tab-separated.
for genome_report in package.get_data_reports():
    organism = genome_report.organism_name
    level = genome_report.assembly_info.assembly_level
    print(f"{organism}\t{level}")
For more information, see the Datasets R API reference documentation.
# Create a GenomeApi client object.
api.genome_instance <- GenomeApi$new()
# Request the genome data package for Assembly accession GCF_000001405.39.
# NOTE(review): 'filename' is presumably the local path the zip is saved to --
# confirm against the Datasets R API reference.
result_genome <- api.genome_instance$DownloadAssemblyPackage(accessions='GCF_000001405.39', filename='grch38.zip')

Using BioProject accession

Get data for genome assemblies belonging to an NCBI BioProject, for example, the Sanger 25 Genomes Project, PRJEB33226.
  1. Start at the NCBI Datasets Homepage
  2. Enter a BioProject accession, for example PRJEB33226, into the search box at the top of the page
  3. Click Search
  4. In the BioProject box, click browse a table of Genomes for this project
  5. Select one or more genome assemblies
  6. Click Download
  7. Select the sequence and annotation files you want to download and name your file
  8. Click Download again
datasets download genome accession PRJEB33226 --filename sanger_bioproject_dataset.zip

To get started with the Python library, see the Datasets Python API reference documentation.

To download genomes for one or more NCBI BioProjects, first retrieve all NCBI Assembly accessions for the selected BioProjects using the get_assembly_metadata_by_bioproject_accessions method from ncbi-datasets-pylib. Next, download the data package for those accessions using the download_assembly_package method. Lastly, open the zip file and print the genomic fasta file names and some genome metadata using the AssemblyDataset class.

import sys
from typing import List

from ncbi.datasets.openapi import ApiClient as DatasetsApiClient
from ncbi.datasets.openapi import ApiException as DatasetsApiException
from ncbi.datasets.openapi import GenomeApi as DatasetsGenomeApi
from ncbi.datasets.metadata.genome import get_assembly_metadata_by_bioproject_accessions

from ncbi.datasets.package import dataset


# BioProject accession(s) to look up and the local file for the data package.
bioprojects: List[str] = ["PRJEB33226"]
zipfile_name = "PRJEB33226.zip"

# The download call takes NCBI Assembly accessions, so collect the accessions
# reported for the bioprojects first.
accessions = [
    asm_rec.assembly.assembly_accession
    for asm_rec in get_assembly_metadata_by_bioproject_accessions(bioprojects, returned_content="ASSM_ACC")
]
if not accessions:
    # Nothing to download: say so and exit with a non-zero status instead of
    # ending silently, so callers can tell "no genomes" from a finished run.
    sys.exit(f"No genomes found for bioprojects {bioprojects}")

print(f"found {len(accessions)} genomes for bioprojects {bioprojects}: ", accessions)

# Download an NCBI Datasets Genome Data Package given a list of NCBI accessions.
with DatasetsApiClient() as api_client:
    genome_api = DatasetsGenomeApi(api_client)
    try:
        print("Begin download of genome data package ...")
        # _preload_content=False hands back the raw response so that its bytes
        # can be written to disk unchanged as a zip archive.
        genome_ds_download = genome_api.download_assembly_package(
            accessions, include_annotation_type=["RNA_FASTA", "PROT_FASTA"], _preload_content=False
        )

        with open(zipfile_name, "wb") as f:
            f.write(genome_ds_download.data)
        print(f"Download completed -- see {zipfile_name}")
    except DatasetsApiException as e:
        # Report the API failure on stderr and stop with a non-zero status.
        sys.exit(f"Exception when calling download_assembly_package: {e}\n")

# Open the package zip archive so we can retrieve files from it.
package = dataset.AssemblyDataset(zipfile_name)
# Print the names and types of all files in the downloaded zip file.
print(package.get_catalog())

# Search by file type to get the names of all the genomic fasta files in the
# package, as the accompanying text describes and the other examples do.
for file_name in package.get_file_names_by_type("GENOMIC_NUCLEOTIDE_FASTA"):
    print(file_name)

# Get the data report and print the organism name and assembly level for each genome.
for report in package.get_data_reports():
    print(f"{report.organism_name}\t{report.assembly_info.assembly_level}")

Filtering by genome assembly properties

When downloading a genome data package by either taxon, Assembly or BioProject accession, you can filter the results by different genome assembly properties, including the following:

  • reference status
  • annotation status
  • assembly level
  • year released
  • infraspecies name
  • assembly name
  • submitter name
  1. Start at the NCBI Datasets Genome page
  2. Click the name Homo sapiens in the list of popular species or type homo sapiens in the Taxonomic Name search box and click the species name
  3. Expand the Filters box
  4. To filter by reference status, annotation status, assembly level or year released, use the appropriate slider or switch.
  5. To filter by infraspecies name, assembly name or submitter name, enter the term into the Text Filter box.
  6. Select one or more genome assemblies
  7. Click Download
  8. Select the sequence and annotation files you want to download and name your file
  9. Click Download again

Get data for the human reference genome:

datasets download genome taxon human --reference
Get data for annotated human genomes:
datasets download genome taxon human --annotated
Get data for human genomes with the Assembly level of "complete genome" (all chromosomes are gapless):
datasets download genome taxon human --assembly-level complete_genome
Get data for human genomes released after January 1, 2020:
datasets download genome taxon human --released-since 01/01/2020
Get data for human genomes submitted by the T2T Consortium:
datasets download genome taxon human --search 'T2T Consortium'

To get started with the Python library, see the Datasets Python API reference documentation.

All of the genome metadata retrieval functions support filtering, but for this example use the get_assembly_metadata_by_taxon method from ncbi-datasets-pylib to get filtered NCBI Assembly accessions. Next, download the data package for the selected accessions using the download_assembly_package method. Lastly, open the zip file and print the genomic fasta file names and some genome metadata using the AssemblyDataset class.

import sys
from typing import List

from ncbi.datasets.openapi import ApiClient as DatasetsApiClient
from ncbi.datasets.openapi import ApiException as DatasetsApiException
from ncbi.datasets.openapi import GenomeApi as DatasetsGenomeApi

from ncbi.datasets.package import dataset

from ncbi.datasets.metadata.genome import get_assembly_metadata_by_taxon

# Organism to look up and the local file that will receive the data package.
taxon_name = "Zea mays"
zipfile_name = "zea_mays.zip"


# Collect the Assembly accessions for taxon_name, passing the search terms
# "B73" and "Maize Genome Sequencing Project" as text filters.
accessions: List[str] = [
    asm_rec.assembly.assembly_accession
    for asm_rec in get_assembly_metadata_by_taxon(
        taxon_name, filters_search_text=["B73", "Maize Genome Sequencing Project"], returned_content="ASSM_ACC"
    )
]
print(accessions)

if not accessions:
    # Nothing matched the filters: say so and exit with a non-zero status
    # instead of ending silently.
    sys.exit(f"No genomes found for {taxon_name} with the given filters")


# Download an NCBI Datasets Genome Data Package given a list of NCBI Assembly accessions.
with DatasetsApiClient() as api_client:
    genome_api = DatasetsGenomeApi(api_client)
    try:
        print("Begin download of genome data package ...")
        # _preload_content=False hands back the raw response so that its bytes
        # can be written to disk unchanged as a zip archive.
        genome_ds_download = genome_api.download_assembly_package(
            accessions, include_annotation_type=["RNA_FASTA", "PROT_FASTA"], _preload_content=False
        )

        with open(zipfile_name, "wb") as f:
            f.write(genome_ds_download.data)
        print(f"Download completed -- see {zipfile_name}")
    except DatasetsApiException as e:
        # Report the API failure on stderr and stop with a non-zero status.
        sys.exit(f"Exception when calling download_assembly_package: {e}\n")

# Open the package zip archive so we can retrieve files from it.
package = dataset.AssemblyDataset(zipfile_name)
# Print the names and types of all files in the downloaded zip file.
print(package.get_catalog())

# Search by file type to get the names of all the genomic fasta files in the package.
for file_name in package.get_file_names_by_type("GENOMIC_NUCLEOTIDE_FASTA"):
    print(file_name)

# Get the data report and print the organism name and assembly level for each genome.
for report in package.get_data_reports():
    print(f"{report.organism_name}\t{report.assembly_info.assembly_level}")
Generated October 22, 2021