942 lines
57 KiB
XML
942 lines
57 KiB
XML
<?xml version="1.0" encoding="utf-8"?>
|
|
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
|
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
|
|
|
<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
|
<!-- AppResources meta begin -->
|
|
<meta name="paf-app-resources" content="" />
|
|
<!-- AppResources meta end -->
|
|
|
|
<!-- TemplateResources meta begin -->
|
|
<meta name="paf_template" content="StdNCol" />
|
|
|
|
<!-- TemplateResources meta end -->
|
|
|
|
<!-- Page meta begin -->
|
|
|
|
<!-- Page meta end -->
|
|
|
|
<!-- Logger begin -->
|
|
<meta xmlns:ncbi-portal="http://ncbi.gov/portal/XSLT/namespace" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" name="ncbi_app" content="genbank" /><meta xmlns:ncbi-portal="http://ncbi.gov/portal/XSLT/namespace" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" name="ncbi_pdid" content="custom-page" />
|
|
<!-- Logger end -->
|
|
|
|
<title>Annotating Genomes with GFF3 or GTF files</title>
|
|
|
|
<!-- PageFixtures headcontent begin -->
|
|
|
|
<meta name="cms-local-nav-url" content="https://cms.ncbi.nlm.nih.gov//genbank/_nav" />
|
|
|
|
<!-- PageFixtures headcontent end -->
|
|
|
|
<!-- AppResources external_resources begin -->
|
|
<script type="text/javascript" src="/core/jig/1.15.6/js/jig.min.js"></script>
|
|
|
|
<!-- AppResources external_resources end -->
|
|
|
|
<!-- Page headcontent begin -->
|
|
<meta name="subsite" content="genbank" />
|
|
<meta name="path" content="genbank/genomes_gff" />
|
|
<meta name="modified" content="2024-03-08T13:58:42Z" /><meta xmlns:ncbi-portal="http://ncbi.gov/portal/XSLT/namespace" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" name="cms-edit-aux-url" content="http://cms.ncbi.nlm.nih.gov/node//edit" />
|
|
<!-- Page headcontent end -->
|
|
<!-- PageFixtures resources begin -->
|
|
<link xmlns="http://www.w3.org/1999/xhtml" type="text/css" rel="stylesheet" href="//static.pubmed.gov/portal/portal3rc.fcgi/4218191/css/4207974/4206132.css" xml:base="http://127.0.0.1/sites/static/header_footer" />
|
|
|
|
<!-- PageFixtures resources end -->
|
|
<link rel="shortcut icon" href="//www.ncbi.nlm.nih.gov/favicon.ico" /><meta name="ncbi_phid" content="CE8B25027C7F008100000000011300E0.m_5" />
|
|
<meta name='referrer' content='origin-when-cross-origin'/><link type="text/css" rel="stylesheet" href="//static.pubmed.gov/portal/portal3rc.fcgi/4218137/css/4121862/3974050/3917732/251717/4108189/14534/45193/3534283/4128070/3407145/4005757/4062871.css" /><link type="text/css" rel="stylesheet" href="//static.pubmed.gov/portal/portal3rc.fcgi/4218137/css/3529741/3529739.css" media="print" /></head>
|
|
<body class=" col2 custom-page">
|
|
<div class="grid">
|
|
<div class="col twelve_col nomargin shadow">
|
|
<!-- System messages like service outage or JS required; this is handled by the TemplateResources portlet -->
|
|
<div class="sysmessages">
|
|
<noscript>
|
|
<p class="nojs">
|
|
<strong>Warning:</strong>
|
|
The NCBI web site requires JavaScript to function.
|
|
<a href="/guide/browsers/#enablejs" title="Learn how to enable JavaScript" target="_blank">more...</a>
|
|
</p>
|
|
</noscript>
|
|
</div>
|
|
<!--/.sysmessage-->
|
|
<div class="wrap">
|
|
<div class="page">
|
|
<div xmlns:xi="http://www.w3.org/2001/XInclude">
|
|
<div xmlns="http://www.w3.org/1999/xhtml" id="universal_header" xml:base="http://127.0.0.1/sites/static/header_footer">
|
|
<section class="usa-banner">
|
|
<div class="usa-accordion">
|
|
<header class="usa-banner-header">
|
|
<div class="usa-grid usa-banner-inner">
|
|
<img src="https://www.ncbi.nlm.nih.gov/coreutils/uswds/img/favicons/favicon-57.png" alt="U.S. flag" />
|
|
<p>An official website of the United States government</p>
|
|
<button class="non-usa-accordion-button usa-banner-button" aria-expanded="false" aria-controls="gov-banner-top" type="button">
|
|
<span class="usa-banner-button-text">Here's how you know</span>
|
|
</button>
|
|
</div>
|
|
</header>
|
|
<div class="usa-banner-content usa-grid usa-accordion-content" id="gov-banner-top" aria-hidden="true">
|
|
<div class="usa-banner-guidance-gov usa-width-one-half">
|
|
<img class="usa-banner-icon usa-media_block-img" src="https://www.ncbi.nlm.nih.gov/coreutils/uswds/img/icon-dot-gov.svg" alt="Dot gov" />
|
|
<div class="usa-media_block-body">
|
|
<p>
|
|
<strong>The .gov means it's official.</strong>
|
|
<br />
|
|
Federal government websites often end in .gov or .mil. Before
|
|
sharing sensitive information, make sure you're on a federal
|
|
government site.
|
|
</p>
|
|
</div>
|
|
</div>
|
|
<div class="usa-banner-guidance-ssl usa-width-one-half">
|
|
<img class="usa-banner-icon usa-media_block-img" src="https://www.ncbi.nlm.nih.gov/coreutils/uswds/img/icon-https.svg" alt="Https" />
|
|
<div class="usa-media_block-body">
|
|
<p>
|
|
<strong>The site is secure.</strong>
|
|
<br />
|
|
The <strong>https://</strong> ensures that you are connecting to the
|
|
official website and that any information you provide is encrypted
|
|
and transmitted securely.
|
|
</p>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</section>
|
|
<div class="usa-overlay"></div>
|
|
<header class="ncbi-header" role="banner" data-section="Header">
|
|
|
|
<div class="usa-grid">
|
|
<div class="usa-width-one-whole">
|
|
|
|
<div class="ncbi-header__logo">
|
|
<a href="/" class="logo" aria-label="NCBI Logo" data-ga-action="click_image" data-ga-label="NIH NLM Logo">
|
|
<img src="https://www.ncbi.nlm.nih.gov/coreutils/nwds/img/logos/AgencyLogo.svg" alt="NIH NLM Logo" />
|
|
</a>
|
|
</div>
|
|
|
|
<div class="ncbi-header__account">
|
|
<a id="account_login" href="https://account.ncbi.nlm.nih.gov" class="usa-button header-button" style="display:none" data-ga-action="open_menu" data-ga-label="account_menu">Log in</a>
|
|
<button id="account_info" class="header-button" style="display:none" aria-controls="account_popup" type="button">
|
|
<span class="fa fa-user" aria-hidden="true">
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="20px" height="20px">
|
|
<g style="fill: #fff">
|
|
<ellipse cx="12" cy="8" rx="5" ry="6"></ellipse>
|
|
<path d="M21.8,19.1c-0.9-1.8-2.6-3.3-4.8-4.2c-0.6-0.2-1.3-0.2-1.8,0.1c-1,0.6-2,0.9-3.2,0.9s-2.2-0.3-3.2-0.9 C8.3,14.8,7.6,14.7,7,15c-2.2,0.9-3.9,2.4-4.8,4.2C1.5,20.5,2.6,22,4.1,22h15.8C21.4,22,22.5,20.5,21.8,19.1z"></path>
|
|
</g>
|
|
</svg>
|
|
</span>
|
|
<span class="username desktop-only" aria-hidden="true" id="uname_short"></span>
|
|
<span class="sr-only">Show account info</span>
|
|
</button>
|
|
</div>
|
|
|
|
<div class="ncbi-popup-anchor">
|
|
<div class="ncbi-popup account-popup" id="account_popup" aria-hidden="true">
|
|
<div class="ncbi-popup-head">
|
|
<button class="ncbi-close-button" data-ga-action="close_menu" data-ga-label="account_menu" type="button">
|
|
<span class="fa fa-times">
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 48 48" width="24px" height="24px">
|
|
<path d="M38 12.83l-2.83-2.83-11.17 11.17-11.17-11.17-2.83 2.83 11.17 11.17-11.17 11.17 2.83 2.83 11.17-11.17 11.17 11.17 2.83-2.83-11.17-11.17z"></path>
|
|
</svg>
|
|
</span>
|
|
<span class="usa-sr-only">Close</span></button>
|
|
<h4>Account</h4>
|
|
</div>
|
|
<div class="account-user-info">
|
|
Logged in as:<br />
|
|
<b><span class="username" id="uname_long">username</span></b>
|
|
</div>
|
|
<div class="account-links">
|
|
<ul class="usa-unstyled-list">
|
|
<li><a id="account_myncbi" href="/myncbi/" class="set-base-url" data-ga-action="click_menu_item" data-ga-label="account_myncbi">Dashboard</a></li>
|
|
<li><a id="account_pubs" href="/myncbi/collections/bibliography/" class="set-base-url" data-ga-action="click_menu_item" data-ga-label="account_pubs">Publications</a></li>
|
|
<li><a id="account_settings" href="/account/settings/" class="set-base-url" data-ga-action="click_menu_item" data-ga-label="account_settings">Account settings</a></li>
|
|
<li><a id="account_logout" href="/account/signout/" class="set-base-url" data-ga-action="click_menu_item" data-ga-label="account_logout">Log out</a></li>
|
|
</ul>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
</div>
|
|
</div>
|
|
</header>
|
|
<div role="navigation" aria-label="access keys">
|
|
<a id="nws_header_accesskey_0" href="https://www.ncbi.nlm.nih.gov/guide/browsers/#ncbi_accesskeys" class="usa-sr-only" accesskey="0" tabindex="-1">Access keys</a>
|
|
<a id="nws_header_accesskey_1" href="https://www.ncbi.nlm.nih.gov" class="usa-sr-only" accesskey="1" tabindex="-1">NCBI Homepage</a>
|
|
<a id="nws_header_accesskey_2" href="/myncbi/" class="set-base-url usa-sr-only" accesskey="2" tabindex="-1">MyNCBI Homepage</a>
|
|
<a id="nws_header_accesskey_3" href="#maincontent" class="usa-sr-only" accesskey="3" tabindex="-1">Main Content</a>
|
|
<a id="nws_header_accesskey_4" href="#" class="usa-sr-only" accesskey="4" tabindex="-1">Main Navigation</a>
|
|
</div>
|
|
<section data-section="Alerts">
|
|
<div class="ncbi-alerts-placeholder"></div>
|
|
</section>
|
|
</div>
|
|
</div>
|
|
<!--/.header-->
|
|
<div class="header">
|
|
<div class="res_logo"><h1 class="res_name"><a href="/genbank/" title="GenBank home">GenBank</a></h1><h2 class="res_tagline">Public nucleic acid sequence repository</h2></div>
|
|
<div class="search"><form method="get" action="/nuccore/"><div class="search_form"><label for="database" class="offscreen_noflow">Search database</label><select id="database"><optgroup label="Recent"><option value="nuccore" selected="selected">Nucleotide</option><option value="sra">SRA</option><option value="books">Books</option><option value="clinvar" class="last">ClinVar</option></optgroup><optgroup label="All"><option value="gquery">All Databases</option><option value="assembly">Assembly</option><option value="biocollections">Biocollections</option><option value="bioproject">BioProject</option><option value="biosample">BioSample</option><option value="books">Books</option><option value="clinvar">ClinVar</option><option value="cdd">Conserved Domains</option><option value="gap">dbGaP</option><option value="dbvar">dbVar</option><option value="gene">Gene</option><option value="genome">Genome</option><option value="gds">GEO DataSets</option><option value="geoprofiles">GEO Profiles</option><option value="gtr">GTR</option><option value="ipg">Identical Protein Groups</option><option value="medgen">MedGen</option><option value="mesh">MeSH</option><option value="nlmcatalog">NLM Catalog</option><option value="nuccore">Nucleotide</option><option value="omim">OMIM</option><option value="pmc">PMC</option><option value="protein">Protein</option><option value="proteinclusters">Protein Clusters</option><option value="protfam">Protein Family Models</option><option value="pcassay">PubChem BioAssay</option><option value="pccompound">PubChem Compound</option><option value="pcsubstance">PubChem Substance</option><option value="pubmed">PubMed</option><option value="snp">SNP</option><option value="sra">SRA</option><option value="structure">Structure</option><option value="taxonomy">Taxonomy</option><option value="toolkit">ToolKit</option><option value="toolkitall">ToolKitAll</option><option value="toolkitbookgh">ToolKitBookgh</option></optgroup></select><div class="nowrap"><label 
for="term" class="offscreen_noflow" accesskey="/">Search term</label><div class="nowrap"><input type="text" name="term" id="term" title="Search Nucleotide" value="" class="jig-ncbiclearbutton jig-ncbiautocomplete" data-jigconfig="isEnabled:false,disableUrl:'NcbiSearchBarAutoComplCtrl'" autocomplete="off" data-sbconfig="ds:'no',pjs:'no',afs:'yes'" /></div><button id="search" type="submit" class="button_search nowrap" cmd="go">Search</button></div></div></form></div>
|
|
|
|
</div>
|
|
<div class="nav_and_browser">
|
|
<div class="localnav"><ul class="jig-ncbilocalnav">
|
|
<li><a href="#">GenBank</a><ul>
|
|
<li><a href="/genbank/">About GenBank</a></li>
|
|
<li><a href="/genbank/submit_types">Submission Types</a></li>
|
|
<li><a href="/genbank/submit">Submission Tools</a></li>
|
|
<li><a href="/genbank/update">Update GenBank Records</a></li>
|
|
<li><a href="/nuccore/">Search</a></li>
|
|
<li><a href="/BLAST/Blast.cgi?CMD=Web&amp;PAGETYPE=BLASTHome">BLAST</a></li>
|
|
<li><a href="/genbank/statistics">Statistics</a></li>
|
|
<li><a href="/genbank/samplerecord/">Sample Record</a></li>
|
|
<li><a href="/genbank/sequencerevisionhistory/">Revision History</a></li>
|
|
<li><a href="/genbank/sequenceids/">Sequence IDs</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a href="#">Submit</a><ul>
|
|
<li><a href="/genbank/submit">Submission Tools</a></li>
|
|
<li><a href="/genbank/submit_types">Submission Types</a></li>
|
|
<li><a href="/WebSub/?tool=genbank">BankIt</a></li>
|
|
<li><a href="/genbank/table2asn">table2asn</a></li>
|
|
<li><a href="https://www.ncbi.nlm.nih.gov/sra/docs/sequence-data-processing">Sequence Data Processing</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a href="#">Genomes</a><ul>
|
|
<li><a href="/genbank/genomesubmit">Complete Genome Submission Guide</a></li>
|
|
<li><a href="/genbank/genomesubmit_annotation">Prokaryotic Genome Annotation Guide</a></li>
|
|
<li><a href="/genbank/eukaryotic_genome_submission_annotation">Eukaryotic Genome Annotation Guide</a></li>
|
|
<li><a href="/genbank/examples.wgs">Annotation Examples</a></li>
|
|
<li><a href="https://submit.ncbi.nlm.nih.gov/subs/wgs/">Genome Submission Portal</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a title="Whole Genome Shotgun sequences and submissions" href="#">WGS</a><ul>
|
|
<li><a href="/genbank/wgs">About WGS</a></li>
|
|
<li><a href="/Traces/wgs">WGS Project List</a></li>
|
|
<li><a href="/genbank/wgs.submit">WGS Submission Guide</a></li>
|
|
<li><a href="/genbank/wgsfaq/">FAQ</a></li>
|
|
<li><a href="https://submit.ncbi.nlm.nih.gov/subs/wgs/">Genome Submission Portal</a></li>
|
|
<li><a href="/genbank/eukaryotic_genome_submission_annotation">Eukaryotic Annotation Guide</a></li>
|
|
<li><a href="/genbank/genomesubmit_annotation">Prokaryotic Annotation Guide</a></li>
|
|
<li><a href="/genbank/asndisc">Discrepancy Report</a></li>
|
|
<li><a href="/assembly/agp/AGP_Specification/">AGP format</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a href="#">Metagenomes</a><ul>
|
|
<li><a href="/genbank/metagenome">About Metagenomes</a></li>
|
|
<li><a href="/genbank/structuredcomment">Structured Comment</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a href="#">TPA</a><ul>
|
|
<li><a href="/genbank/TPA">About TPA</a></li>
|
|
<li><a href="/genbank/tpafaq">FAQ</a></li>
|
|
<li><a href="/genbank/TPA-Exp">TPA-Exp</a></li>
|
|
<li><a href="/genbank/TPA-Inf">TPA-Inf</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a href="#">TSA</a><ul>
|
|
<li><a href="/genbank/TSA">About TSA</a></li>
|
|
<li><a href="/genbank/TSAguide">TSA Submission Guide</a></li>
|
|
<li><a href="/genbank/TSAfaq">FAQ</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a href="#">INSDC</a><ul>
|
|
<li><a href="/genbank/collab">About INSDC</a></li>
|
|
<li><a href="/genbank/collab/country">Geographic Location Name List</a></li>
|
|
<li><a href="/genbank/collab/db_xref">db_xref List</a></li>
|
|
<li><a href="http://www.insdc.org/documents/feature_table.html">Feature Table</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a href="#">Documentation</a><ul>
|
|
<li><a href="https://www.ncbi.nlm.nih.gov/sra/docs/sequence-data-processing/">Sequence Data Processing</a></li>
|
|
<li><a href="/genbank/submission_brokers">Submission Brokers</a></li>
|
|
<li><a href="/genbank/acc_prefix">Accession Number Prefixes</a></li>
|
|
<li><a href="/genbank/organelle_submit/">Organelle Submission Guide</a></li>
|
|
<li><a href="/genbank/monkeypox_submission/">Monkeypox Submission Guide</a></li>
|
|
<li><a href="/genbank/validation/">Common Submission Errors</a> </li>
|
|
<li><a href="/genbank/sequencecheck/">Ribosomal Submission Errors</a></li>
|
|
<li><a href="/genbank/sequencecheck/virus">Common Sequence Errors</a></li>
|
|
<li><a href="https://support.nlm.nih.gov/knowledgebase/category/?id=CAT-01240">Submission FAQs</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a href="#">Other</a><ul>
|
|
<li><a href="/genbank/htgs">About HTGs</a></li>
|
|
<li><a href="/genbank/dbest">About EST</a></li>
|
|
<li><a href="/genbank/dbgss">About GSS</a></li>
|
|
<li><a href="/genbank/tls">About TLS</a></li>
|
|
<li><a href="/genbank/tlsguide">Submit TLS</a></li>
|
|
</ul>
|
|
</li>
|
|
</ul></div>
|
|
</div>
|
|
|
|
<!-- was itemctrl -->
|
|
<div class="container">
|
|
<div id="maincontent" class="content col twelve_col last">
|
|
<div class="col1">
|
|
<h1 id="annotating-genomes-with-gff3-or-">Annotating Genomes with GFF3 or GTF files</h1>
|
|
|
|
|
|
<p>This page describes how to create an annotated genome submission from GFF3 or GTF files, using the beta version of our process. Note that you can always use GenBank's standard 5-column feature table (see <a href="https://www.ncbi.nlm.nih.gov/genbank/genomesubmit_annotation">Prokaryotic Annotation Guidelines</a> or <a href="https://www.ncbi.nlm.nih.gov/genbank/eukaryotic_genome_submission">Eukaryotic Annotation Guidelines</a>) as input.</p>
|
|
|
|
|
|
<h3 id="table-of-contents">Table of Contents</h3>
|
|
|
|
|
|
<ul>
|
|
<li><a href="#basicformat">Basic format</a></li>
|
|
<li><a href="#genbank_specific">GenBank-specific requirements</a><ul>
|
|
<li><a href="#formatting">Formatting requirements</a></li>
|
|
<li><a href="#changes">Changes that occur during processing</a></li>
|
|
<li><a href="#attributes">Attributes/Annotation features</a></li>
|
|
</ul>
|
|
</li>
|
|
<li><a href="#crossing_gaps">Annotation crossing gaps</a></li>
|
|
<li><a href="#run">Run table2asn to annotate the sequences</a></li>
|
|
<li><a href="#sample">Sample files</a></li>
|
|
</ul>
|
|
|
|
|
|
<h2 id="basicformat">Basic format</h2>
|
|
|
|
|
|
<p>A 9-column annotation file conforming to the GFF3 or GTF specifications can be used for genome annotation submission. The basic characteristics of the file formats are described at:</p>
|
|
|
|
|
|
<ul>
|
|
<li>GFF3: <a href="https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md">https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md</a></li>
|
|
<li>GTF: <a href="http://mblab.wustl.edu/GTF22.html">http://mblab.wustl.edu/GTF22.html</a></li>
|
|
</ul>
|
|
|
|
|
|
<p>The GFF3 format is better described and allows for a richer annotation, but GTF will also work for many submissions. This documentation focuses on GFF3 formatting conventions, but GTF conventions to use for submission are similar. Several basic validators are available to verify that a GFF3 file is syntactically valid:</p>
|
|
|
|
|
|
<ul>
|
|
<li><a href="http://www.sequenceontology.org/software/GAL.html">http://www.sequenceontology.org/software/GAL.html</a></li>
|
|
<li><a href="http://genometools.org/cgi-bin/gff3validator.cgi">http://genometools.org/cgi-bin/gff3validator.cgi</a></li>
|
|
</ul>
|
|
|
|
|
|
<p>Note these standalone validators will not detect all formatting and annotation issues, and the GenBank annotation submission software is tolerant of some common GFF3 formatting issues, but they can be useful for initial testing, especially if an input file isn't working as expected.</p>
|
|
|
|
|
|
<p>GFF2 is not acceptable because it does not provide the required information.</p>
|
|
|
|
|
|
<h2 id="genbank_specific">GenBank-specific requirements</h2>
|
|
|
|
|
|
<p>An additional set of rules, specific attributes (equivalent to INSDC qualifiers), and automatic
|
|
processing are utilized for submission of annotated genomes to GenBank. These additions are:</p>
|
|
|
|
|
|
<h3 id="formatting">Formatting requirements</h3>
|
|
|
|
|
|
<p>[1] seqid in GFF3/GTF column 1 should match the corresponding FASTA or ASN.1 file that is being annotated. For assemblies already in GenBank, seqids will be matched to their corresponding accessions if they are the same as what was used in the original submission. [The seqid is the text between the '>' and the first space in the fasta definition line; do not include the '>' in the GFF file]</p>
|
|
|
|
|
|
<p>[2] contig, supercontig, chromosome and similar landmark features are not required and will be ignored.</p>
|
|
|
|
|
|
<p>[3] multi-exon mRNA and other RNA features can be represented using either:
|
|
[a] child exon features
|
|
[b] child five_prime_UTR, CDS, and three_prime_UTR features
|
|
[c] multiple RNA feature rows with the same ID</p>
|
|
|
|
|
|
<p>Furthermore, whereas the GFF3 specifications require that all rows of a multi-exon CDS feature use the same ID, some commonly used software deviates from this requirement. To allow for deviations from the specifications, for eukaryotes the GenBank software assumes that multiple CDS rows with the same Parent attribute represent parts of the same CDS feature. Multiple CDS features for the same gene need to be annotated by using a separate mRNA Parent feature for each, so there is always a 1:1 relationship of mRNA to CDS, like in the following schematic:</p>
|
|
|
|
|
|
<pre><code>gene1 ================================ ID=gene1
|
|
mRNA1 ================================ ID=mRNA1;Parent=gene1
|
|
five_prime_UTR == Parent=mRNA1
|
|
CDS1 ==....=====...........== Parent=mRNA1 (3 rows)
|
|
three_prime_UTR ====== Parent=mRNA1
|
|
mRNA2 ================================ ID=mRNA2;Parent=gene1
|
|
exon ==== Parent=mRNA2
|
|
CDS2 ==....................== Parent=mRNA2 (2 rows)
|
|
exon ======== Parent=mRNA2
|
|
</code></pre>
|
|
|
|
|
|
<p>[4] GFF3 ID attributes are required for interpreting parent-child feature relationships and that is their only role here.</p>
|
|
|
|
|
|
<ul>
|
|
<li>They are not automatically used for the locus_tag qualifier, so if the ID is applicable as the locus_tag, it should be copied into that attribute with the appropriate formatting.</li>
|
|
<li>However, if no transcript_id or protein_id qualifiers are present, then the GFF3 ID attribute will be used as the basis of those qualifiers, as described in point [5c] of the "Attributes/Annotation features" section below. These qualifiers do not appear in the flatfile view, so if the GFF3 IDs are meant to be seen in that view, then they should be copied into a 'note' attribute with the appropriate formatting.</li>
|
|
</ul>
|
|
|
|
|
|
<p>[5] GFF3 Name attributes are ignored.</p>
|
|
|
|
|
|
<h3 id="changes">Changes that occur during processing</h3>
|
|
|
|
|
|
<p>[1] CDS features that don't include but are adjacent to a stop codon will be automatically extended 1-3 bp to include the stop codon. start_codon and stop_codon features are not required in either GFF3 or GTF.</p>
|
|
|
|
|
|
<p>[2] gene and mRNA features are useful but NOT required. If they are omitted, and only CDS features are provided, then gene and/or mRNA features will be created on-the-fly based on the corresponding CDS feature. mRNA features are auto-created for eukaryote genome annotation submissions when the appropriate argument is included in the command line, and are normally omitted for prokaryotes.</p>
|
|
|
|
|
|
<p>[3] The partialness markup on gene, mRNA, and CDS features is computed automatically based on the completeness of the CDS feature at either end. There is no need to specify attributes in column 9, and any attributes that are sometimes used to specify partialness, such as start_range or end_range, will be ignored.</p>
|
|
|
|
|
|
<p>[4] The product name of the CDS is copied to its corresponding mRNA, replacing any product name that may have been in the .gff file. <strong>Consequently, if a product is only on the mRNA or gene, the CDS will be automatically named 'hypothetical protein' and that name will be copied to be the product name of the corresponding mRNA</strong>. In this case the .dr discrepancy report file will include: <em>PROTEIN_NAMES: All proteins have same name "hypothetical protein"</em>. </p>
|
|
|
|
|
|
<h3 id="attributes">Attributes/Annotation features</h3>
|
|
|
|
|
|
<p>[1] <strong>Many SO feature types</strong> are recognized in column 3 and converted to their INSDC equivalents. Commonly used types are:</p>
|
|
|
|
|
|
<ul>
|
|
<li>gene</li>
|
|
<li>CDS</li>
|
|
<li>mRNA</li>
|
|
<li>exon</li>
|
|
<li>five_prime_UTR</li>
|
|
<li>three_prime_UTR</li>
|
|
<li>rRNA</li>
|
|
<li>tRNA</li>
|
|
<li>ncRNA</li>
|
|
<li>tmRNA</li>
|
|
<li>transcript</li>
|
|
<li>mobile_genetic_element</li>
|
|
<li>origin_of_replication</li>
|
|
<li>promoter</li>
|
|
<li>repeat_region</li>
|
|
</ul>
|
|
|
|
|
|
<p>Some SO types may need to be changed before processing in order to be properly recognized:
|
|
[a] all gene features should use "gene". More specific SO types like rRNA_gene, miRNA_gene, tRNA_gene, pseudogene, pseudogenic_tRNA, and others should be converted to use "gene" instead</p>
|
|
|
|
|
|
<p>[b] misc_RNA is sometimes used for a generic RNA feature type, but it is not a recognized SO term. Use "transcript" instead.
|
|
Feature types that aren't recognized will be automatically dropped and reported in the log file. Feature types that are always ignored (so not reported in the log file) are:</p>
|
|
|
|
|
|
<ul>
|
|
<li>intron</li>
|
|
<li>protein</li>
|
|
</ul>
|
|
|
|
|
|
<p>[2] <strong>pseudogenes</strong> should be flagged with a pseudogene=&lt;TYPE&gt; qualifier in column 9 on the gene feature and optionally on any child features. Further details about the TYPE values allowed for the pseudogene qualifier are available at:
|
|
<a href="http://www.insdc.org/documents/pseudogene-qualifier-vocabulary">http://www.insdc.org/documents/pseudogene-qualifier-vocabulary</a> .</p>
|
|
|
|
|
|
<p>[3] <strong>annotate with pseudo=true any genes that are 'broken' but are not thought to be pseudogenes</strong>. These are genes that do not encode the expected translation, for example because of internal stop codons. These are often caused by problems with the sequence and/or assembly.</p>
|
|
|
|
|
|
<p>[4] <strong>gene features require locus_tag qualifiers</strong>. GFF3 ID attributes are <strong>not</strong> used for the locus_tag qualifier, so if the ID is applicable as the locus_tag, it should be copied into that attribute with the appropriate formatting. The locus_tags can be provided either by:</p>
|
|
|
|
|
|
<p>[a] adding a locus_tag= attribute to column 9. This option should be used for annotation updates to keep the existing locus_tags where appropriate.</p>
|
|
|
|
|
|
<p>[b] using the -locus-tag-prefix option in the command line and specifying the prefix to use so that the software assigns locus_tags automatically</p>
|
|
|
|
|
|
<p>[5] <strong>mRNA and CDS features require transcript_id and protein_id qualifiers, respectively</strong>. Either provide both qualifiers or provide neither of them: specifically, use [a] and [b] together, OR just [c]:</p>
|
|
|
|
|
|
<p>[a] adding transcript_id= attributes to mRNA (and other RNA) features, using the format:</p>
|
|
|
|
|
|
<ul>
|
|
<li>transcript_id=gnl|dbname|ID</li>
|
|
</ul>
|
|
|
|
|
|
<p>Where dbname is either the locus_tag prefix, or WGS:XXXX (for assemblies that have already been assigned a WGS accession prefix). Further details are available in the <a href="/genbank/eukaryotic_genome_submission_annotation/#transcript_id">eukaryotic annotation guidelines</a> .</p>
|
|
|
|
|
|
<p>[b] adding protein_id attributes to the CDS features, using one of these formats:</p>
|
|
|
|
|
|
<ul>
|
|
<li>protein_id=gnl|dbname|ID</li>
|
|
<li>protein_id=gnl|dbname|ID|gb|accession</li>
|
|
</ul>
|
|
|
|
|
|
<p>"gb|accession" is only applicable for annotation updates, and is not required to reuse existing protein accessions if the same dbname and ID are provided. Further details are available in the <a href="/genbank/eukaryotic_genome_submission_annotation/#transcript_id">eukaryotic annotation guidelines</a>.</p>
|
|
|
|
|
|
<p>[c] both transcript_id and protein_id can be omitted, and they will be generated automatically based on the IDs of the mRNA/CDS and gene locus_tag prefix. These qualifiers do not appear in the flatfile view, so if the GFF3 IDs are meant to be seen in that view, then they should be copied into a 'note' attribute with the appropriate formatting. However, annotation updates should include the generated protein_ids on CDS features described in point [5b] to allow protein accessions to be preserved appropriately.</p>
|
|
|
|
|
|
<p>[6] <strong>product names</strong> are specified using a product= attribute on a CDS or tRNA, rRNA, or ncRNA feature.</p>
|
|
|
|
|
|
<ul>
|
|
<li>Protein product names should conform to <a href="/genbank/eukaryotic_genome_submission_annotation/#CDS">GenBank guidelines</a> .</li>
|
|
<li>Multiple names can be specified by providing the primary name first, and additional names as a comma-separated list.</li>
|
|
<li>Commas that are intended to be part of a name should be encoded (%2C) according to the GFF3 specifications. However, literal commas should only be included when they are part of enzymatic names. Semi-colons generally should not be included in product names.</li>
|
|
<li>If a CDS feature does not specify a product name, it will be automatically named 'hypothetical protein'.</li>
|
|
<li>The CDS product name is always copied to its corresponding mRNA feature.</li>
|
|
<li><em>If a product is only on the mRNA or gene, then the CDS will be automatically named 'hypothetical protein' AND that name will be copied to the product name of the corresponding mRNA.</em> </li>
|
|
<li>Product names should be provided for tRNAs, rRNAs and ncRNAs in GFF3/GTF submission files.</li>
|
|
</ul>
|
|
|
|
|
|
<p>[7] <strong>Most INSDC qualifiers</strong> that can be used for submission in a conventional 5-column .tbl file will also work if provided as attributes in column 9 of a GFF3 input file. <strong>Multiple values for a qualifier should be provided as a comma-separated list.</strong>
|
|
Commonly used attributes/qualifiers include:</p>
|
|
|
|
|
|
<p>[a] attributes described above in more detail:</p>
|
|
|
|
|
|
<ul>
|
|
<li>locus_tag=&lt;tag&gt;_ID (gene)</li>
|
|
<li>transcript_id=gnl|dbname|ID (RNA)</li>
|
|
<li>protein_id=gnl|dbname|ID|gb|accession (CDS)</li>
|
|
<li>product=&lt;name&gt; (RNA, CDS)</li>
|
|
<li>pseudo=true (gene, RNA, CDS)</li>
|
|
<li>pseudogene=&lt;TYPE&gt; (gene, RNA, CDS)</li>
|
|
</ul>
|
|
|
|
|
|
<p>[b] Dbxref=DB:value (all feature types). See <a href="https://www.ncbi.nlm.nih.gov/genbank/collab/db_xref/">https://www.ncbi.nlm.nih.gov/genbank/collab/db_xref/</a> for the current list of allowed databases.</p>
|
|
|
|
|
|
<p>[c] ec_number=x.x.x.x (CDS features)</p>
|
|
|
|
|
|
<p>[d] Note= (all feature types). Converted to INSDC /note (also known as a comment)</p>
|
|
|
|
|
|
<p>[e] gene=Abc1 (gene). For the biological gene name (aka symbol)</p>
|
|
|
|
|
|
<p>[f] gene_synonym=xyz (gene). Database names can be included as synonyms, even with no gene name</p>
|
|
|
|
|
|
<p>[g] description= (gene). gene full name, displayed as /note in flatfile.</p>
|
|
|
|
|
|
<p>[h] exception=&lt;CV string&gt; (gene, RNA, CDS)</p>
|
|
|
|
|
|
<p>[i] transl_except=(pos:&lt;base_range&gt;%2Caa:&lt;amino_acid&gt;) (CDS). Used to specify the location of translation exceptions on a CDS feature where a codon at a specific location on the genome should be translated as an alternative amino acid, such as Sec.</p>
|
|
|
|
|
|
<p>[j] function (CDS)</p>
|
|
|
|
|
|
<p>[k] experiment (RNA,CDS)</p>
|
|
|
|
|
|
<p>[l] old_locus_tag (gene)</p>
|
|
|
|
|
|
<p>[m] mobile_element. This has the mandatory qualifier of mobile_element_type, e.g., mobile_element_type=SINE:Alu</p>
|
|
|
|
|
|
<p>[n] ncRNA_class, regulatory_class, recombination_class. These can also be represented with specific SO feature types in column 3, if they have equivalents in the <a href="http://www.insdc.org/insdc-controlled-vocabularies">INSDC class controlled vocabularies</a> .</p>
|
|
|
|
|
|
<h2 id="crossing_gaps">Annotation crossing gaps</h2>
|
|
|
|
|
|
<p>A CDS can only cross a gap of unknown size in introns, not in the actual coding region. If the gap of unknown size is within an exon, then you could split the CDS into two partial CDS features (and mRNAs in eukaryotes) that abut the gap, with a single gene over the whole locus. Alternatively, one of the partial CDS/mRNA features may be deleted if it is very short and there is little or no supporting evidence for it. If you have a single gene and two partial CDS/mRNA features, you should: (1) add a note to each CDS referencing the other half of the gene, (2) add a note to the gene and CDS features stating, "gap found within coding sequence."
|
|
A CDS exon can cross a gap of estimated size; however, a CDS (or mRNA) should not cross a gap such that over 50% of the translation is X (ie, is in the gap). This situation will generate an error. Again, the CDS/mRNA should either be partial up to the gap or split into two partial CDS/mRNA features on either side of the gap, depending upon your confidence in the translation on each side of the gap.
|
|
In addition, no feature should begin or end inside a gap. Instead, the feature should abut the gap and be partial.
|
|
For more information about splitting CDS features, see either the <a href="/genbank/eukaryotic_genome_submission_annotation/#Splitgenesontwocontigs">eukaryotic annotation guidelines</a> or the <a href="/genbank/genomesubmit_annotation/#Split_genes">prokaryotic annotation guidelines</a>.</p>
|
|
|
|
|
|
<h2 id="run">Run table2asn to annotate the sequences</h2>
|
|
|
|
|
|
<p>Use the command-line program <a href="/genbank/table2asn">table2asn</a> to combine a <a href="https://submit.ncbi.nlm.nih.gov/genbank/template/submission/">template</a> file along with the fasta and annotation .gff files to create an annotated ASN.1 (.sqn) file for submission to GenBank. Follow these steps:</p>
|
|
|
|
|
|
<p>[1] Get table2asn by anonymous FTP at <a href="https://ftp.ncbi.nlm.nih.gov/asn1-converters/by_program/table2asn/">https://ftp.ncbi.nlm.nih.gov/asn1-converters/by_program/table2asn/</a> (this was formerly posted on the FTP site as <em>table2asn_gff</em>).</p>
|
|
|
|
|
|
<p>[2] Include these arguments in the command line:</p>
|
|
|
|
|
|
<p><table border="1" cellpadding="1" cellspacing="1" height="309" width="821">
|
|
<tbody>
|
|
<tr>
|
|
<td>
|
|
<strong>Argument</strong>
|
|
</td>
|
|
<td>
|
|
<strong>When to include</strong>
|
|
</td>
|
|
</tr>
|
|
<tr>
|
|
<td>-M n -Z -J -c w</td>
|
|
<td>always</td>
|
|
</tr>
|
|
<tr>
|
|
<td>-euk</td>
|
|
<td>when the organism is a eukaryote</td>
|
|
</tr>
|
|
<tr>
|
|
<td>-locus-tag-prefix &lt;text&gt;</td>
|
|
<td>if the locus_tags are not in the gff file. The value of 'text' is the registered locus_tag prefix.</td>
|
|
</tr>
|
|
<tr>
|
|
<td>-gaps-min &lt;integer&gt;</td>
|
|
<td>minimum number of Ns in a row that represents a gap</td>
|
|
</tr>
|
|
<tr>
|
|
<td>-gaps-unknown &lt;integer&gt;</td>
|
|
<td>exact number of Ns in a row that represents a gap of completely unknown length</td>
|
|
</tr>
|
|
<tr>
|
|
<td>-l</td>
|
|
<td>the evidence for the linkage of the sequences on either side of the gaps. Most commonly, "paired-ends" or "align-genus"</td>
|
|
</tr>
|
|
<tr>
|
|
<td>-V b</td>
|
|
<td>to generate a GenBank flatfile of the output, with a .gbf suffix. Adding this could slow table2asn so you may choose to include it only for the first run to make sure that the annotation looks as expected, eg to confirm that the CDS and mRNA products are not all "hypothetical protein". This file is only for viewing; it is not for submission.</td>
|
|
</tr>
|
|
<tr>
|
|
<td>-help</td>
|
|
<td>print usage, description and arguments of the program</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
</p>
|
|
|
|
|
|
<p>[a] For the genes to be properly included, the locus_tags must be present in the .gff file (in column 9 of every gene) OR added by including "-locus-tag-prefix XXXX" in the command line (where XXXX is the registered locus_tag prefix of this genome).</p>
|
|
|
|
|
|
<p>[b] If the organism is a prokaryote, then include the genetic code in the command line [gcode=11].</p>
|
|
|
|
|
|
<p>[c] Prokaryote example:</p>
|
|
|
|
|
|
<p><strong>table2asn -M n -J -c w -t template.sbt -gaps-min 10 -l paired-ends -locus-tag-prefix XXXX -j "[organism=Escherichia coli] [strain=abcd] [gcode=11]" -i fasta_file -f gff_file -o output_file.sqn -Z -V b </strong></p>
|
|
|
|
|
|
<p>because in this example:</p>
|
|
|
|
|
|
<ul>
|
|
<li>the source information and genetic code are in the command line, not the fasta file<ul>
|
|
<li>the organism and strain are provided; the rest of the information will be pulled from the corresponding BioSample</li>
|
|
<li>always include the genetic code, [gcode=11], to ensure that the alternative start codons are recognized</li>
|
|
</ul>
|
|
</li>
|
|
<li>all runs of 10 or more Ns represent gaps of estimated size that are connected by paired-ends linkage evidence</li>
|
|
<li>the locus_tag prefix is in the command line because the locus_tags are not in the .gff file</li>
|
|
<li>an output file is not needed or accepted by -Z in versions of table2asn posted after July 1, 2019. Download the current version if the program you have still needs to include "-Z output_file.dr"</li>
|
|
<li>having a .gbf file to see how the annotation looks is desired</li>
|
|
</ul>
|
|
|
|
|
|
<p>[d] Eukaryote example:</p>
|
|
|
|
|
|
<p><strong>table2asn -M n -J -c w -euk -t template.sbt -gaps-min 10 -l paired-ends -j "[organism=Loa loa] [isolate=F231]" -i fasta_file -f gff_file -o output_file.sqn -Z -V b </strong></p>
|
|
|
|
|
|
<p>because in this example:</p>
|
|
|
|
|
|
<ul>
|
|
<li>the source information is in the command line, not the fasta file<ul>
|
|
<li>the organism and isolate are provided; the rest of the information will be pulled from the corresponding BioSample</li>
|
|
</ul>
|
|
</li>
|
|
<li>all runs of 10 or more Ns represent gaps of estimated size that are connected by paired-ends linkage evidence</li>
|
|
<li>the locus_tags are included in the GFF file (if not, then include "-locus-tag-prefix XXXX" in the command line, where XXXX is the registered locus_tag prefix for this genome, as seen in the prokaryote example)</li>
|
|
<li>an output file is not needed or accepted by -Z in versions of table2asn posted after July 1, 2019. Download the current version if the program you have still needs to include "-Z output_file.dr"</li>
|
|
<li>having a .gbf file to see how the annotation looks is desired</li>
|
|
</ul>
|
|
|
|
|
|
<p>[e] The <a href="/genbank/wgs_gapped">Gapped Format for Genome Submissions</a> page describes how to convert runs of Ns to assembly_gaps appropriately.</p>
|
|
|
|
|
|
<p>[f] Run table2asn with the relevant arguments for your situation. (FYI, "table2asn -help" will print out all the arguments)</p>
|
|
|
|
|
|
<p>[3] Check the output of the validation and discrepancy report and fix problems</p>
|
|
|
|
|
|
<ul>
|
|
<li>Check the .stats file for the number, severity and type of errors that are present in the .val files. All Errors and Rejects need to be fixed. The presence of errors will slow processing. See the <a href="/genbank/genome_validation">genome validation errors</a> for guidance. Contact <a href="mailto:genomes@ncbi.nlm.nih.gov">genomes@ncbi.nlm.nih.gov</a> with any questions about the validation output. During processing there may be some questions about other aspects of the submission.</li>
|
|
<li>Check the .dr file for the results of the discrepancy report. Categories prefaced with FATAL are always unacceptable and must be fixed. Some of the categories are informational, for example <em>PROTEIN_NAMES: All proteins have same name "hypothetical protein"</em>. Reports that are not flagged as fatal should be examined to determine if they represent annotation artifacts that need to be corrected or if they are acceptable due to the biology of the genome. See the <a href="/genbank/asndisc/#evaluating_the_output">discrepancy report examples and explanations</a> and <a href="/genbank/new_asndisc_examples">common discrepancy reports</a> for guidance. Write to <a href="mailto:genomes@ncbi.nlm.nih.gov">genomes@ncbi.nlm.nih.gov</a> and send the discrep file with questions about this report.</li>
|
|
<li>Some common discrepancy reports of which to be aware:<ul>
|
|
<li><em>NO_ANNOTATION</em> and <em>LONG_NO_ANNOTATION</em>. If either of these is expected, that is fine. However, if not expected, then check that the IDs in column 1 of the GFF file match the SeqIDs of the sequences in the fasta file. When you submit, please let us know when the sequences in the LONG_NO_ANNOTATION report are expected to be unannotated, so that we know to ignore this report.</li>
|
|
<li><em>PROTEIN_NAMES: All proteins have same name "hypothetical protein".</em> If this is expected, that is fine. However, this occurs when the product names are included on the gene or mRNA instead of the CDS, so move the products to the CDS/exon rows if this situation is not expected.</li>
|
|
<li><em>FATAL: BACTERIAL_PARTIAL_NONEXTENDABLE_PROBLEMS</em>. If this is a eukaryotic genome, you can ignore this error. If this is a prokaryotic genome, then every CDS must begin and end with valid start and stop codons, respectively, or be partial and either extend to the end of the sequence or abut a gap within the scaffold sequence. However, you should annotate with <em>pseudo=true</em> any genes that are 'broken' but are not thought to be pseudogenes. These are genes that do not encode the expected translation, for example because of internal stop codons or missing start or stop codons, and are often caused by problems with the sequence and/or assembly.</li>
|
|
<li><em>FATAL: MISSING_GENES</em>. This usually occurs because locus_tags were not included. Be sure that locus_tags are present either by including them in the GFF file in column 9 of each gene OR by including "-locus-tag-prefix XXXX" (where XXXX is the registered locus_tag prefix for the genome) in the command line.</li>
|
|
</ul>
|
|
</li>
|
|
<li>Make any necessary fixes to the input .fsa and/or .gff files and run table2asn again.</li>
|
|
</ul>
|
|
|
|
|
|
<p>[4] Submit the error-free .sqn files via the <a href="https://submit.ncbi.nlm.nih.gov/subs/genome/">Submission Portal</a>, per the <a href="/genbank/genomesubmit/#submitting_genomes">usual instructions.</a></p>
|
|
|
|
|
|
<h2 id="sample">Sample files</h2>
|
|
|
|
|
|
<p>Several files with more information about using GFF3 files as the annotation input are posted in the <a href="https://ftp.ncbi.nlm.nih.gov/asn1-converters/by_program/table2asn/DOCUMENTATION/">DOCUMENTATION</a> section on the FTP site. There you will find these files:</p>
|
|
|
|
|
|
<ul>
|
|
<li>GFF3Guidance.docx = a brief document with GFF3 requirements and some example input GFF3 formats and table2asn command lines </li>
|
|
<li>short.fsa and .gff files = the files described and used in the GFF3Guidance.docx document</li>
|
|
</ul>
|
|
|
|
|
|
</div>
|
|
<!--/.col1-->
|
|
<div class="col2">
|
|
<div class="rightnav">
|
|
<h2 id="genome-resources">Genome Resources</h2>
|
|
<ul>
|
|
<li><a href="/genbank/wgs/">About WGS</a></li>
|
|
<li><a href="https://www.ncbi.nlm.nih.gov/Traces/wgs/?view=wgs">WGS Browser</a></li>
|
|
<li><a href="/genbank/genomesubmit/">Genome Submission Guide</a></li>
|
|
<li><a href="https://submit.ncbi.nlm.nih.gov/subs/genome/">Genome Submission Portal</a></li>
|
|
<li><a href="/genbank/wgs_update/">Update Genome Records</a></li>
|
|
<li><a href="/genbank/wgsfaq/">FAQ</a></li>
|
|
<li><a href="/genbank/table2asn">table2asn</a></li>
|
|
<li><a href="/genbank/diploid_haps">Submitting Multiple Haplotype Assemblies</a></li>
|
|
<li><a href="/WebSub/template.cgi/">Create Submission Template</a></li>
|
|
<li><a href="/genbank/eukaryotic_genome_submission/">Eukaryotic Annotation Guide</a></li>
|
|
<li><a href="/genbank/genomesubmit_annotation/">Prokaryotic Annotation Guide</a></li>
|
|
<li><a href="/genbank/examples.wgs/">Annotation Example Files</a></li>
|
|
<li><a href="/genbank/genomes_gff">Annotating Genomes with GFF3 or GTF files</a></li>
|
|
<li><a href="/genbank/genome_validation">Validation Error Explanations for Genomes</a></li>
|
|
<li><a href="/genbank/asndisc/">Discrepancy Report</a></li>
|
|
<li><a href="https://www.ncbi.nlm.nih.gov/genome/annotation_prok/">NCBI Prokaryotic Genome Annotation Pipeline</a></li>
|
|
<li><a href="https://www.ncbi.nlm.nih.gov/assembly/agp/AGP_Specification/">AGP Format</a></li>
|
|
<li><a href="/genbank/metagenome/">Metagenome Submission Guide</a></li>
|
|
<li><a href="/genbank/structuredcomment/">Structured Comment</a></li>
|
|
<li><a href="/bioproject/">BioProject</a></li>
|
|
<li><a href="/biosample/">BioSample</a></li>
|
|
</ul>
|
|
</div>
|
|
</div>
|
|
<!--/.col2-->
|
|
<div class="col3">
|
|
|
|
</div>
|
|
<!--/.col3-->
|
|
<div class="col4">
|
|
|
|
</div>
|
|
<!--/.col4-->
|
|
<div class="col5">
|
|
|
|
</div>
|
|
<div class="col6">
|
|
|
|
</div>
|
|
<div class="col7">
|
|
|
|
</div>
|
|
<div class="col8">
|
|
|
|
</div>
|
|
<div class="col9">
|
|
|
|
</div>
|
|
</div><!--/.content-->
|
|
</div><!--/.container-->
|
|
<div id="NCBIFooter_dynamic">
|
|
<div class="breadcrumbs">You are here:
|
|
<span id="breadcrumb_text"><a href="/guide/">NCBI</a></span></div>
|
|
<a id="help-desk-link" class="help_desk" href="https://support.ncbi.nlm.nih.gov/ics/support/default.asp?Time=2025-03-05T02:46:04-05:00&amp;Snapshot=%2Fprojects%2Fstaticsites%2Fgenbank%2Fgenbank@2.21&amp;Host=portal104&amp;ncbi_phid=CE8B25027C7F008100000000011300E0&amp;ncbi_session=CE8B5AF87C7FFCB1_0191SID&amp;from=https%3A%2F%2Fwww.ncbi.nlm.nih.gov%2Fgenbank%2Fgenomes_gff%2F&amp;Ncbi_App=genbank&amp;Page=custom-page&amp;style=classic&amp;deptID=28049" target="_blank">Support Center</a>
|
|
<noscript><img alt="" src="/stat?jsdisabled=true&amp;ncbi_app=genbank&amp;ncbi_db=&amp;ncbi_pdid=custom-page&amp;ncbi_phid=CE8B25027C7F008100000000011300E0" /></noscript>
|
|
</div>
|
|
|
|
|
|
<div xmlns:xi="http://www.w3.org/2001/XInclude">
|
|
<div xmlns="http://www.w3.org/1999/xhtml" class="footer" id="footer" xml:base="http://127.0.0.1/sites/static/header_footer">
|
|
<section class="icon-section">
|
|
<div id="icon-section-header" class="icon-section_header">Follow NCBI</div>
|
|
<div class="grid-container container">
|
|
<div class="icon-section_container">
|
|
<a class="footer-icon" id="footer_twitter" href="https://twitter.com/ncbi" aria-label="Twitter">
|
|
<svg xmlns="http://www.w3.org/2000/svg" width="40" height="40" viewBox="0 0 40 40" fill="none">
|
|
<title>Twitter</title>
|
|
<g id="twitterx1008">
|
|
<path id="path1008" d="M6.06736 7L16.8778 20.8991L6.00001 32.2H10.2L18.6 23.1L25.668 32.2H34L22.8 17.5L31.9 7H28.4L20.7 15.4L14.401 7H6.06898H6.06736ZM9.66753 8.73423H12.9327L29.7327 30.4658H26.5697L9.66753 8.73423Z" fill="#5B616B"></path>
|
|
</g>
|
|
</svg>
|
|
</a>
|
|
<a class="footer-icon" id="footer_facebook" href="https://www.facebook.com/ncbi.nlm" aria-label="Facebook"><svg xmlns="http://www.w3.org/2000/svg" data-name="Layer 1" viewBox="0 0 300 300">
|
|
<title>Facebook</title>
|
|
<path class="cls-11" d="M210.5,115.12H171.74V97.82c0-8.14,5.39-10,9.19-10h27.14V52l-39.32-.12c-35.66,0-42.42,26.68-42.42,43.77v19.48H99.09v36.32h27.24v109h45.41v-109h35Z">
|
|
</path>
|
|
</svg></a>
|
|
<a class="footer-icon" id="footer_linkedin" href="https://www.linkedin.com/company/ncbinlm" aria-label="LinkedIn"><svg xmlns="http://www.w3.org/2000/svg" data-name="Layer 1" viewBox="0 0 300 300">
|
|
<title>LinkedIn</title>
|
|
<path class="cls-11" d="M101.64,243.37H57.79v-114h43.85Zm-22-131.54h-.26c-13.25,0-21.82-10.36-21.82-21.76,0-11.65,8.84-21.15,22.33-21.15S101.7,78.72,102,90.38C102,101.77,93.4,111.83,79.63,111.83Zm100.93,52.61A17.54,17.54,0,0,0,163,182v61.39H119.18s.51-105.23,0-114H163v13a54.33,54.33,0,0,1,34.54-12.66c26,0,44.39,18.8,44.39,55.29v58.35H198.1V182A17.54,17.54,0,0,0,180.56,164.44Z">
|
|
</path>
|
|
</svg></a>
|
|
<a class="footer-icon" id="footer_github" href="https://github.com/ncbi" aria-label="GitHub"><svg xmlns="http://www.w3.org/2000/svg" data-name="Layer 1" viewBox="0 0 300 300">
|
|
<defs>
|
|
<style>
|
|
.cls-11,
|
|
.cls-12 {
|
|
fill: #737373;
|
|
}
|
|
|
|
.cls-11 {
|
|
fill-rule: evenodd;
|
|
}
|
|
</style>
|
|
</defs>
|
|
<title>GitHub</title>
|
|
<path class="cls-11" d="M151.36,47.28a105.76,105.76,0,0,0-33.43,206.1c5.28,1,7.22-2.3,7.22-5.09,0-2.52-.09-10.85-.14-19.69-29.42,6.4-35.63-12.48-35.63-12.48-4.81-12.22-11.74-15.47-11.74-15.47-9.59-6.56.73-6.43.73-6.43,10.61.75,16.21,10.9,16.21,10.9,9.43,16.17,24.73,11.49,30.77,8.79,1-6.83,3.69-11.5,6.71-14.14C108.57,197.1,83.88,188,83.88,147.51a40.92,40.92,0,0,1,10.9-28.39c-1.1-2.66-4.72-13.42,1-28,0,0,8.88-2.84,29.09,10.84a100.26,100.26,0,0,1,53,0C198,88.3,206.9,91.14,206.9,91.14c5.76,14.56,2.14,25.32,1,28a40.87,40.87,0,0,1,10.89,28.39c0,40.62-24.74,49.56-48.29,52.18,3.79,3.28,7.17,9.71,7.17,19.58,0,14.15-.12,25.54-.12,29,0,2.82,1.9,6.11,7.26,5.07A105.76,105.76,0,0,0,151.36,47.28Z">
|
|
</path>
|
|
<path class="cls-12" d="M85.66,199.12c-.23.52-1.06.68-1.81.32s-1.2-1.06-.95-1.59,1.06-.69,1.82-.33,1.21,1.07.94,1.6Zm-1.3-1">
|
|
</path>
|
|
<path class="cls-12" d="M90,203.89c-.51.47-1.49.25-2.16-.49a1.61,1.61,0,0,1-.31-2.19c.52-.47,1.47-.25,2.17.49s.82,1.72.3,2.19Zm-1-1.08">
|
|
</path>
|
|
<path class="cls-12" d="M94.12,210c-.65.46-1.71,0-2.37-.91s-.64-2.07,0-2.52,1.7,0,2.36.89.65,2.08,0,2.54Zm0,0"></path>
|
|
<path class="cls-12" d="M99.83,215.87c-.58.64-1.82.47-2.72-.41s-1.18-2.06-.6-2.7,1.83-.46,2.74.41,1.2,2.07.58,2.7Zm0,0">
|
|
</path>
|
|
<path class="cls-12" d="M107.71,219.29c-.26.82-1.45,1.2-2.64.85s-2-1.34-1.74-2.17,1.44-1.23,2.65-.85,2,1.32,1.73,2.17Zm0,0">
|
|
</path>
|
|
<path class="cls-12" d="M116.36,219.92c0,.87-1,1.59-2.24,1.61s-2.29-.68-2.3-1.54,1-1.59,2.26-1.61,2.28.67,2.28,1.54Zm0,0">
|
|
</path>
|
|
<path class="cls-12" d="M124.42,218.55c.15.85-.73,1.72-2,1.95s-2.37-.3-2.52-1.14.73-1.75,2-2,2.37.29,2.53,1.16Zm0,0"></path>
|
|
</svg></a>
|
|
<a class="footer-icon" id="footer_blog" href="https://ncbiinsights.ncbi.nlm.nih.gov/" aria-label="Blog">
|
|
<svg xmlns="http://www.w3.org/2000/svg" id="Layer_1" data-name="Layer 1" viewBox="0 0 40 40">
|
|
<defs><style>.cls-1{fill:#737373;}</style></defs>
|
|
<title>NCBI Insights Blog</title>
|
|
<path class="cls-1" d="M14,30a4,4,0,1,1-4-4,4,4,0,0,1,4,4Zm11,3A19,19,0,0,0,7.05,15a1,1,0,0,0-1,1v3a1,1,0,0,0,.93,1A14,14,0,0,1,20,33.07,1,1,0,0,0,21,34h3a1,1,0,0,0,1-1Zm9,0A28,28,0,0,0,7,6,1,1,0,0,0,6,7v3a1,1,0,0,0,1,1A23,23,0,0,1,29,33a1,1,0,0,0,1,1h3A1,1,0,0,0,34,33Z"></path>
|
|
</svg>
|
|
</a>
|
|
</div>
|
|
</div>
|
|
</section>
|
|
|
|
<section class="container-fluid bg-primary">
|
|
<div class="container pt-5">
|
|
<div class="row mt-3">
|
|
<div class="col-lg-3 col-12">
|
|
<p><a class="text-white" href="https://www.nlm.nih.gov/socialmedia/index.html">Connect with NLM</a></p>
|
|
<ul class="list-inline social_media">
|
|
<li class="list-inline-item"><a href="https://twitter.com/NLM_NIH" aria-label="Twitter" target="_blank" rel="noopener noreferrer">
|
|
<svg xmlns="http://www.w3.org/2000/svg" width="35" height="35" viewBox="0 0 36 35" fill="none">
|
|
<title>Twitter</title>
|
|
<g id="twitterx1009" clip-path="url(#clip0_65276_3946)">
|
|
<path id="Vector_Twitter" d="M17.5006 34.6565C26.9761 34.6565 34.6575 26.9751 34.6575 17.4996C34.6575 8.02416 26.9761 0.342773 17.5006 0.342773C8.02514 0.342773 0.34375 8.02416 0.34375 17.4996C0.34375 26.9751 8.02514 34.6565 17.5006 34.6565Z" fill="#205493" stroke="white" stroke-width="1.0" stroke-miterlimit="10"></path>
|
|
<path id="path1009" d="M8.54811 8.5L16.2698 18.4279L8.50001 26.5H11.5L17.5 20L22.5486 26.5H28.5L20.5 16L27 8.5H24.5L19 14.5L14.5007 8.5H8.54927H8.54811ZM11.1197 9.73873H13.4519L25.4519 25.2613H23.1926L11.1197 9.73873Z" fill="white"></path>
|
|
</g>
|
|
<defs>
|
|
<clipPath id="clip0_65276_3946">
|
|
<rect width="35" height="35" fill="white"></rect>
|
|
</clipPath>
|
|
</defs>
|
|
</svg>
|
|
</a></li>
|
|
<li class="list-inline-item"><a href="https://www.facebook.com/nationallibraryofmedicine" aria-label="Facebook" rel="noopener noreferrer" target="_blank">
|
|
<svg xmlns="http://www.w3.org/2000/svg" width="35" height="35" viewBox="0 0 36 35" fill="none">
|
|
<title>Facebook</title>
|
|
<g id="Facebook" clip-path="url(#clip0_1717_1086)">
|
|
<path id="Vector_Facebook" d="M15.1147 29.1371C15.1147 29.0822 15.1147 29.0296 15.1147 28.9747V18.9414H11.8183C11.6719 18.9414 11.6719 18.9414 11.6719 18.8018C11.6719 17.5642 11.6719 16.3289 11.6719 15.0937C11.6719 14.9793 11.7062 14.9518 11.816 14.9518C12.8683 14.9518 13.9206 14.9518 14.9751 14.9518H15.1215V14.8329C15.1215 13.8057 15.1215 12.774 15.1215 11.7492C15.1274 10.9262 15.3148 10.1146 15.6706 9.37241C16.1301 8.38271 16.9475 7.60378 17.9582 7.19235C18.6492 6.90525 19.3923 6.76428 20.1405 6.7783C21.0029 6.79202 21.8653 6.83091 22.7278 6.86065C22.8879 6.86065 23.048 6.89496 23.2082 6.90182C23.2974 6.90182 23.3271 6.94071 23.3271 7.02993C23.3271 7.54235 23.3271 8.05477 23.3271 8.5649C23.3271 9.16882 23.3271 9.77274 23.3271 10.3767C23.3271 10.4819 23.2974 10.5139 23.1921 10.5116C22.5379 10.5116 21.8814 10.5116 21.2271 10.5116C20.9287 10.5184 20.6316 10.5528 20.3395 10.6146C20.0822 10.6619 19.8463 10.7891 19.6653 10.9779C19.4842 11.1668 19.3672 11.4078 19.3307 11.6669C19.2857 11.893 19.2612 12.1226 19.2575 12.3531C19.2575 13.1904 19.2575 14.0299 19.2575 14.8695C19.2575 14.8946 19.2575 14.9198 19.2575 14.9564H23.0229C23.1807 14.9564 23.183 14.9564 23.1624 15.1074C23.0778 15.7662 22.9885 16.425 22.9039 17.0816C22.8322 17.6321 22.7636 18.1827 22.698 18.7332C22.6729 18.9437 22.6797 18.9437 22.4693 18.9437H19.2644V28.8992C19.2644 28.9793 19.2644 29.0593 19.2644 29.1394L15.1147 29.1371Z" fill="white"></path>
|
|
<path id="Vector_2_Facebook" d="M17.5006 34.657C26.9761 34.657 34.6575 26.9756 34.6575 17.5001C34.6575 8.02465 26.9761 0.343262 17.5006 0.343262C8.02514 0.343262 0.34375 8.02465 0.34375 17.5001C0.34375 26.9756 8.02514 34.657 17.5006 34.657Z" stroke="white" stroke-width="1.0" stroke-miterlimit="10"></path>
|
|
</g>
|
|
<defs>
|
|
<clipPath id="clip0_1717_1086">
|
|
<rect width="35" height="35" fill="white"></rect>
|
|
</clipPath>
|
|
</defs>
|
|
</svg>
|
|
</a></li>
|
|
<li class="list-inline-item"><a href="https://www.youtube.com/user/NLMNIH" aria-label="Youtube" target="_blank" rel="noopener noreferrer">
|
|
<svg xmlns="http://www.w3.org/2000/svg" width="35" height="35" viewBox="0 0 36 35" fill="none">
|
|
<title>Youtube</title>
|
|
<g id="YouTube" clip-path="url(#clip0_1717_1101)">
|
|
<path id="Vector_Youtube" d="M26.2571 11.4791C25.9025 11.1589 25.5709 10.9576 24.228 10.834C22.5512 10.6785 20.2797 10.6556 18.564 10.6533H16.4365C14.7208 10.6533 12.4493 10.6785 10.7725 10.834C9.43196 10.9576 9.09798 11.1589 8.7434 11.4791C7.81464 12.321 7.6202 14.6268 7.59961 16.8938C7.59961 17.3178 7.59961 17.741 7.59961 18.1635C7.62706 20.4121 7.82837 22.686 8.7434 23.521C9.09798 23.8412 9.42967 24.0425 10.7725 24.1661C12.4493 24.3216 14.7208 24.3445 16.4365 24.3468H18.564C20.2797 24.3468 22.5512 24.3216 24.228 24.1661C25.5686 24.0425 25.9025 23.8412 26.2571 23.521C27.1722 22.6929 27.3735 20.451 27.4009 18.2206C27.4009 17.7402 27.4009 17.2599 27.4009 16.7795C27.3735 14.5491 27.1699 12.3072 26.2571 11.4791ZM15.5604 20.5311V14.652L20.561 17.5001L15.5604 20.5311Z" fill="white"></path>
|
|
<path id="Vector_2_Youtube" d="M17.5006 34.657C26.9761 34.657 34.6575 26.9756 34.6575 17.5001C34.6575 8.02465 26.9761 0.343262 17.5006 0.343262C8.02514 0.343262 0.34375 8.02465 0.34375 17.5001C0.34375 26.9756 8.02514 34.657 17.5006 34.657Z" stroke="white" stroke-width="1.0" stroke-miterlimit="10"></path>
|
|
</g>
|
|
<defs>
|
|
<clipPath id="clip0_1717_1101">
|
|
<rect width="35" height="35" fill="white"></rect>
|
|
</clipPath>
|
|
</defs>
|
|
</svg>
|
|
</a></li>
|
|
</ul>
|
|
</div>
|
|
<div class="col-lg-3 col-12">
|
|
<p class="address_footer text-white">National Library of Medicine<br />
|
|
<a href="https://www.google.com/maps/place/8600+Rockville+Pike,+Bethesda,+MD+20894/@38.9959508,-77.101021,17z/data=!3m1!4b1!4m5!3m4!1s0x89b7c95e25765ddb:0x19156f88b27635b8!8m2!3d38.9959508!4d-77.0988323" class="text-white" target="_blank" rel="noopener noreferrer">8600 Rockville Pike<br />
|
|
Bethesda, MD 20894</a></p>
|
|
</div>
|
|
<div class="col-lg-3 col-12 centered-lg">
|
|
<p><a href="https://www.nlm.nih.gov/web_policies.html" class="text-white">Web Policies</a><br />
|
|
<a href="https://www.nih.gov/institutes-nih/nih-office-director/office-communications-public-liaison/freedom-information-act-office" class="text-white">FOIA</a><br />
|
|
<a href="https://www.hhs.gov/vulnerability-disclosure-policy/index.html" class="text-white" id="vdp">HHS Vulnerability Disclosure</a></p>
|
|
</div>
|
|
<div class="col-lg-3 col-12 centered-lg">
|
|
<p><a class="supportLink text-white" href="https://support.nlm.nih.gov/">Help</a><br />
|
|
<a href="https://www.nlm.nih.gov/accessibility.html" class="text-white">Accessibility</a><br />
|
|
<a href="https://www.nlm.nih.gov/careers/careers.html" class="text-white">Careers</a></p>
|
|
</div>
|
|
</div>
|
|
<div class="row">
|
|
<div class="col-lg-12 centered-lg">
|
|
<nav class="bottom-links">
|
|
<ul class="mt-3">
|
|
<li>
|
|
<a class="text-white" href="//www.nlm.nih.gov/">NLM</a>
|
|
</li>
|
|
<li>
|
|
<a class="text-white" href="https://www.nih.gov/">NIH</a>
|
|
</li>
|
|
<li>
|
|
<a class="text-white" href="https://www.hhs.gov/">HHS</a>
|
|
</li>
|
|
<li>
|
|
<a class="text-white" href="https://www.usa.gov/">USA.gov</a>
|
|
</li>
|
|
</ul>
|
|
</nav>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</section>
|
|
<script type="text/javascript" src="/portal/portal3rc.fcgi/rlib/js/InstrumentOmnitureBaseJS/InstrumentNCBIConfigJS/InstrumentNCBIBaseJS/InstrumentPageStarterJS.js?v=1"> </script>
|
|
<script type="text/javascript" src="/portal/portal3rc.fcgi/static/js/hfjs2.js"> </script>
|
|
</div>
|
|
</div>
|
|
<!--/.footer-->
|
|
<p class="last-updated small">Last updated: 2024-03-08T13:58:42Z</p>
|
|
</div>
|
|
<!--/.page-->
|
|
</div>
|
|
<!--/.wrap-->
|
|
<span class="PAFAppResources"></span>
|
|
|
|
|
|
</div><!-- /.twelve_col -->
|
|
</div>
|
|
<!-- /.grid -->
|
|
|
|
|
|
|
|
<!-- usually for JS scripts at page bottom -->
|
|
<span class="pagefixtures"></span>
|
|
|
|
|
|
<!-- CE8B5AF87C7FFCB1_0191SID /projects/staticsites/genbank/genbank@2.21 portal104 v4.1.r689238 Tue, Oct 22 2024 16:10:51 -->
|
|
<span id="portal-csrf-token" style="display:none" data-token="CE8B5AF87C7FFCB1_0191SID"></span>
|
|
|
|
<script type="text/javascript" src="//static.pubmed.gov/portal/portal3rc.fcgi/4218137/js/3879255/4121861/1490097/4087685.js" snapshot="genbank"></script></body>
|
|
</html>
|
|
|