nih-gov/www.ncbi.nlm.nih.gov/Genbank/genomesubmit_annotation.html

1298 lines
71 KiB
HTML

<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<!-- AppResources meta begin -->
<meta name="paf-app-resources" content="" />
<!-- AppResources meta end -->
<!-- TemplateResources meta begin -->
<meta name="paf_template" content="StdNCol" />
<!-- TemplateResources meta end -->
<!-- Page meta begin -->
<!-- Page meta end -->
<!-- Logger begin -->
<meta xmlns:ncbi-portal="http://ncbi.gov/portal/XSLT/namespace" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" name="ncbi_app" content="genbank" /><meta xmlns:ncbi-portal="http://ncbi.gov/portal/XSLT/namespace" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" name="ncbi_pdid" content="custom-page" />
<!-- Logger end -->
<title>Prokaryotic Genome Annotation Guide</title>
<!-- PageFixtures headcontent begin -->
<meta name="cms-local-nav-url" content="https://cms.ncbi.nlm.nih.gov//genbank/_nav" />
<!-- PageFixtures headcontent end -->
<!-- AppResources external_resources begin -->
<script type="text/javascript" src="/core/jig/1.15.6/js/jig.min.js"></script>
<!-- AppResources external_resources end -->
<!-- Page headcontent begin -->
<meta name="subsite" content="genbank" />
<meta name="path" content="genbank/genomesubmit_annotation" />
<meta name="modified" content="2024-04-23T18:36:34Z" /><meta xmlns:ncbi-portal="http://ncbi.gov/portal/XSLT/namespace" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" name="cms-edit-aux-url" content="http://cms.ncbi.nlm.nih.gov/node//edit" />
<!-- Page headcontent end -->
<!-- PageFixtures resources begin -->
<link xmlns="http://www.w3.org/1999/xhtml" type="text/css" rel="stylesheet" href="//static.pubmed.gov/portal/portal3rc.fcgi/4218191/css/4207974/4206132.css" xml:base="http://127.0.0.1/sites/static/header_footer" />
<!-- PageFixtures resources end -->
<link rel="shortcut icon" href="//www.ncbi.nlm.nih.gov/favicon.ico" /><meta name="ncbi_phid" content="CE8B28407C8035B10000000000F000B2.m_5" />
<meta name='referrer' content='origin-when-cross-origin'/><link type="text/css" rel="stylesheet" href="//static.pubmed.gov/portal/portal3rc.fcgi/4218137/css/4121862/3974050/3917732/251717/4108189/14534/45193/3534283/4128070/3407145/4005757/4062871.css" /><link type="text/css" rel="stylesheet" href="//static.pubmed.gov/portal/portal3rc.fcgi/4218137/css/3529741/3529739.css" media="print" /></head>
<body class=" col2 custom-page">
<div class="grid">
<div class="col twelve_col nomargin shadow">
<!-- System messages like service outage or JS required; this is handled by the TemplateResources portlet -->
<div class="sysmessages">
<noscript>
<p class="nojs">
<strong>Warning:</strong>
The NCBI web site requires JavaScript to function.
<a href="/guide/browsers/#enablejs" title="Learn how to enable JavaScript" target="_blank">more...</a>
</p>
</noscript>
</div>
<!--/.sysmessage-->
<div class="wrap">
<div class="page">
<div xmlns:xi="http://www.w3.org/2001/XInclude">
<div xmlns="http://www.w3.org/1999/xhtml" id="universal_header" xml:base="http://127.0.0.1/sites/static/header_footer">
<section class="usa-banner">
<div class="usa-accordion">
<header class="usa-banner-header">
<div class="usa-grid usa-banner-inner">
<img src="https://www.ncbi.nlm.nih.gov/coreutils/uswds/img/favicons/favicon-57.png" alt="U.S. flag" />
<p>An official website of the United States government</p>
<button class="non-usa-accordion-button usa-banner-button" aria-expanded="false" aria-controls="gov-banner-top" type="button">
<span class="usa-banner-button-text">Here's how you know</span>
</button>
</div>
</header>
<div class="usa-banner-content usa-grid usa-accordion-content" id="gov-banner-top" aria-hidden="true">
<div class="usa-banner-guidance-gov usa-width-one-half">
<img class="usa-banner-icon usa-media_block-img" src="https://www.ncbi.nlm.nih.gov/coreutils/uswds/img/icon-dot-gov.svg" alt="Dot gov" />
<div class="usa-media_block-body">
<p>
<strong>The .gov means it's official.</strong>
<br />
Federal government websites often end in .gov or .mil. Before
sharing sensitive information, make sure you're on a federal
government site.
</p>
</div>
</div>
<div class="usa-banner-guidance-ssl usa-width-one-half">
<img class="usa-banner-icon usa-media_block-img" src="https://www.ncbi.nlm.nih.gov/coreutils/uswds/img/icon-https.svg" alt="Https" />
<div class="usa-media_block-body">
<p>
<strong>The site is secure.</strong>
<br />
The <strong>https://</strong> ensures that you are connecting to the
official website and that any information you provide is encrypted
and transmitted securely.
</p>
</div>
</div>
</div>
</div>
</section>
<div class="usa-overlay"></div>
<header class="ncbi-header" role="banner" data-section="Header">
<div class="usa-grid">
<div class="usa-width-one-whole">
<div class="ncbi-header__logo">
<a href="/" class="logo" aria-label="NCBI Logo" data-ga-action="click_image" data-ga-label="NIH NLM Logo">
<img src="https://www.ncbi.nlm.nih.gov/coreutils/nwds/img/logos/AgencyLogo.svg" alt="NIH NLM Logo" />
</a>
</div>
<div class="ncbi-header__account">
<a id="account_login" href="https://account.ncbi.nlm.nih.gov" class="usa-button header-button" style="display:none" data-ga-action="open_menu" data-ga-label="account_menu">Log in</a>
<button id="account_info" class="header-button" style="display:none" aria-controls="account_popup" type="button">
<span class="fa fa-user" aria-hidden="true">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="20px" height="20px">
<g style="fill: #fff">
<ellipse cx="12" cy="8" rx="5" ry="6"></ellipse>
<path d="M21.8,19.1c-0.9-1.8-2.6-3.3-4.8-4.2c-0.6-0.2-1.3-0.2-1.8,0.1c-1,0.6-2,0.9-3.2,0.9s-2.2-0.3-3.2-0.9 C8.3,14.8,7.6,14.7,7,15c-2.2,0.9-3.9,2.4-4.8,4.2C1.5,20.5,2.6,22,4.1,22h15.8C21.4,22,22.5,20.5,21.8,19.1z"></path>
</g>
</svg>
</span>
<span class="username desktop-only" aria-hidden="true" id="uname_short"></span>
<span class="sr-only">Show account info</span>
</button>
</div>
<div class="ncbi-popup-anchor">
<div class="ncbi-popup account-popup" id="account_popup" aria-hidden="true">
<div class="ncbi-popup-head">
<button class="ncbi-close-button" data-ga-action="close_menu" data-ga-label="account_menu" type="button">
<span class="fa fa-times">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 48 48" width="24px" height="24px">
<path d="M38 12.83l-2.83-2.83-11.17 11.17-11.17-11.17-2.83 2.83 11.17 11.17-11.17 11.17 2.83 2.83 11.17-11.17 11.17 11.17 2.83-2.83-11.17-11.17z"></path>
</svg>
</span>
<span class="usa-sr-only">Close</span></button>
<h4>Account</h4>
</div>
<div class="account-user-info">
Logged in as:<br />
<b><span class="username" id="uname_long">username</span></b>
</div>
<div class="account-links">
<ul class="usa-unstyled-list">
<li><a id="account_myncbi" href="/myncbi/" class="set-base-url" data-ga-action="click_menu_item" data-ga-label="account_myncbi">Dashboard</a></li>
<li><a id="account_pubs" href="/myncbi/collections/bibliography/" class="set-base-url" data-ga-action="click_menu_item" data-ga-label="account_pubs">Publications</a></li>
<li><a id="account_settings" href="/account/settings/" class="set-base-url" data-ga-action="click_menu_item" data-ga-label="account_settings">Account settings</a></li>
<li><a id="account_logout" href="/account/signout/" class="set-base-url" data-ga-action="click_menu_item" data-ga-label="account_logout">Log out</a></li>
</ul>
</div>
</div>
</div>
</div>
</div>
</header>
<div role="navigation" aria-label="access keys">
<a id="nws_header_accesskey_0" href="https://www.ncbi.nlm.nih.gov/guide/browsers/#ncbi_accesskeys" class="usa-sr-only" accesskey="0" tabindex="-1">Access keys</a>
<a id="nws_header_accesskey_1" href="https://www.ncbi.nlm.nih.gov" class="usa-sr-only" accesskey="1" tabindex="-1">NCBI Homepage</a>
<a id="nws_header_accesskey_2" href="/myncbi/" class="set-base-url usa-sr-only" accesskey="2" tabindex="-1">MyNCBI Homepage</a>
<a id="nws_header_accesskey_3" href="#maincontent" class="usa-sr-only" accesskey="3" tabindex="-1">Main Content</a>
<a id="nws_header_accesskey_4" href="#" class="usa-sr-only" accesskey="4" tabindex="-1">Main Navigation</a>
</div>
<section data-section="Alerts">
<div class="ncbi-alerts-placeholder"></div>
</section>
</div>
</div>
<!--/.header-->
<div class="header">
<div class="res_logo"><h1 class="res_name"><a href="/genbank/" title="GenBank home">GenBank</a></h1><h2 class="res_tagline">Public nucleic acid sequence repository</h2></div>
<div class="search"><form method="get" action="/nuccore/"><div class="search_form"><label for="database" class="offscreen_noflow">Search database</label><select id="database"><optgroup label="Recent"><option value="nuccore" selected="selected">Nucleotide</option><option value="books">Books</option><option value="gene">Gene</option><option value="snp" class="last">SNP</option></optgroup><optgroup label="All"><option value="gquery">All Databases</option><option value="assembly">Assembly</option><option value="biocollections">Biocollections</option><option value="bioproject">BioProject</option><option value="biosample">BioSample</option><option value="books">Books</option><option value="clinvar">ClinVar</option><option value="cdd">Conserved Domains</option><option value="gap">dbGaP</option><option value="dbvar">dbVar</option><option value="gene">Gene</option><option value="genome">Genome</option><option value="gds">GEO DataSets</option><option value="geoprofiles">GEO Profiles</option><option value="gtr">GTR</option><option value="ipg">Identical Protein Groups</option><option value="medgen">MedGen</option><option value="mesh">MeSH</option><option value="nlmcatalog">NLM Catalog</option><option value="nuccore">Nucleotide</option><option value="omim">OMIM</option><option value="pmc">PMC</option><option value="protein">Protein</option><option value="proteinclusters">Protein Clusters</option><option value="protfam">Protein Family Models</option><option value="pcassay">PubChem BioAssay</option><option value="pccompound">PubChem Compound</option><option value="pcsubstance">PubChem Substance</option><option value="pubmed">PubMed</option><option value="snp">SNP</option><option value="sra">SRA</option><option value="structure">Structure</option><option value="taxonomy">Taxonomy</option><option value="toolkit">ToolKit</option><option value="toolkitall">ToolKitAll</option><option value="toolkitbookgh">ToolKitBookgh</option></optgroup></select><div class="nowrap"><label for="term" 
class="offscreen_noflow" accesskey="/">Search term</label><div class="nowrap"><input type="text" name="term" id="term" title="Search Nucleotide" value="" class="jig-ncbiclearbutton jig-ncbiautocomplete" data-jigconfig="isEnabled:false,disableUrl:'NcbiSearchBarAutoComplCtrl'" autocomplete="off" data-sbconfig="ds:'no',pjs:'no',afs:'yes'" /></div><button id="search" type="submit" class="button_search nowrap" cmd="go">Search</button></div></div></form></div>
</div>
<div class="nav_and_browser">
<div class="localnav"><ul class="jig-ncbilocalnav">
<li><a href="#">GenBank</a><ul>
<li><a href="/genbank/">About GenBank</a></li>
<li><a href="/genbank/submit_types">Submission Types</a></li>
<li><a href="/genbank/submit">Submission Tools</a></li>
<li><a href="/genbank/update">Update GenBank Records</a></li>
<li><a href="/nuccore/">Search</a></li>
<li><a href="/BLAST/Blast.cgi?CMD=Web&amp;PAGETYPE=BLASTHome">BLAST</a></li>
<li><a href="/genbank/statistics">Statistics</a></li>
<li><a href="/genbank/samplerecord/">Sample Record</a></li>
<li><a href="/genbank/sequencerevisionhistory/">Revision History</a></li>
<li><a href="/genbank/sequenceids/">Sequence IDs</a></li>
</ul>
</li>
<li><a href="#">Submit</a><ul>
<li><a href="/genbank/submit">Submission Tools</a></li>
<li><a href="/genbank/submit_types">Submission Types</a></li>
<li><a href="/WebSub/?tool=genbank">BankIt</a></li>
<li><a href="/genbank/table2asn">table2asn</a></li>
<li><a href="https://www.ncbi.nlm.nih.gov/sra/docs/sequence-data-processing">Sequence Data Processing</a></li>
</ul>
</li>
<li><a href="#">Genomes</a><ul>
<li><a href="/genbank/genomesubmit">Complete Genome Submission Guide</a></li>
<li><a href="/genbank/genomesubmit_annotation">Prokaryotic Genome Annotation Guide</a></li>
<li><a href="/genbank/eukaryotic_genome_submission_annotation">Eukaryotic Genome Annotation Guide</a></li>
<li><a href="/genbank/examples.wgs">Annotation Examples</a></li>
<li><a href="https://submit.ncbi.nlm.nih.gov/subs/wgs/">Genome Submission Portal</a></li>
</ul>
</li>
<li><a title="Whole Genome Shotgun sequences and submissions" href="#">WGS</a><ul>
<li><a href="/genbank/wgs">About WGS</a></li>
<li><a href="/Traces/wgs">WGS Project List</a></li>
<li><a href="/genbank/wgs.submit">WGS Submission Guide</a></li>
<li><a href="/genbank/wgsfaq/">FAQ</a></li>
<li><a href="https://submit.ncbi.nlm.nih.gov/subs/wgs/">Genome Submission Portal</a></li>
<li><a href="/genbank/eukaryotic_genome_submission_annotation">Eukaryotic Annotation Guide</a></li>
<li><a href="/genbank/genomesubmit_annotation">Prokaryotic Annotation Guide</a></li>
<li><a href="/genbank/asndisc">Discrepancy Report</a></li>
<li><a href="/assembly/agp/AGP_Specification/">AGP format</a></li>
</ul>
</li>
<li><a href="#">Metagenomes</a><ul>
<li><a href="/genbank/metagenome">About Metagenomes</a></li>
<li><a href="/genbank/structuredcomment">Structured Comment</a></li>
</ul>
</li>
<li><a href="#">TPA</a><ul>
<li><a href="/genbank/TPA">About TPA</a></li>
<li><a href="/genbank/tpafaq">FAQ</a></li>
<li><a href="/genbank/TPA-Exp">TPA-Exp</a></li>
<li><a href="/genbank/TPA-Inf">TPA-Inf</a></li>
</ul>
</li>
<li><a href="#">TSA</a><ul>
<li><a href="/genbank/TSA">About TSA</a></li>
<li><a href="/genbank/TSAguide">TSA Submission Guide</a></li>
<li><a href="/genbank/TSAfaq">FAQ</a></li>
</ul>
</li>
<li><a href="#">INSDC</a><ul>
<li><a href="/genbank/collab">About INSDC</a></li>
<li><a href="/genbank/collab/country">Geographic Location Name List</a></li>
<li><a href="/genbank/collab/db_xref">db_xref List</a></li>
<li><a href="http://www.insdc.org/documents/feature_table.html">Feature Table</a></li>
</ul>
</li>
<li><a href="#">Documentation</a><ul>
<li><a href="https://www.ncbi.nlm.nih.gov/sra/docs/sequence-data-processing/">Sequence Data Processing</a></li>
<li><a href="/genbank/submission_brokers">Submission Brokers</a></li>
<li><a href="/genbank/acc_prefix">Accession Number Prefixes</a></li>
<li><a href="/genbank/organelle_submit/">Organelle Submission Guide</a></li>
<li><a href="/genbank/monkeypox_submission/">Monkeypox Submission Guide</a></li>
<li><a href="/genbank/validation/">Common Submission Errors</a> </li>
<li><a href="/genbank/sequencecheck/">Ribosomal Submission Errors</a></li>
<li><a href="/genbank/sequencecheck/virus">Common Sequence Errors</a></li>
<li><a href="https://support.nlm.nih.gov/knowledgebase/category/?id=CAT-01240">Submission FAQs</a></li>
</ul>
</li>
<li><a href="#">Other</a><ul>
<li><a href="/genbank/htgs">About HTGs</a></li>
<li><a href="/genbank/dbest">About EST</a></li>
<li><a href="/genbank/dbgss">About GSS</a></li>
<li><a href="/genbank/tls">About TLS</a></li>
<li><a href="/genbank/tlsguide">Submit TLS</a></li>
</ul>
</li>
</ul></div>
</div>
<!-- was itemctrl -->
<div class="container">
<div id="maincontent" class="content col twelve_col last">
<div class="col1">
<h1 id="prokaryotic-genome-annotation-gu">Prokaryotic Genome Annotation Guide</h1>
<h2 id="id1353477">Annotation</h2>
<p>table2asn (the replacement of tbl2asn) uses a simple five-column tab-delimited table of feature locations and qualifiers in order to generate annotation.</p>
<p>The format of this feature table allows different kinds of features (e.g. gene, coding region, tRNA, repeat_region) and qualifiers (e.g. /product, /note) to be indicated. The validator will check for errors such as internal stops in coding regions.</p>
<p>Guidelines for <a href="/genbank/eukaryotic_genome_submission">eukaryotic genome submissions</a>.</p>
<p>If you do not understand any of the instructions presented here or you have questions, please contact us by email at <a href="mailto:genomes@ncbi.nlm.nih.gov">genomes@ncbi.nlm.nih.gov</a> prior to creating your submission. This will save us both a lot of time.</p>
<h2 id="table-of-contents">Table of Contents</h2>
<ol>
<li><a href="#prepare_table">Prepare annotation table</a><ul>
<li><a href="#Gene_features">Gene features</a></li>
<li><a href="#locus_tag">locus_tag is required</a></li>
<li><a href="#protein_id">protein_id</a></li>
<li><a href="#CDS">CDS (coding region) features and protein names</a></li>
<li><a href="#selenocysteine">Selenocysteine-containing coding regions</a></li>
<li><a href="#intein">Intein-containing coding regions</a></li>
<li><a href="#partial_CDS">Partial coding regions in incomplete genomes</a></li>
<li><a href="#disrupted_genes">Gene fragments</a></li>
<li><a href="#introns">Intron Containing Genes</a></li>
<li><a href="#transpliced">Transpliced Genes</a></li>
<li><a href="#Split_genes">Split genes on two contigs</a></li>
<li><a href="#RNA">Ribosomal RNA, tRNA and other RNA features</a></li>
<li><a href="#Evidence_Qualifiers">Evidence Qualifiers</a></li>
<li><a href="#bacteriophage">Functional bacteriophage</a></li>
<li><a href="#transposons">Insertion sequences and transposons</a></li>
<li><a href="#db_xref">Database cross-references</a></li>
<li><a href="#GOterms">Gene Ontology</a></li>
<li><a href="#variation">Variation</a></li>
<li><a href="#Other">Other Annotation</a></li>
</ul>
</li>
</ol>
<h2 id="prepare_table">Prepare annotation table</h2>
<p>The features must be in a simple five-column tab-delimited table, called the feature table. The feature table specifies the location and type of each feature for table2asn (previously tbl2asn) to include in the GenBank submission that is created. The first line of the table contains the following basic information:</p>
<pre><code>&gt;Features SeqId table_name
</code></pre>
<p>The SeqId must be the same as that used on the sequence. The table_name is optional. Subsequent lines of the table list the features. Columns are separated by tabs.</p>
<ul>
<li>Column 1: Start location of feature</li>
<li>Column 2: Stop location of feature</li>
<li>Column 3: Feature key</li>
<li>Column 4: Qualifier key</li>
<li>Column 5: Qualifier value</li>
</ul>
<p><a href="/genbank/genomesubmit-examples#fig2">Figure 2</a> shows a sample feature table and illustrates a number of points about the feature table format. The GenBank flatfile corresponding to this table is shown in <a href="/genbank/genomesubmit-examples#fig3">Figure 3</a>.</p>
<p>Features that are on the complementary strand (such as the gene abrB and its corresponding CDS in the examples below) are indicated by reversing the interval locations. Please avoid unnecessary capitalization in all text entered in your table.</p>
<h4 id="Gene_features">Gene features</h4>
<p>Gene features are usually a single interval, and their location should cover the intervals of all the relevant features such as promoters and operator binding sites. Gene names must follow the standard bacterial nomenclature rules of three lower case letters. Different loci are distinguished by a suffix of uppercase letters.</p>
<h4 id="id1353883">Example</h4>
<pre><code>correct cytB
incorrect CYTB
incorrect cytochrome B
incorrect orf1
incorrect putative gene fragment
</code></pre>
<p>If a gene is a pseudogene, please do not add the word "pseudo" to the gene name or protein name. Instead use the /pseudogene qualifier and <a href="https://www.insdc.org/documents/pseudogene-qualifier-vocabulary">appropriate value qualifier</a> on the gene feature. Please see <a href="#disrupted_genes">Gene fragments</a> for more details.</p>
<h4 id="locus_tag">locus_tag</h4>
<p>All genes must be assigned a systematic gene identifier which must receive the locus_tag qualifier on the gene feature in the annotation table. Genes may also have functional names as assigned in the scientific literature. In this example, OBB_0001 is the systematic gene identifier, while abcD is the functional gene name. We recommend having the BioProject registration process auto-assign a locus_tag prefix, as they are not meant to confer meaning. The locus_tag prefix must be 3-12 alphanumeric characters and the first character may not be a digit. Additionally, locus_tag prefixes are case sensitive. All chromosomes and plasmids of an individual genome must use exactly the same locus_tag prefix followed by an underscore and then an alphanumeric identification number that is unique within the given genome. Other than the single underscore used to separate the prefix from the identification number no other special characters can be used in the locus_tag. Locus_tags must only be used in combination with a gene feature. Read more about <a href="//www.ncbi.nlm.nih.gov/genomes/locustag/Proposal.pdf">locus_tags</a> and their intended usage.</p>
<h4 id="id1353951">Table view of gene with both biological name and locus_tag</h4>
<pre><code>1 1575 gene
gene abcD
locus_tag OBB_0001
</code></pre>
<h4 id="id1353963">Flatfile view</h4>
<pre><code>gene 1..1575
/gene="abcD"
/locus_tag="OBB_0001"
</code></pre>
<h4 id="id1353975">Table view of gene with only locus_tag:</h4>
<pre><code>1 1575 gene
locus_tag OBB_0001
</code></pre>
<h4 id="id1353987">Flatfile view:</h4>
<pre><code>gene 1..1575
/locus_tag="OBB_0001"
</code></pre>
<h4 id="protein_id">protein_id</h4>
<p>The submitter must assign an identification number to all proteins. NCBI uses this number to track proteins when sequences are updated. This number is indicated in the table by the CDS qualifier protein_id, and must have the format gnl|dbname|string, where dbname is a version of your lab name that you think will be unique (eg SmithUCSD), and string is the unique protein SeqID assigned by the submitter. We recommend using the locus_tag number as the protein identification number.</p>
<p>Because the protein_id is used for internal tracking in our database, it is important that the complete protein_id (dbname + string) not be duplicated by a genome center. Note that when WGS submissions are processed, the dbname in the protein_id is automatically changed to 'WGS:XXXX', where XXXX is the project's accession number prefix. After your genome is released into GenBank, the proteins are assigned accession numbers. We will provide a table of the protein SeqIDs and accession numbers for you to use in future <a href="/genbank/genomesubmit#updating">updates</a>.</p>
<h5 id="example">Example</h5>
<p>The protein_id is saved with the record (in ASN.1 format), but it is not visible in the flatfile.</p>
<pre><code>1 1575 gene
gene abcD
locus_tag OBB_0001
1 1575 CDS
product enolase
protein_id gnl|SmithUCSD|OBB_0001
</code></pre>
<h4 id="CDS">CDS (coding region) features and protein names</h4>
<p>All CDS features must have a product qualifier (protein name). NCBI protein naming conventions are adopted from the <a href="https://www.ncbi.nlm.nih.gov/genome/doc/internatprot_nomenguide/">International Protein Nomenclature Guidelines</a>.</p>
<p>Consistent nomenclature is indispensable for communication, literature searching and data retrieval. Many species-specific communities have established gene nomenclature committees that try to assign consistent and, if possible, meaningful gene symbols. Other scientific communities have established protein nomenclatures for a set of proteins based on sequence similarity and/or function. But there is no established organization involved in the standardization of protein names, nor are there any efforts to establish naming rules that are valid across the largest spectrum of species possible.</p>
<p>Ambiguities regarding gene/protein names are a major problem in the literature and it is even worse in the sequence databases which tend to propagate the confusion. For this reason, we ask that you follow some basic guidelines in naming your proteins. The protein naming guidelines are based on the premise that a good and stable recommended name for a protein is a name that is as neutral as possible.</p>
<p>Guidelines for naming proteins:</p>
<ul>
<li>If it exists, use the approved nomenclature.</li>
<li>Use a concise name, not a description or phrase.</li>
<li>Ideally the name should be unique and attributed to all orthologs.</li>
<li>In cases where the protein name is not known use "hypothetical protein" or "uncharacterized protein" as the product name.</li>
<li>The protein name should not reflect the protein's subcellular location, its molecular weight or its species of origin. This information can be included in the note.</li>
<li>For proteins that belong to a multigene family, it is recommended that you choose a coherent nomenclature with numbers to specify the different members of the family.</li>
<li>When naming proteins which can be grouped into a family based on homology or according to a notion of shared function, the different members should be enumerated with a dash "-" followed by an Arabic number. e.g. "desmoglein-1", "desmoglein-2", etc.</li>
<li>Proteins of unknown function which contain a defined domain or motif, can be named according to the domain present. The name should be of the following type: "&lt;domain|repeat&gt;-containing protein". e.g. "PAS domain-containing protein 5".</li>
<li>Protein names may be denoted by the same symbol as the corresponding gene, but the symbol begins with a capital letter.</li>
<li>Greek letters must be written in full e.g. "alpha", and written entirely in lower case with the exception of "Delta" in the context of steroid/fatty acid metabolism nomenclature. Additionally the Greek letters that are followed by a number should be preceded or followed by a dash "-" e.g. "unicornase alpha-1".</li>
<li>Use lowercase letters, except when uppercase are required (for example, in acronyms such as DNA or ATP).</li>
<li>Wherever appropriate, the name should use American spelling conventions.</li>
<li>Avoid the use of molecular weights in protein names: "unicornase subunit A" is preferred to "unicornase 52 kDa subunit".</li>
<li>Avoid the term "homolog" in a protein name, as this implies an evolutionary relationship that has generally not been determined.</li>
<li>Avoid the use of commas in protein names whenever possible.</li>
<li>Avoid the use of Roman numerals where possible. Use Arabic numbers instead.</li>
<li>Do not build molecular weights into abbreviations</li>
<li>Do not use diacritics, such as accents, umlauts. Many computer systems (ours included) can only understand ASCII characters.</li>
<li>Do not use plurals in a protein name. e.g. "ankyrin repeats-containing protein 8" is wrong.</li>
</ul>
<p>Here are some examples of good protein names:</p>
<dl>
<dt><strong>cytochrome b</strong></dt>
<dt><strong>CytB</strong></dt>
<dt><strong>aconitate hydrase B</strong></dt>
<dt><strong>hypothetical protein</strong></dt>
<dt><strong>cytochrome b-like protein</strong></dt>
<dt><strong>4Fe-4S cluster binding protein</strong></dt>
<dt><strong>adenylyltransferase/ADP-heptose synthase</strong></dt>
<dt><strong>2-hydroxyhepta-2,4-diene-1,7-dioate isomerase</strong></dt>
<dt><strong>short-chain specific acyl-CoA dehydrogenase</strong></dt>
<dt><strong>formylmethanofuran--tetrahydromethanopterin formyltransferase</strong></dt>
<dt><strong>serine/threonine-protein kinase</strong></dt>
<dt><strong>translation initiation factor 1</strong></dt>
<dt><strong>triphosphoribosyl-dephospho-CoA synthetase</strong></dt>
<dt><strong>thiamine biosynthesis protein ThiC</strong></dt>
<dt><strong>PAS domain-containing protein 5</strong></dt>
<dt><strong>ABC transporter ATP-binding protein AlbC</strong></dt>
<dt><strong>stage 0 sporulation protein J</strong></dt>
<dd><em>These names all concisely describe the function of the protein, where known, and avoid references to structure, homology and species.</em></dd>
</dl>
<p>Here are some examples of bad protein names:</p>
<dl>
<dt><strong>required for the efficient incorporation of molybdate into molybdoproteins</strong></dt>
<dd><em>This describes the protein's role in a biosynthetic process but is not a protein name.</em></dd>
<dt><strong>chaperone Hsp70; DNA biosynthesis; autoregulated heat shock proteins</strong></dt>
<dd><em>The name "chaperone Hsp70" is fine however the remaining comments would be best fielded as a note or in the function qualifier.</em></dd>
<dt><strong>putative carbonic anhydrase (EC 4.2.1.1)</strong></dt>
<dd><em>The EC number should not be part of the protein name but instead fielded in the EC_number qualifier</em></dd>
<dt><strong>similar to aconitate hydrase B</strong></dt>
<dd><em>This statement is fine as a note; however, as a protein name "aconitate hydrase B-like protein" is preferred.</em></dd>
<dt><strong>related to protein of unknown function</strong></dt>
<dd><em>uninformative name</em></dd>
<dt><strong>cytochrome b-like</strong></dt>
<dd><em>cytochrome b-like protein is preferred</em></dd>
<dt><strong>ABC transporter related</strong></dt>
<dd><em>vague name, there are many ABC transporters and subunits, be more specific. "ABC transporter-related protein" would be acceptable but a more specific name would be better, if possible.</em></dd>
<dt><strong>pirin, N-terminal:pirin, C-terminal</strong></dt>
<dd><em>uninformative name noting similarity in N and C terminus</em></dd>
<dt><strong>helix-turn-helix motif</strong></dt>
<dd><em>Describes a motif or structural domain but is not an appropriate protein name.</em></dd>
<dt><strong>PP-loop</strong></dt>
<dd><em>Describes a motif or structural domain but is not an appropriate protein name.</em></dd>
<dt><strong>alpha/beta hydrolase fold</strong></dt>
<dd><em>Describes a motif or structural domain but is not an appropriate protein name.</em></dd>
<dt><strong>pentapeptide repeat</strong></dt>
<dd><em>Describes a motif or structural domain but is not an appropriate protein name.</em></dd>
<dt><strong>phosphopantetheine-binding domain</strong></dt>
<dd><em>Describes a motif or structural domain but is not an appropriate protein name.</em></dd>
<dt><strong>protein of unknown function:conserved</strong></dt>
<dd><em>uninformative name</em></dd>
<dt><strong>hypothetical 32.5 kDa protein homologous to phytoene and squalene synthetases</strong></dt>
<dd><em>Hypothetical protein alone is appropriate. The remaining comments should be fielded as a note.</em></dd>
<dt><strong>ribosomal protein L3 (E. coli)</strong></dt>
<dd><em>Protein names should not contain references to organism names. Ribosomal protein L3 is an appropriate name by itself.</em></dd>
<dt><strong>saccharopine dehydrogenase or related protein</strong></dt>
<dd><em>"saccharopine dehydrogenase" or "saccharopine dehydrogenase-like protein" would be more appropriate</em></dd>
<dt><strong>tyrosine-protein kinase (capsular polysaccharide biosynthesis)</strong></dt>
<dd><em>tyrosine-protein kinase is fine as a protein name but capsular polysaccharide biosynthesis would be more appropriate as a function.</em></dd>
<dt><strong>RimM protein, required for 16S rRNA processing</strong></dt>
<dd><em>RimM is fine as a protein name but descriptive comments should be placed in the note.</em></dd>
<dt><strong>involved in flagellar biosynthesis</strong></dt>
<dd><em>This is a functional comment and not a protein name.</em></dd>
</dl>
<h4 id="id1354410">Notes</h4>
<p>Please avoid including notes indicating a specific percentage of similarity to other entries in the database, since the corresponding record that you have pointed to may change and make your current note inaccurate, incorrect and obsolete. Descriptions, notes describing similarity to other proteins, and functional comments must be placed in the appropriate CDS qualifiers such as note, or prot_desc, as they are descriptors of the product. E.C. numbers must be fielded in an EC_number qualifier.</p>
<pre><code>start stop CDS
product DNA gyrase subunit B
EC_number 5.99.1.3
note required for the gyration of DNA
</code></pre>
<p>Qualifiers that can be used on the CDS feature are:</p>
<pre><code>start stop CDS
product
prot_desc
function
EC_number
note
experiment
inference
go_component
go_process
go_function
db_xref
pseudo
exception
transl_except
</code></pre>
<h4 id="id1354431">Bifunctional Proteins</h4>
<p>If a protein contains two separate and distinct functions or if it has more than one name, it can be annotated in several ways as outlined below.</p>
<p>Table view:</p>
<pre><code>start stop CDS
product adenylyltransferase/ADP-heptose synthase
note bifunctional
EC_number 2.7.7.2
EC_number 1.4.1.13
</code></pre>
<p>or</p>
<pre><code>start stop CDS
product bifunctional adenylyltransferase/ADP-heptose synthase cyclohydrolase
EC_number 2.7.7.2
EC_number 1.4.1.13
</code></pre>
<p>or</p>
<pre><code>start stop CDS
product FolD
function adenylyltransferase
function ADP-heptose synthase cyclohydrolase
note bifunctional
EC_number 2.7.7.2
EC_number 1.4.1.13
</code></pre>
<h4 id="selenocysteine">Selenocysteine-containing coding regions</h4>
<p>To annotate a selenocysteine, include a transl_except qualifier with the nucleotide location of the codon plus the amino acid 'Sec', like this:</p>
<pre><code>1790 3187 CDS
product selenoprotein
transl_except (pos:1817..1819,aa:Sec)
protein_id gnl|mycenter|ABC_0216437
</code></pre>
<p>where the selenocysteine-coding codon is at nt1817-1819. Use the nucleotide location, not the amino acid number.</p>
<p>If the CDS is on the minus strand you still indicate the codon's location from 5' to 3'. Therefore, the codon is included as nt9395-9393 in this example:</p>
<pre><code>9422 8208 CDS
product hypothetical protein
transl_except (pos:9395..9393,aa:Sec)
protein_id gnl|mycenter|ACC_0216440
</code></pre>
<h4 id="intein">Intein-containing coding regions</h4>
<p>Intein-containing coding regions must be represented as follows:</p>
<pre><code>946506 950039 gene
gene recA
locus_tag OBB_0010
946506 950039 CDS
product DNA recombination protein precursor
protein_id gnl|dbname|OBB_0010
946506 946790 misc_feature
948057 950036
note DNA recombination protein
946791 948056 misc_feature
note intein
</code></pre>
<p>Inteins should be annotated with two mat_peptide features, one for the intein and one for the final protein. We also add "precursor" to the product name on the CDS feature. Unfortunately, you cannot add a mat_peptide feature in a table. Instead, you can add misc_feature features and we can convert them for you. Please see accession number <a href="//www.ncbi.nlm.nih.gov/nuccore/AY847267">AY847267</a> for an example of an intein-containing protein.</p>
<h4 id="partial_CDS">Partial coding regions in incomplete genomes</h4>
<p>Annotate a partial coding region using the "&lt;" or "&gt;" in your feature table to designate the feature as either 5' or 3' partial. The coding region must begin at the first nucleotide present; however, the translation will start at the first complete codon.</p>
<p><span> NOTE: Partial coding regions are only allowed in prokaryotes at the end of a sequence or abutting a gap. Internal coding regions must begin with a start codon and end with a stop codon. </span></p>
<h5 id="examples">Examples:</h5>
<p>In the first example below, the "&lt;" designates this coding region as 5' partial and "codon_start 3" tells the software to start translation with the third nucleotide of the CDS. Note that if the codon_start is not specified, then the software assumes a codon_start of 1. The second coding region below is partial at the 3' end so "&gt;" is used to indicate a 3' partial feature. The third example is of a 3' partial coding region on the complementary or minus strand.</p>
<pre><code>&lt;1 497 gene
gene abcD
locus_tag OBB_0001
&lt;1 497 CDS
product AbcD
note similar to Bacillus subtilis aldolase
codon_start 3
protein_id gnl|dbname|OBB_0001
200 &gt;1575 gene
gene xyzA
locus_tag OBB_0002
200 &gt;1575 CDS
product actin-like protein
protein_id gnl|dbname|OBB_0002
436 &gt;1 gene
gene nirK
locus_tag OBB_0003
436 &gt;1 CDS
product NirK
protein_id gnl|dbname|OBB_0003
</code></pre>
<p>Here are <a href="/genbank/examples.wgs#partialcds">more examples of formatting partial CDS features</a>.</p>
<h4 id="disrupted_genes">Disrupted genes and gene fragments</h4>
<p>Sometimes a genome will have adjacent or nearby genes that seem to be only part of a protein. In many cases these indicate a possible problem with the sequence and/or annotation. A related issue is the presence of internal stop codons in the conceptual translation of a CDS that looks like it should be a real CDS. These problems may be due to a variety of reasons, including mutations or sequencing or assembly artifacts. They can be annotated in a number of ways:</p>
<ol>
<li>
<p>Annotate the gene with /pseudo to indicate that there is a problem with the gene. Note that this qualifier does NOT mean that the gene is a pseudogene (see point 2, below, if it is known that the gene IS a pseudogene). If multiple gene fragments were present initially, then add a single gene feature which covers all of the potential coding regions and add the pseudo qualifier. If known, a note qualifier may be added indicating why this gene is disrupted, for example:</p>
<pre><code>1 200 gene
gene phoA
gene_desc alkaline phosphatase
locus_tag OBB_0001
pseudo
note nonfunctional due to frameshift
</code></pre>
</li>
<li>
<p>If you are sure that the disrupted or error-filled gene is a biological pseudogene, then add the pseudogene qualifier and the appropriate <a href="https://www.insdc.org/documents/pseudogene-qualifier-vocabulary">pseudogene type</a>. For example:</p>
<pre><code>1 200 gene
gene phoA
gene_desc alkaline phosphatase
locus_tag OBB_0001
pseudogene unprocessed
</code></pre>
</li>
<li>
<p>A coding region containing a frameshift that is thought to be corrected by ribosomal slippage can be annotated using joined feature spans and a 'ribosomal slippage' exception. Joined spans on a feature are used to combine two non-contiguous regions of sequence that are joined together to encode a protein, for example. This is typically used to combine eukaryotic exons to translate the coding region. To create a join CDS you must specify the spans of each contiguous region of sequence that encodes the protein. The use of the joined feature spans is rare in bacteria. In this case the CDS must also include an exception qualifier with the exact text "ribosomal slippage". If you include a join feature for a different reason, please include a note qualifier indicating why the two nucleotide spans are joined.</p>
<pre><code>333255 333181 CDS
333179 332157
product AbcD
protein_id gnl|dbname|OBB_0001
exception ribosomal slippage
</code></pre>
</li>
<li>
<p>A gene containing an authentic frameshift induced by phase variation can be represented by a gene feature with a note.</p>
<pre><code>1 200 gene
gene phoA
gene_desc alkaline phosphatase
locus_tag OBB_0001
note authentic frameshift induced by phase variation; This region contains
an authentic frameshift or in-frame stop in the coding sequence and is not the
result of a sequencing error
</code></pre>
</li>
</ol>
<h4 id="introns">Intron-Containing Genes</h4>
<p>While rare, there are some examples of bacterial genes containing introns. Annotate the gene feature of any intron-containing gene such that the gene feature spans are a single span covering all exons and introns. The actual feature (CDS, tRNA, etc.) should then be annotated with sets of nucleotide spans showing how the exons are joined to create the correct product. In this example there are two exons transcribed to create a tRNA. The first exon is from 1456 to 1419 and the second is from 1400 to 1361. Note how the gene feature spans encompass both exons and the intron.</p>
<pre><code>1456 1361 gene
locus_tag APO_t01
1456 1419 tRNA
1400 1361
product tRNA-Cys
</code></pre>
<h4 id="transpliced">Trans-spliced Genes</h4>
<p>Trans-spliced genes are the exception to the rule for annotating gene feature spans. Trans-spliced genes are similar to intron-containing genes except the two pieces of the gene are found on different regions of the chromosome. These genes are transcribed as two or more separate RNA products that are trans-spliced into a single mRNA or tRNA. To annotate this using a table, enter the nucleotide spans so that the complementary (minus strand) spans are arranged from high to low and vice versa for the plus strand.</p>
<pre><code>36700 36618 gene
86988 87064
locus_tag NEQ_t38
exception trans-splicing
36631 36618 misc_feature
note sequence cleaved during processing of trans-spliced tRNAs
36673 36635
87030 87064 tRNA
product tRNA-Glu
exception trans-splicing
note this trans-spliced tRNA consists of two halves on mixed strands; it shares a 3' half with another tRNA
</code></pre>
<p>Flatfile view:</p>
<pre><code>gene join(complement(36618..36700),86988..87064)
/locus_tag="NEQ_t38"
/trans_splicing
misc_feature complement(36618..36631)
/locus_tag="NEQ_t38"
/note="sequence cleaved during processing of trans-spliced tRNAs"
tRNA join(complement(36635..36673),87030..87064)
/locus_tag="NEQ_t38"
/product="tRNA-Glu"
/trans_splicing
/note="this trans-spliced tRNA consists of two halves on
mixed strands; it shares a 3' half with another tRNA"
</code></pre>
<h4 id="Split_genes">Split genes on two contigs</h4>
<p>Sept 2012: Sometimes in incomplete genomes the ends of a gene may be on different contigs. When certain that the two pieces are part of the same gene, annotate these as separate genes with unique locus_tags, plus separate CDS with different protein_id's. In addition, link the features together with notes that refer to the other part of the gene. However, do not create extremely short features, for example, if one end is only the start methionine or only a few amino acids before the stop codon.</p>
<h5 id="example_1">Example</h5>
<pre><code>&gt;Feature Cont01.00111
5000 &gt;7500 gene
locus_tag KCS_2223A
5488 5500 CDS
6000 &gt;7200
product enolase
protein_id gnl|dbname|KCS_2223A
note 5' end; 3' end is gene KCS_2223B on contig Cont01.00224
&gt;Feature Cont01.00224
&lt;1 1000 gene
locus_tag KCS_2223B
&lt;100 876 CDS
product enolase
protein_id gnl|dbname|KCS_2223B
note 3' end; 5' end is gene KCS_2223A on contig Cont01.00111
</code></pre>
<h4 id="RNA">Ribosomal RNA, tRNA and other RNA features</h4>
<p>RNA features (rRNA, tRNA, ncRNA) must include a corresponding gene feature with a locus_tag qualifier. Please be sure to specify which amino acid the tRNA gene corresponds to. If the amino acid of a tRNA is unknown, use tRNA-Xxx as the product, as in the example. Many submitters like to label the tRNAs such as tRNA-Gly1, etc. If you wish to do this please include "tRNA-Gly1" as a note and not in /gene. The use of /gene is reserved for the actual biological gene symbol such as "trnG". If a tRNA is a pseudogene, please use the /pseudo qualifier.</p>
<p>Annotate ncRNAs that belong to one of the INSDC <a href="//www.insdc.org/documents/ncrna-vocabulary">ncRNA_class</a> as an ncRNA feature, with the appropriate value in the required /ncRNA_class qualifier. Regions of an RNA should be annotated as a misc_feature (eg, leader sequences), or a misc_binding feature if they bind a known molecule (eg, riboswitches). See <a href="#Other">Other Annotation</a> for examples of regions of RNAs.</p>
<h5 id="some-rrna-trna-ncrna-examples">Some rRNA, tRNA, ncRNA examples:</h5>
<pre><code>1 400 gene
locus_tag OBB_0001
&lt;1 400 rRNA
product 16S ribosomal RNA
401 500 gene
gene trnG
note tRNA-Gly1
locus_tag OBB_0002
401 500 tRNA
product tRNA-Gly
501 600 gene
locus_tag OBB_0003
501 600 tmRNA
product tmRNA
601 700 gene
locus_tag OBB_0004
601 700 tRNA
product tRNA-Xxx
701 800 gene
locus_tag OBB_0005
pseudo
701 800 tRNA
product tRNA-Phe
pseudo
801 900 gene
locus_tag OBB_0006
801 900 ncRNA
ncRNA_class SRP_RNA
product RNA component of signal recognition particle
</code></pre>
<h4 id="Evidence_Qualifiers">Evidence Qualifiers</h4>
<p>At the 2005 annual meeting of the International Nucleotide Sequence Databases (INSD), DDBJ, EMBL and GenBank agreed to adopt two qualifiers to describe the evidence for features in sequence records. These are "/experiment=text" and "/inference=TYPE:text", where 'TYPE' is from a select list and 'text' is structured text. These new qualifiers replace "evidence=experimental" and "evidence=non-experimental", respectively, which are no longer supported. Read more about <a href="/genbank/evidence">Evidence Qualifiers</a>.</p>
<pre><code>1 100 gene
locus_tag Test_0001
1 100 CDS
product RecA
protein_id gnl|center_name|Test_0001
inference ab initio prediction:Genscan:2.0
200 300 gene
locus_tag Test_0002
200 300 CDS
product SecA
protein_id gnl|center_name|Test_0002
inference similar to DNA sequence, (same species):INSD:DQ060639.1
400 500 gene
locus_tag Test_0003
400 500 CDS
product ribonuclease R
protein_id gnl|center_name|Test_0003
inference protein motif:InterPro:IPR001900
db_xref InterPro:IPR001900
600 700 gene
locus_tag Test_0004
600 700 CDS
product nitroreductase A
protein_id gnl|center_name|Test_0004
experiment expression of GST fusion protein
</code></pre>
<h4 id="bacteriophage">Functional bacteriophage</h4>
<p>If a bacterial genome contains a functional phage, an additional source feature must be included with the spans covering the complete phage sequence. However, if the phage is not functional or if you are not sure, annotate it as a misc_feature.</p>
<pre><code>361 4200 source
organism Bacteriophage xyz
</code></pre>
<h4 id="transposons">Insertion sequences and transposons</h4>
<p>Insertion sequences and transposons must be annotated as repeat_region features. The name of the insertion sequence or transposon must be added in a mobile_element qualifier, prefixed with the element type ("insertion sequence:" or "transposon:"), as shown in the example below. Note that transposons and insertion sequences should not be given locus_tags.</p>
<pre><code>1 100 repeat_region
mobile_element insertion sequence:IS1363
500 600 repeat_region
mobile_element transposon:Athena-Av1
</code></pre>
<h4 id="db_xref">Database cross-references</h4>
<p>A variety of database cross-references can be added to a feature. These appear as /db_xref on the features. This qualifier serves as a vehicle for linking sequence records to other external databases. See the full list of <a href="/genbank/collab/db_xref">db_xref</a>.</p>
<pre><code>1 100 gene
locus_tag Test_0001
1 100 CDS
product RecA
protein_id gnl|center_name|Test_0001
db_xref InterPro:IPR000111
180 210 misc_feature
note yybP-ykoY element
db_xref RFAM:RF00080
</code></pre>
<h4 id="GOterms">Gene Ontology</h4>
<p>Gene Ontology (GO) terms can be included in genomes in order to describe protein functionality. They are indicated with the following qualifiers:</p>
<pre><code>1 100 CDS
product AbcD
go_component exocyst|0000145
go_process regulation of transcription, DNA-dependent|0006355
go_process exocytosis|0006887
go_function DNA binding|0003677
</code></pre>
<p>The value field is separated by vertical bars '|' into a descriptive string, the GO identifier (leading zeroes are retained), and optionally a PubMed ID and one or more evidence codes. The evidence code is the fourth token, so include blank fields, as necessary (eg the last qualifier has no PubMed ID so the third field is blank). See examples on the <a href="/genbank/eukaryotic_genome_submission_annotation#GOterms">detailed eukaryotic annotation</a> page.</p>
<h4 id="variation">Variation</h4>
<p>Polymorphisms in the sequence can be shown with variation features. Include one of the polymorphisms in the sequence (usually, this is the most commonly seen sequence), and then add a variation feature in the .tbl file for each of the other possibilities.</p>
<ul>
<li>The variation feature requires a 'replace' qualifier, whose value is the sequence of the polymorphism that is NOT in the submitted sequence. For example, if CCC is most common at position 100-102 but there is also CC (a substitution), CCCCC (an insertion), and nothing (a deletion) then the sequence will have CCC at that location and you would include three variation features, one for each polymorphism.</li>
<li>For an insertion polymorphism, a caret (^) is part of the start location.</li>
<li>When the polymorphism is a complete deletion, then the replace value is just two double-quotes.</li>
<li>You can also include optional qualifiers: a note, and the frequency with which the other sequence is found.</li>
</ul>
<p>Here is an example with all of those options:</p>
<pre><code>100 102 variation
replace cc
note polymorphism
100^ 102 variation
replace ccccc
frequency 0.1
100 102 variation
replace ""
note deletion
</code></pre>
<p>Those features will appear like this in the GenBank view:</p>
<pre><code>variation 100..102
/note="polymorphism"
/replace="cc"
variation 100^102
/frequency="0.1"
/replace="ccccc"
variation 100..102
/note="deletion"
/replace=""
</code></pre>
<h4 id="Other">Other Annotation</h4>
<p>Before 2017, riboswitches were annotated using the misc_binding feature if the bound moiety was known, for example:</p>
<pre><code>1 100 misc_binding
note cobalamin riboswitch
bound_moiety adenosylcobalamin
</code></pre>
<p>As of 2017, riboswitches are annotated as regulatory features with the regulatory_class 'riboswitch':</p>
<pre><code>1 100 regulatory
regulatory_class riboswitch
note cobalamin riboswitch
bound_moiety adenosylcobalamin
</code></pre>
<p>If the bound moiety is unknown or if the sequence is a leader sequence, annotate as a misc_feature, for example:</p>
<pre><code>1 100 misc_feature
note yybP-ykoY element
</code></pre>
<p>misc_feature, misc_binding and regulatory features do not have an associated gene feature. If it is desired to tag these features with a locus_tag-like identifier, then include that value in the note, separated from other information by a semicolon and space.</p>
</div>
<!--/.col1-->
<div class="col2">
<div class="rightnav">
<h2 id="genome-resources">Genome Resources</h2>
<ul>
<li><a href="/genbank/wgs/">About WGS</a></li>
<li><a href="https://www.ncbi.nlm.nih.gov/Traces/wgs/?view=wgs">WGS Browser</a></li>
<li><a href="/genbank/genomesubmit/">Genome Submission Guide</a></li>
<li><a href="https://submit.ncbi.nlm.nih.gov/subs/genome/">Genome Submission Portal</a></li>
<li><a href="/genbank/wgs_update/">Update Genome Records</a></li>
<li><a href="/genbank/wgsfaq/">FAQ</a></li>
<li><a href="/genbank/table2asn">table2asn</a></li>
<li><a href="/genbank/diploid_haps">Submitting Multiple Haplotype Assemblies</a></li>
<li><a href="/WebSub/template.cgi/">Create Submission Template</a></li>
<li><a href="/genbank/eukaryotic_genome_submission/">Eukaryotic Annotation Guide</a></li>
<li><a href="/genbank/genomesubmit_annotation/">Prokaryotic Annotation Guide</a></li>
<li><a href="/genbank/examples.wgs/">Annotation Example Files</a></li>
<li><a href="/genbank/genomes_gff">Annotating Genomes with GFF3 or GTF files</a></li>
<li><a href="/genbank/genome_validation">Validation Error Explanations for Genomes</a></li>
<li><a href="/genbank/asndisc/">Discrepancy Report</a></li>
<li><a href="https://www.ncbi.nlm.nih.gov/genome/annotation_prok/">NCBI Prokaryotic Genome Annotation Pipeline</a></li>
<li><a href="https://www.ncbi.nlm.nih.gov/assembly/agp/AGP_Specification/">AGP Format</a></li>
<li><a href="/genbank/metagenome/">Metagenome Submission Guide</a></li>
<li><a href="/genbank/structuredcomment/">Structured Comment</a></li>
<li><a href="/bioproject/">BioProject</a></li>
<li><a href="/biosample/">BioSample</a></li>
</ul>
</div>
</div>
<!--/.col2-->
<div class="col3">
</div>
<!--/.col3-->
<div class="col4">
</div>
<!--/.col4-->
<div class="col5">
</div>
<div class="col6">
</div>
<div class="col7">
</div>
<div class="col8">
</div>
<div class="col9">
</div>
</div><!--/.content-->
</div><!--/.container-->
<div id="NCBIFooter_dynamic">
<div class="breadcrumbs">You are here:
<span id="breadcrumb_text"><a href="/guide/">NCBI</a></span></div>
<a id="help-desk-link" class="help_desk" href="https://support.ncbi.nlm.nih.gov/ics/support/default.asp?Time=2025-03-05T03:58:40-05:00&amp;Snapshot=%2Fprojects%2Fstaticsites%2Fgenbank%2Fgenbank@2.21&amp;Host=portal104&amp;ncbi_phid=CE8B28407C8035B10000000000F000B2&amp;ncbi_session=CE8B5AF87C7FFCB1_0191SID&amp;from=https%3A%2F%2Fwww.ncbi.nlm.nih.gov%2Fgenbank%2Fgenomesubmit_annotation%2F&amp;Ncbi_App=genbank&amp;Page=custom-page&amp;style=classic&amp;deptID=28049" target="_blank">Support Center</a>
<noscript><img alt="" src="/stat?jsdisabled=true&amp;ncbi_app=genbank&amp;ncbi_db=&amp;ncbi_pdid=custom-page&amp;ncbi_phid=CE8B28407C8035B10000000000F000B2" /></noscript>
</div>
<div xmlns:xi="http://www.w3.org/2001/XInclude">
<div xmlns="http://www.w3.org/1999/xhtml" class="footer" id="footer" xml:base="http://127.0.0.1/sites/static/header_footer">
<section class="icon-section">
<div id="icon-section-header" class="icon-section_header">Follow NCBI</div>
<div class="grid-container container">
<div class="icon-section_container">
<a class="footer-icon" id="footer_twitter" href="https://twitter.com/ncbi" aria-label="Twitter">
<svg xmlns="http://www.w3.org/2000/svg" width="40" height="40" viewBox="0 0 40 40" fill="none">
<title>Twitter</title>
<g id="twitterx1008">
<path id="path1008" d="M6.06736 7L16.8778 20.8991L6.00001 32.2H10.2L18.6 23.1L25.668 32.2H34L22.8 17.5L31.9 7H28.4L20.7 15.4L14.401 7H6.06898H6.06736ZM9.66753 8.73423H12.9327L29.7327 30.4658H26.5697L9.66753 8.73423Z" fill="#5B616B"></path>
</g>
</svg>
</a>
<a class="footer-icon" id="footer_facebook" href="https://www.facebook.com/ncbi.nlm" aria-label="Facebook"><svg xmlns="http://www.w3.org/2000/svg" data-name="Layer 1" viewBox="0 0 300 300">
<title>Facebook</title>
<path class="cls-11" d="M210.5,115.12H171.74V97.82c0-8.14,5.39-10,9.19-10h27.14V52l-39.32-.12c-35.66,0-42.42,26.68-42.42,43.77v19.48H99.09v36.32h27.24v109h45.41v-109h35Z">
</path>
</svg></a>
<a class="footer-icon" id="footer_linkedin" href="https://www.linkedin.com/company/ncbinlm" aria-label="LinkedIn"><svg xmlns="http://www.w3.org/2000/svg" data-name="Layer 1" viewBox="0 0 300 300">
<title>LinkedIn</title>
<path class="cls-11" d="M101.64,243.37H57.79v-114h43.85Zm-22-131.54h-.26c-13.25,0-21.82-10.36-21.82-21.76,0-11.65,8.84-21.15,22.33-21.15S101.7,78.72,102,90.38C102,101.77,93.4,111.83,79.63,111.83Zm100.93,52.61A17.54,17.54,0,0,0,163,182v61.39H119.18s.51-105.23,0-114H163v13a54.33,54.33,0,0,1,34.54-12.66c26,0,44.39,18.8,44.39,55.29v58.35H198.1V182A17.54,17.54,0,0,0,180.56,164.44Z">
</path>
</svg></a>
<a class="footer-icon" id="footer_github" href="https://github.com/ncbi" aria-label="GitHub"><svg xmlns="http://www.w3.org/2000/svg" data-name="Layer 1" viewBox="0 0 300 300">
<defs>
<style>
.cls-11,
.cls-12 {
fill: #737373;
}
.cls-11 {
fill-rule: evenodd;
}
</style>
</defs>
<title>GitHub</title>
<path class="cls-11" d="M151.36,47.28a105.76,105.76,0,0,0-33.43,206.1c5.28,1,7.22-2.3,7.22-5.09,0-2.52-.09-10.85-.14-19.69-29.42,6.4-35.63-12.48-35.63-12.48-4.81-12.22-11.74-15.47-11.74-15.47-9.59-6.56.73-6.43.73-6.43,10.61.75,16.21,10.9,16.21,10.9,9.43,16.17,24.73,11.49,30.77,8.79,1-6.83,3.69-11.5,6.71-14.14C108.57,197.1,83.88,188,83.88,147.51a40.92,40.92,0,0,1,10.9-28.39c-1.1-2.66-4.72-13.42,1-28,0,0,8.88-2.84,29.09,10.84a100.26,100.26,0,0,1,53,0C198,88.3,206.9,91.14,206.9,91.14c5.76,14.56,2.14,25.32,1,28a40.87,40.87,0,0,1,10.89,28.39c0,40.62-24.74,49.56-48.29,52.18,3.79,3.28,7.17,9.71,7.17,19.58,0,14.15-.12,25.54-.12,29,0,2.82,1.9,6.11,7.26,5.07A105.76,105.76,0,0,0,151.36,47.28Z">
</path>
<path class="cls-12" d="M85.66,199.12c-.23.52-1.06.68-1.81.32s-1.2-1.06-.95-1.59,1.06-.69,1.82-.33,1.21,1.07.94,1.6Zm-1.3-1">
</path>
<path class="cls-12" d="M90,203.89c-.51.47-1.49.25-2.16-.49a1.61,1.61,0,0,1-.31-2.19c.52-.47,1.47-.25,2.17.49s.82,1.72.3,2.19Zm-1-1.08">
</path>
<path class="cls-12" d="M94.12,210c-.65.46-1.71,0-2.37-.91s-.64-2.07,0-2.52,1.7,0,2.36.89.65,2.08,0,2.54Zm0,0"></path>
<path class="cls-12" d="M99.83,215.87c-.58.64-1.82.47-2.72-.41s-1.18-2.06-.6-2.7,1.83-.46,2.74.41,1.2,2.07.58,2.7Zm0,0">
</path>
<path class="cls-12" d="M107.71,219.29c-.26.82-1.45,1.2-2.64.85s-2-1.34-1.74-2.17,1.44-1.23,2.65-.85,2,1.32,1.73,2.17Zm0,0">
</path>
<path class="cls-12" d="M116.36,219.92c0,.87-1,1.59-2.24,1.61s-2.29-.68-2.3-1.54,1-1.59,2.26-1.61,2.28.67,2.28,1.54Zm0,0">
</path>
<path class="cls-12" d="M124.42,218.55c.15.85-.73,1.72-2,1.95s-2.37-.3-2.52-1.14.73-1.75,2-2,2.37.29,2.53,1.16Zm0,0"></path>
</svg></a>
<a class="footer-icon" id="footer_blog" href="https://ncbiinsights.ncbi.nlm.nih.gov/" aria-label="Blog">
<svg xmlns="http://www.w3.org/2000/svg" id="Layer_1" data-name="Layer 1" viewBox="0 0 40 40">
<defs><style>.cls-1{fill:#737373;}</style></defs>
<title>NCBI Insights Blog</title>
<path class="cls-1" d="M14,30a4,4,0,1,1-4-4,4,4,0,0,1,4,4Zm11,3A19,19,0,0,0,7.05,15a1,1,0,0,0-1,1v3a1,1,0,0,0,.93,1A14,14,0,0,1,20,33.07,1,1,0,0,0,21,34h3a1,1,0,0,0,1-1Zm9,0A28,28,0,0,0,7,6,1,1,0,0,0,6,7v3a1,1,0,0,0,1,1A23,23,0,0,1,29,33a1,1,0,0,0,1,1h3A1,1,0,0,0,34,33Z"></path>
</svg>
</a>
</div>
</div>
</section>
<section class="container-fluid bg-primary">
<div class="container pt-5">
<div class="row mt-3">
<div class="col-lg-3 col-12">
<p><a class="text-white" href="https://www.nlm.nih.gov/socialmedia/index.html">Connect with NLM</a></p>
<ul class="list-inline social_media">
<li class="list-inline-item"><a href="https://twitter.com/NLM_NIH" aria-label="Twitter" target="_blank" rel="noopener noreferrer">
<svg xmlns="http://www.w3.org/2000/svg" width="35" height="35" viewBox="0 0 36 35" fill="none">
<title>Twitter</title>
<g id="twitterx1009" clip-path="url(#clip0_65276_3946)">
<path id="Vector_Twitter" d="M17.5006 34.6565C26.9761 34.6565 34.6575 26.9751 34.6575 17.4996C34.6575 8.02416 26.9761 0.342773 17.5006 0.342773C8.02514 0.342773 0.34375 8.02416 0.34375 17.4996C0.34375 26.9751 8.02514 34.6565 17.5006 34.6565Z" fill="#205493" stroke="white" stroke-width="1.0" stroke-miterlimit="10"></path>
<path id="path1009" d="M8.54811 8.5L16.2698 18.4279L8.50001 26.5H11.5L17.5 20L22.5486 26.5H28.5L20.5 16L27 8.5H24.5L19 14.5L14.5007 8.5H8.54927H8.54811ZM11.1197 9.73873H13.4519L25.4519 25.2613H23.1926L11.1197 9.73873Z" fill="white"></path>
</g>
<defs>
<clipPath id="clip0_65276_3946">
<rect width="35" height="35" fill="white"></rect>
</clipPath>
</defs>
</svg>
</a></li>
<li class="list-inline-item"><a href="https://www.facebook.com/nationallibraryofmedicine" aria-label="Facebook" rel="noopener noreferrer" target="_blank">
<svg xmlns="http://www.w3.org/2000/svg" width="35" height="35" viewBox="0 0 36 35" fill="none">
<title>Facebook</title>
<g id="Facebook" clip-path="url(#clip0_1717_1086)">
<path id="Vector_Facebook" d="M15.1147 29.1371C15.1147 29.0822 15.1147 29.0296 15.1147 28.9747V18.9414H11.8183C11.6719 18.9414 11.6719 18.9414 11.6719 18.8018C11.6719 17.5642 11.6719 16.3289 11.6719 15.0937C11.6719 14.9793 11.7062 14.9518 11.816 14.9518C12.8683 14.9518 13.9206 14.9518 14.9751 14.9518H15.1215V14.8329C15.1215 13.8057 15.1215 12.774 15.1215 11.7492C15.1274 10.9262 15.3148 10.1146 15.6706 9.37241C16.1301 8.38271 16.9475 7.60378 17.9582 7.19235C18.6492 6.90525 19.3923 6.76428 20.1405 6.7783C21.0029 6.79202 21.8653 6.83091 22.7278 6.86065C22.8879 6.86065 23.048 6.89496 23.2082 6.90182C23.2974 6.90182 23.3271 6.94071 23.3271 7.02993C23.3271 7.54235 23.3271 8.05477 23.3271 8.5649C23.3271 9.16882 23.3271 9.77274 23.3271 10.3767C23.3271 10.4819 23.2974 10.5139 23.1921 10.5116C22.5379 10.5116 21.8814 10.5116 21.2271 10.5116C20.9287 10.5184 20.6316 10.5528 20.3395 10.6146C20.0822 10.6619 19.8463 10.7891 19.6653 10.9779C19.4842 11.1668 19.3672 11.4078 19.3307 11.6669C19.2857 11.893 19.2612 12.1226 19.2575 12.3531C19.2575 13.1904 19.2575 14.0299 19.2575 14.8695C19.2575 14.8946 19.2575 14.9198 19.2575 14.9564H23.0229C23.1807 14.9564 23.183 14.9564 23.1624 15.1074C23.0778 15.7662 22.9885 16.425 22.9039 17.0816C22.8322 17.6321 22.7636 18.1827 22.698 18.7332C22.6729 18.9437 22.6797 18.9437 22.4693 18.9437H19.2644V28.8992C19.2644 28.9793 19.2644 29.0593 19.2644 29.1394L15.1147 29.1371Z" fill="white"></path>
<path id="Vector_2_Facebook" d="M17.5006 34.657C26.9761 34.657 34.6575 26.9756 34.6575 17.5001C34.6575 8.02465 26.9761 0.343262 17.5006 0.343262C8.02514 0.343262 0.34375 8.02465 0.34375 17.5001C0.34375 26.9756 8.02514 34.657 17.5006 34.657Z" stroke="white" stroke-width="1.0" stroke-miterlimit="10"></path>
</g>
<defs>
<clipPath id="clip0_1717_1086">
<rect width="35" height="35" fill="white"></rect>
</clipPath>
</defs>
</svg>
</a></li>
<li class="list-inline-item"><a href="https://www.youtube.com/user/NLMNIH" aria-label="Youtube" target="_blank" rel="noopener noreferrer">
<svg xmlns="http://www.w3.org/2000/svg" width="35" height="35" viewBox="0 0 36 35" fill="none">
<title>Youtube</title>
<g id="YouTube" clip-path="url(#clip0_1717_1101)">
<path id="Vector_Youtube" d="M26.2571 11.4791C25.9025 11.1589 25.5709 10.9576 24.228 10.834C22.5512 10.6785 20.2797 10.6556 18.564 10.6533H16.4365C14.7208 10.6533 12.4493 10.6785 10.7725 10.834C9.43196 10.9576 9.09798 11.1589 8.7434 11.4791C7.81464 12.321 7.6202 14.6268 7.59961 16.8938C7.59961 17.3178 7.59961 17.741 7.59961 18.1635C7.62706 20.4121 7.82837 22.686 8.7434 23.521C9.09798 23.8412 9.42967 24.0425 10.7725 24.1661C12.4493 24.3216 14.7208 24.3445 16.4365 24.3468H18.564C20.2797 24.3468 22.5512 24.3216 24.228 24.1661C25.5686 24.0425 25.9025 23.8412 26.2571 23.521C27.1722 22.6929 27.3735 20.451 27.4009 18.2206C27.4009 17.7402 27.4009 17.2599 27.4009 16.7795C27.3735 14.5491 27.1699 12.3072 26.2571 11.4791ZM15.5604 20.5311V14.652L20.561 17.5001L15.5604 20.5311Z" fill="white"></path>
<path id="Vector_2_Youtube" d="M17.5006 34.657C26.9761 34.657 34.6575 26.9756 34.6575 17.5001C34.6575 8.02465 26.9761 0.343262 17.5006 0.343262C8.02514 0.343262 0.34375 8.02465 0.34375 17.5001C0.34375 26.9756 8.02514 34.657 17.5006 34.657Z" stroke="white" stroke-width="1.0" stroke-miterlimit="10"></path>
</g>
<defs>
<clipPath id="clip0_1717_1101">
<rect width="35" height="35" fill="white"></rect>
</clipPath>
</defs>
</svg>
</a></li>
</ul>
</div>
<div class="col-lg-3 col-12">
<p class="address_footer text-white">National Library of Medicine<br />
<a href="https://www.google.com/maps/place/8600+Rockville+Pike,+Bethesda,+MD+20894/@38.9959508,-77.101021,17z/data=!3m1!4b1!4m5!3m4!1s0x89b7c95e25765ddb:0x19156f88b27635b8!8m2!3d38.9959508!4d-77.0988323" class="text-white" target="_blank" rel="noopener noreferrer">8600 Rockville Pike<br />
Bethesda, MD 20894</a></p>
</div>
<div class="col-lg-3 col-12 centered-lg">
<p><a href="https://www.nlm.nih.gov/web_policies.html" class="text-white">Web Policies</a><br />
<a href="https://www.nih.gov/institutes-nih/nih-office-director/office-communications-public-liaison/freedom-information-act-office" class="text-white">FOIA</a><br />
<a href="https://www.hhs.gov/vulnerability-disclosure-policy/index.html" class="text-white" id="vdp">HHS Vulnerability Disclosure</a></p>
</div>
<div class="col-lg-3 col-12 centered-lg">
<p><a class="supportLink text-white" href="https://support.nlm.nih.gov/">Help</a><br />
<a href="https://www.nlm.nih.gov/accessibility.html" class="text-white">Accessibility</a><br />
<a href="https://www.nlm.nih.gov/careers/careers.html" class="text-white">Careers</a></p>
</div>
</div>
<div class="row">
<div class="col-lg-12 centered-lg">
<nav class="bottom-links">
<ul class="mt-3">
<li>
<a class="text-white" href="//www.nlm.nih.gov/">NLM</a>
</li>
<li>
<a class="text-white" href="https://www.nih.gov/">NIH</a>
</li>
<li>
<a class="text-white" href="https://www.hhs.gov/">HHS</a>
</li>
<li>
<a class="text-white" href="https://www.usa.gov/">USA.gov</a>
</li>
</ul>
</nav>
</div>
</div>
</div>
</section>
<script type="text/javascript" src="/portal/portal3rc.fcgi/rlib/js/InstrumentOmnitureBaseJS/InstrumentNCBIConfigJS/InstrumentNCBIBaseJS/InstrumentPageStarterJS.js?v=1"> </script>
<script type="text/javascript" src="/portal/portal3rc.fcgi/static/js/hfjs2.js"> </script>
</div>
</div>
<!--/.footer-->
<p class="last-updated small">Last updated: 2024-04-23T18:36:34Z</p>
</div>
<!--/.page-->
</div>
<!--/.wrap-->
<span class="PAFAppResources"></span>
</div><!-- /.twelve_col -->
</div>
<!-- /.grid -->
<!-- usually for JS scripts at page bottom -->
<span class="pagefixtures"></span>
<!-- CE8B5AF87C7FFCB1_0191SID /projects/staticsites/genbank/genbank@2.21 portal104 v4.1.r689238 Tue, Oct 22 2024 16:10:51 -->
<span id="portal-csrf-token" style="display:none" data-token="CE8B5AF87C7FFCB1_0191SID"></span>
<script type="text/javascript" src="//static.pubmed.gov/portal/portal3rc.fcgi/4218137/js/3879255/4121861/1490097/4087685.js" snapshot="genbank"></script></body>
</html>