nih-gov/www.ncbi.nlm.nih.gov/refseq/annotation_euk/process/index.html

895 lines
61 KiB
HTML
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<!-- AppResources meta begin -->
<meta name="paf-app-resources" content="" />
<!-- AppResources meta end -->
<!-- TemplateResources meta begin -->
<meta name="paf_template" content="StdNCol" />
<!-- TemplateResources meta end -->
<!-- Page meta begin -->
<!-- Page meta end -->
<!-- Logger begin -->
<meta xmlns:ncbi-portal="http://ncbi.gov/portal/XSLT/namespace" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" name="ncbi_app" content="refseq" /><meta xmlns:ncbi-portal="http://ncbi.gov/portal/XSLT/namespace" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" name="ncbi_pdid" content="static" />
<!-- Logger end -->
<title>The NCBI Eukaryotic Genome Annotation Pipeline</title>
<!-- PageFixtures headcontent begin -->
<link type="text/css" rel="stylesheet" href="/core/assets/genome/css/genome.css" /><link type="text/css" rel="stylesheet" href="/core/assets/genome/css/genome_links.css" />
<meta name="cms-local-nav-url" content="http://cms.ncbi.nlm.nih.gov//refseq/annotation_euk/_nav.xml" />
<!-- PageFixtures headcontent end -->
<!-- AppResources external_resources begin -->
<script type="text/javascript" src="/core/jig/1.15.6/js/jig.min.js"></script>
<!-- AppResources external_resources end -->
<!-- Page headcontent begin -->
<meta name="subsite" content="refseq" />
<meta name="path" content="refseq/annotation_euk/process" />
<meta name="modified" content="2024-04-04T20:39:01Z" />
<!-- Page headcontent end -->
<!-- PageFixtures resources begin -->
<link xmlns="http://www.w3.org/1999/xhtml" type="text/css" rel="stylesheet" href="//static.pubmed.gov/portal/portal3rc.fcgi/4218191/css/4207974/4206132.css" xml:base="http://127.0.0.1/sites/static/header_footer" />
<!-- PageFixtures resources end -->
<link rel="shortcut icon" href="//www.ncbi.nlm.nih.gov/favicon.ico" /><meta name="ncbi_phid" content="CE8B42C87DC853710000000001340102.m_6" /><script type="text/javascript"><!--
var ScriptPath = '/portal/';
var objHierarchy = {"name":"PAFAppLayout","type":"Layout","realname":"PAFAppLayout",
"children":[{"name":"PAFAppLayout.AppController","type":"Cluster","realname":"PAFAppLayout.AppController",
"children":[{"name":"PAFAppLayout.AppController.AppResources","type":"Portlet","realname":"PAFAppLayout.AppController.AppResources","shortname":"AppResources"},
{"name":"PAFAppLayout.AppController.RequestProcessor","type":"Portlet","realname":"PAFAppLayout.AppController.RequestProcessor","shortname":"RequestProcessor"},
{"name":"PAFAppLayout.AppController.Controller","type":"Cluster","realname":"PAFAppLayout.AppController.Controller",
"children":[{"name":"PAFAppLayout.AppController.Controller.AnnotStatusStaticContentController","type":"Portlet","realname":"PAFAppLayout.AppController.Controller.AnnotStatusStaticContentController","shortname":"AnnotStatusStaticContentController"}]},
{"name":"PAFAppLayout.AppController.Page","type":"Cluster","realname":"PAFAppLayout.AppController.Page",
"children":[{"name":"PAFAppLayout.AppController.Page.PAFPageSelectorData","type":"Portlet","realname":"PAFAppLayout.AppController.Page.PAFPageSelector.PAFPageSelectorData","shortname":"PAFPageSelectorData"},
{"name":"PAFAppLayout.AppController.Page.PAFStaticPage","type":"Cluster","realname":"PAFAppLayout.AppController.Page.PAFPageSelector.PAFStaticPage",
"children":[{"name":"PAFAppLayout.AppController.Page.PAFStaticPage.MainPortlet","type":"Portlet","realname":"PAFAppLayout.AppController.Page.PAFPageSelector.PAFStaticPage.MainPortlet","shortname":"MainPortlet"}]}]},
{"name":"PAFAppLayout.AppController.PageFixtures","type":"Cluster","realname":"PAFAppLayout.AppController.PageFixtures",
"children":[{"name":"PAFAppLayout.AppController.PageFixtures.PageFixturesP","type":"Portlet","realname":"PAFAppLayout.AppController.PageFixtures.PAFPageFixtures.PageFixturesP","shortname":"PageFixturesP"},
{"name":"PAFAppLayout.AppController.PageFixtures.SearchBar","type":"Cluster","realname":"PAFAppLayout.AppController.PageFixtures.PAFPageFixtures.SearchBar",
"children":[{"name":"PAFAppLayout.AppController.PageFixtures.SearchBar.SearchBarChooser","type":"Portlet","realname":"PAFAppLayout.AppController.PageFixtures.PAFPageFixtures.SearchBar.SearchBarChooser","shortname":"SearchBarChooser"},
{"name":"PAFAppLayout.AppController.PageFixtures.SearchBar.PAFSearchBar","type":"Portlet","realname":"PAFAppLayout.AppController.PageFixtures.PAFPageFixtures.SearchBar.PAFSearchBar","shortname":"PAFSearchBar"}]},
{"name":"PAFAppLayout.AppController.PageFixtures.HeaderFooter","type":"Cluster","realname":"PAFAppLayout.AppController.PageFixtures.PAFPageFixtures.HeaderFooter",
"children":[{"name":"PAFAppLayout.AppController.PageFixtures.HeaderFooter.NCBIBreadcrumbs","type":"Portlet","realname":"PAFAppLayout.AppController.PageFixtures.PAFPageFixtures.HeaderFooter.NCBIBreadcrumbs","shortname":"NCBIBreadcrumbs"},
{"name":"PAFAppLayout.AppController.PageFixtures.HeaderFooter.NCBIHelpDesk","type":"Portlet","realname":"PAFAppLayout.AppController.PageFixtures.PAFPageFixtures.HeaderFooter.NCBIHelpDesk","shortname":"NCBIHelpDesk"},
{"name":"PAFAppLayout.AppController.PageFixtures.HeaderFooter.NCBIApplog_NoScript_Ping","type":"Portlet","realname":"PAFAppLayout.AppController.PageFixtures.PAFPageFixtures.HeaderFooter.NCBIApplog_NoScript_Ping","shortname":"NCBIApplog_NoScript_Ping"}]},
{"name":"PAFAppLayout.AppController.PageFixtures.LocalNavPortlet","type":"Portlet","realname":"PAFAppLayout.AppController.PageFixtures.LocalNavPortlet","shortname":"LocalNavPortlet"}]},
{"name":"PAFAppLayout.AppController.TemplateResources","type":"Cluster","realname":"PAFAppLayout.AppController.TemplateResources",
"children":[{"name":"PAFAppLayout.AppController.TemplateResources.StdNColResources","type":"Portlet","realname":"PAFAppLayout.AppController.TemplateResources.PAFTemplateResources.StdNColResources","shortname":"StdNColResources"}]},
{"name":"PAFAppLayout.AppController.Logger","type":"Portlet","realname":"PAFAppLayout.AppController.Logger","shortname":"Logger"},
{"name":"PAFAppLayout.AppController.DebugConsole","type":"Portlet","realname":"PAFAppLayout.AppController.DebugConsole","shortname":"DebugConsole"}]}]};
--></script>
<meta name='referrer' content='origin-when-cross-origin'/><link type="text/css" rel="stylesheet" href="//static.pubmed.gov/portal/portal3rc.fcgi/4218658/css/4121862/3974050/3917732/251717/4218659/4218660/14534/45193/3534283/4128070/4005757/4062871.css" /><link type="text/css" rel="stylesheet" href="//static.pubmed.gov/portal/portal3rc.fcgi/4218658/css/3529741.css" media="print" /><script type="text/javascript">
var ObjectLinks=[{i:0, ename: "p$ExL", esid:"*", sname: "p$ExL", ssid:"*", dname:"p$el", dsid:"0",m:"CopyValue",p:[],f: function(src, dst) {fn_CopyValue(src, dst);}}]
var ActiveNames = {"p$ExL":1};
</script></head>
<body class=" static">
<div class="grid">
<div class="col twelve_col nomargin shadow">
<!-- System messages like service outage or JS required; this is handled by the TemplateResources portlet -->
<div class="sysmessages">
<noscript>
<p class="nojs">
<strong>Warning:</strong>
The NCBI web site requires JavaScript to function.
<a href="/guide/browsers/#enablejs" title="Learn how to enable JavaScript" target="_blank">more...</a>
</p>
</noscript>
</div>
<!--/.sysmessage-->
<div class="wrap">
<div class="page">
<div xmlns:xi="http://www.w3.org/2001/XInclude">
<div xmlns="http://www.w3.org/1999/xhtml" id="universal_header" xml:base="http://127.0.0.1/sites/static/header_footer">
<section class="usa-banner">
<div class="usa-accordion">
<header class="usa-banner-header">
<div class="usa-grid usa-banner-inner">
<img src="https://www.ncbi.nlm.nih.gov/coreutils/uswds/img/favicons/favicon-57.png" alt="U.S. flag" />
<p>An official website of the United States government</p>
<button class="non-usa-accordion-button usa-banner-button" aria-expanded="false" aria-controls="gov-banner-top" type="button">
<span class="usa-banner-button-text">Here's how you know</span>
</button>
</div>
</header>
<div class="usa-banner-content usa-grid usa-accordion-content" id="gov-banner-top" aria-hidden="true">
<div class="usa-banner-guidance-gov usa-width-one-half">
<img class="usa-banner-icon usa-media_block-img" src="https://www.ncbi.nlm.nih.gov/coreutils/uswds/img/icon-dot-gov.svg" alt="Dot gov" />
<div class="usa-media_block-body">
<p>
<strong>The .gov means it's official.</strong>
<br />
Federal government websites often end in .gov or .mil. Before
sharing sensitive information, make sure you're on a federal
government site.
</p>
</div>
</div>
<div class="usa-banner-guidance-ssl usa-width-one-half">
<img class="usa-banner-icon usa-media_block-img" src="https://www.ncbi.nlm.nih.gov/coreutils/uswds/img/icon-https.svg" alt="Https" />
<div class="usa-media_block-body">
<p>
<strong>The site is secure.</strong>
<br />
The <strong>https://</strong> ensures that you are connecting to the
official website and that any information you provide is encrypted
and transmitted securely.
</p>
</div>
</div>
</div>
</div>
</section>
<div class="usa-overlay"></div>
<header class="ncbi-header" role="banner" data-section="Header">
<div class="usa-grid">
<div class="usa-width-one-whole">
<div class="ncbi-header__logo">
<a href="/" class="logo" aria-label="NCBI Logo" data-ga-action="click_image" data-ga-label="NIH NLM Logo">
<img src="https://www.ncbi.nlm.nih.gov/coreutils/nwds/img/logos/AgencyLogo.svg" alt="NIH NLM Logo" />
</a>
</div>
<div class="ncbi-header__account">
<a id="account_login" href="https://account.ncbi.nlm.nih.gov" class="usa-button header-button" style="display:none" data-ga-action="open_menu" data-ga-label="account_menu">Log in</a>
<button id="account_info" class="header-button" style="display:none" aria-controls="account_popup" type="button">
<span class="fa fa-user" aria-hidden="true">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="20px" height="20px">
<g style="fill: #fff">
<ellipse cx="12" cy="8" rx="5" ry="6"></ellipse>
<path d="M21.8,19.1c-0.9-1.8-2.6-3.3-4.8-4.2c-0.6-0.2-1.3-0.2-1.8,0.1c-1,0.6-2,0.9-3.2,0.9s-2.2-0.3-3.2-0.9 C8.3,14.8,7.6,14.7,7,15c-2.2,0.9-3.9,2.4-4.8,4.2C1.5,20.5,2.6,22,4.1,22h15.8C21.4,22,22.5,20.5,21.8,19.1z"></path>
</g>
</svg>
</span>
<span class="username desktop-only" aria-hidden="true" id="uname_short"></span>
<span class="sr-only">Show account info</span>
</button>
</div>
<div class="ncbi-popup-anchor">
<div class="ncbi-popup account-popup" id="account_popup" aria-hidden="true">
<div class="ncbi-popup-head">
<button class="ncbi-close-button" data-ga-action="close_menu" data-ga-label="account_menu" type="button">
<span class="fa fa-times">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 48 48" width="24px" height="24px">
<path d="M38 12.83l-2.83-2.83-11.17 11.17-11.17-11.17-2.83 2.83 11.17 11.17-11.17 11.17 2.83 2.83 11.17-11.17 11.17 11.17 2.83-2.83-11.17-11.17z"></path>
</svg>
</span>
<span class="usa-sr-only">Close</span></button>
<h4>Account</h4>
</div>
<div class="account-user-info">
Logged in as:<br />
<b><span class="username" id="uname_long">username</span></b>
</div>
<div class="account-links">
<ul class="usa-unstyled-list">
<li><a id="account_myncbi" href="/myncbi/" class="set-base-url" data-ga-action="click_menu_item" data-ga-label="account_myncbi">Dashboard</a></li>
<li><a id="account_pubs" href="/myncbi/collections/bibliography/" class="set-base-url" data-ga-action="click_menu_item" data-ga-label="account_pubs">Publications</a></li>
<li><a id="account_settings" href="/account/settings/" class="set-base-url" data-ga-action="click_menu_item" data-ga-label="account_settings">Account settings</a></li>
<li><a id="account_logout" href="/account/signout/" class="set-base-url" data-ga-action="click_menu_item" data-ga-label="account_logout">Log out</a></li>
</ul>
</div>
</div>
</div>
</div>
</div>
</header>
<div role="navigation" aria-label="access keys">
<a id="nws_header_accesskey_0" href="https://www.ncbi.nlm.nih.gov/guide/browsers/#ncbi_accesskeys" class="usa-sr-only" accesskey="0" tabindex="-1">Access keys</a>
<a id="nws_header_accesskey_1" href="https://www.ncbi.nlm.nih.gov" class="usa-sr-only" accesskey="1" tabindex="-1">NCBI Homepage</a>
<a id="nws_header_accesskey_2" href="/myncbi/" class="set-base-url usa-sr-only" accesskey="2" tabindex="-1">MyNCBI Homepage</a>
<a id="nws_header_accesskey_3" href="#maincontent" class="usa-sr-only" accesskey="3" tabindex="-1">Main Content</a>
<a id="nws_header_accesskey_4" href="#" class="usa-sr-only" accesskey="4" tabindex="-1">Main Navigation</a>
</div>
<section data-section="Alerts">
<div class="ncbi-alerts-placeholder"></div>
</section>
</div>
</div>
<!--/.header-->
<div class="header">
<div class="res_logo"><h1 class="res_name"><a href="/refseq/" title="RefSeq home">RefSeq</a></h1><h2 class="res_tagline">Integrated reference sequences</h2></div>
<div class="search"><form method="get" action="/refseq/"><div class="search_form"><label for="database" class="offscreen_noflow">Search database</label><select id="database"><optgroup label="Recent"><option value="refseq" selected="selected">RefSeq</option><option value="books">Books</option><option value="pubmed">PubMed</option><option value="medgen" class="last">MedGen</option></optgroup><optgroup label="All"><option value="gquery">All Databases</option><option value="assembly">Assembly</option><option value="biocollections">Biocollections</option><option value="bioproject">BioProject</option><option value="biosample">BioSample</option><option value="books">Books</option><option value="clinvar">ClinVar</option><option value="cdd">Conserved Domains</option><option value="gap">dbGaP</option><option value="dbvar">dbVar</option><option value="gene">Gene</option><option value="genome">Genome</option><option value="gds">GEO DataSets</option><option value="geoprofiles">GEO Profiles</option><option value="gtr">GTR</option><option value="ipg">Identical Protein Groups</option><option value="medgen">MedGen</option><option value="mesh">MeSH</option><option value="nlmcatalog">NLM Catalog</option><option value="nuccore">Nucleotide</option><option value="omim">OMIM</option><option value="pmc">PMC</option><option value="protein">Protein</option><option value="proteinclusters">Protein Clusters</option><option value="protfam">Protein Family Models</option><option value="pcassay">PubChem BioAssay</option><option value="pccompound">PubChem Compound</option><option value="pcsubstance">PubChem Substance</option><option value="pubmed">PubMed</option><option value="refseq">RefSeq</option><option value="snp">SNP</option><option value="sra">SRA</option><option value="structure">Structure</option><option value="taxonomy">Taxonomy</option><option value="toolkit">ToolKit</option><option value="toolkitall">ToolKitAll</option><option 
value="toolkitbookgh">ToolKitBookgh</option></optgroup></select><div class="nowrap"><label for="term" class="offscreen_noflow" accesskey="/">Search term</label><div class="nowrap"><input type="text" name="term" id="term" title="Search RefSeq" value="" class="jig-ncbiclearbutton jig-ncbiautocomplete" data-jigconfig="isEnabled:false,disableUrl:'NcbiSearchBarAutoComplCtrl'" autocomplete="off" data-sbconfig="ds:'no',pjs:'no',afs:'yes'" /></div><button id="search" type="submit" class="button_search nowrap" cmd="go">Search</button></div></div><input type="hidden" name="p$a" id="p$a" /><input type="hidden" name="p$l" id="p$l" value="PAFAppLayout" /><input type="hidden" name="p$st" id="p$st" value="refseq" /><input name="SessionId" id="SessionId" value="CE8BC1E97D9F05E1_0182SID" disabled="disabled" type="hidden" /><input name="Snapshot" id="Snapshot" value="/projects/refseq/refseq@1.21" disabled="disabled" type="hidden" /></form></div>
</div>
<div class="nav_and_browser">
<div class="localnav"><ul class="jig-ncbilocalnav">
<li><a href="/refseq/annotation_euk/">Eukaryotic Annotation Home</a></li>
<li><a href="#">Documentation</a><ul>
<li><a href="/refseq/annotation_euk/process">Annotation Process</a></li>
<li><a href="/refseq/annotation_euk/gnomon">Gnomon</a></li>
<li><a href="/books/NBK169439/">NCBI Handbook Chapter</a></li>
<li><a href="/refseq/annotation_euk/release_notes">Software Release Notes</a></li>
</ul>
</li>
<li><a href="#">Annotated Genomes</a><ul>
<li><a href="/refseq/annotation_euk/all">All Annotated Genomes</a></li>
<li><a href="/refseq/annotation_euk/status/#recent">Recently Annotated Genomes</a></li>
<li><a href="/refseq/annotation_euk/status/">Annotation Runs In Progress</a></li>
<li><a href="/refseq/annotation_euk/#graphs">Annotations Per Year Graphs</a></li>
</ul>
</li>
<li><a href="/refseq/annotation_euk/policy">Annotation Policy</a></li>
<li><span id="euk-annot-request-navbar"></span><a href="https://support.nlm.nih.gov/support/create-case/">Request Annotation</a></li>
</ul></div>
</div>
<!-- was itemctrl -->
<div class="container">
<div id="maincontent" class="content col twelve_col last">
<div class="col1">
<h1 id="the-ncbi-eukaryotic-genome-annot">The NCBI Eukaryotic Genome Annotation Pipeline</h1>
<p>The NCBI Eukaryotic Genome Annotation Pipeline provides content for various NCBI resources including <a href="/nucleotide/">Nucleotide</a>, <a href="/protein">Protein</a>, <a href="https://blast.ncbi.nlm.nih.gov/Blast.cgi">BLAST</a>, <a href="/gene">Gene</a> and the <a href="/genome/gdv/">Genome Data Viewer</a> genome browser.</p>
<p>This page provides an overview of the annotation process. Please refer to <a href="/books/NBK169439/">the Eukaryotic Genome Annotation chapter of the NCBI Handbook</a> for algorithmic details.</p>
<p>The pipeline uses a modular framework for the execution of all annotation tasks from the fetching of raw and curated data from public repositories (sequence and <a href="/assembly/">Assembly</a> databases) to the alignment of sequences and the prediction of genes, to the submission of the accessioned annotation products to public databases. Core components of the pipeline are alignment programs (<a href="/sutils/splign/splign.cgi?textpage=documentation">Splign</a> and <a href="/sutils/static/prosplign/prosplign.html">ProSplign</a>) and an HMM-based gene prediction program (<a href="../gnomon">Gnomon</a>) developed at NCBI.</p>
<p>Important features of the pipeline include:</p>
<ul>
<li>flexibility and speed</li>
<li>higher weight given to curated evidence than non-curated evidence</li>
<li>utilization of RNA-Seq for gene prediction</li>
<li>production of models that compensate for assembly issues</li>
<li>tracking of gene loci from one annotation to the next</li>
<li>ability to co-annotate multiple assemblies for the same organism</li>
</ul>
<p>The products of an annotation run (chromosomes, scaffolds, and model transcripts and proteins) are labeled with an Annotation Name, which is used throughout NCBI to uniquely identify annotation products originating from the same annotation run. The Annotation Name has one of two formats:</p>
<ul>
<li>the combination of the organism name and Annotation Release number (e.g. NCBI <em>Pongo abelii</em> Annotation Release 103)</li>
<li>the combination of the RefSeq assembly accession and the year and month in which the annotation was started (e.g. NCBI GCF_016801865.2-RS_2022_12) </li>
</ul>
<h2 id="contents"><strong>Contents</strong></h2>
<ul>
<li>
<p><a href="#process"><strong>Process</strong></a></p>
<ul>
<li><a href="#assemblies">Source of genome assemblies</a></li>
<li><a href="#masking">Masking</a></li>
<li><a href="#transcripts">Transcript alignments</a></li>
<li><a href="#longSRA">Transcriptomics long read alignments</a></li>
<li><a href="#RNAseq">RNA-Seq read alignments</a></li>
<li><a href="#proteins">Protein alignments</a></li>
<li><a href="#gnomon">Model prediction</a></li>
<li><a href="#NGs">Curated RefSeq genomic sequence alignments</a></li>
<li><a href="#bestmodel">Choosing the best models for a gene</a></li>
<li><a href="#naming">Protein naming and determination of locus type</a></li>
<li><a href="#gene_ontology">Gene Ontology</a></li>
<li><a href="#gene_ids">Assignment of GeneIDs</a></li>
<li><a href="#small_rnas">Annotation of small RNAs</a></li>
<li><a href="#tss">Annotation of transcription start sites (TSS)</a></li>
</ul>
</li>
<li>
<p><a href="#special"><strong>Special considerations</strong></a></p>
<ul>
<li><a href="#mult_asm">Annotation of multiple assemblies</a></li>
<li><a href="#reannot">Re-annotation</a></li>
<li><a href="#quality">Annotation quality</a></li>
</ul>
</li>
<li>
<p><a href="#products"><strong>Annotation</strong> <strong>products</strong></a></p>
</li>
<li><a href="#data"><strong>Data availability</strong></a></li>
<li><a href="#references"><strong>References</strong></a></li>
</ul>
<p>Please see <a href="/books/NBK169439/">The Eukaryotic Genome Annotation chapter in the NCBI Handbook</a> for more details about the algorithms.</p>
<h2 id="process"><strong>Process</strong></h2>
<p>The figure below provides an overview of the annotation process. The genomic sequences are masked (grey), and transcripts (blue), proteins (green), and RNA-Seq reads together with, if available in SRA, long-read transcriptomes and Cap Analysis Gene Expression (CAGE) data (orange) are aligned to the genome. If available for the organism being annotated, curated RefSeq genomic sequences are also aligned (pink). Gene model prediction based on transcript and protein alignments is then performed (brown). The best models are selected among the RefSeq and the predicted models, named and accessioned (purple). Finally, the annotation products are formatted and deployed to public resources (yellow).</p>
<p><img src="/core/assets/genome/images/Pipeline_sm_ncRNA_CAGE_80pct.png" alt="Overview diagram of the NCBI eukaryotic genome annotation pipeline" /></p>
<h3 id="assemblies"><strong>Source of genome assemblies</strong></h3>
<p>The RefSeq assemblies that are annotated by NCBI are copies of the genome assemblies that are public in <a href="http://www.insdc.org/">INSDC</a> (<a href="http://www.ddbj.nig.ac.jp/">DDBJ</a>, <a href="http://www.ebi.ac.uk/ena">ENA</a> and <a href="/genbank">GenBank</a>). Unplaced scaffolds with length below 1000 bases may not be included in the RefSeq copy of the assembly if the <a href="http://www.insdc.org/">INSDC</a> assembly contains more than 300,000 unplaced scaffolds and more than 25,000 of them are below 1000 bases. Both RefSeq and GenBank assemblies are further described in the <a href="/assembly/">Assembly</a> resource.</p>
<h3 id="masking"><strong>Masking</strong></h3>
<p>Masking is done using <a href="http://www.repeatmasker.org/">RepeatMasker</a> or <a href="/pubmed/16287941">WindowMasker</a>. Human and mouse are masked with <a href="http://www.repeatmasker.org/">RepeatMasker</a> using their respective <a href="https://dfam.org/home">Dfam</a> libraries, while genomes from other species are masked with <a href="/pubmed/16287941">WindowMasker</a>.</p>
<h3 id="transcripts"><strong>Transcript alignments</strong></h3>
<p>The set of transcripts selected for alignment to the genome varies by species, and may include transcripts from other organisms. This set generally includes:</p>
<ul>
<li>Known <a href="/refseq/about">RefSeq</a> transcripts: Coding and non-coding <a href="/refseq/about">RefSeq</a> transcripts with NM_ or NR_ prefixes, respectively, are generated by NCBI staff based on automatic processes, manual curation, or data from collaborating groups (see more details <a href="/books/NBK21091/">here</a>)</li>
<li><a href="/genbank">GenBank</a> transcripts from the taxonomically relevant GenBank divisions, and the Third-Party Annotation (<a href="/genbank/tpa/">TPA</a>), High-throughput cDNA (HTC) and Transcriptome Shotgun Assembly (<a href="/genbank/tsa">TSA</a>) divisions</li>
<li>ESTs from <a href="/dbEST/">dbEST</a></li>
</ul>
<p>Sequences highly likely to be mitochondrial or to have cloning vector or IS element contamination, and sequences identified as low quality by <a href="/refseq/about">RefSeq</a> curation staff are screened out.</p>
<p><a href="/refseq/about">RefSeq</a> transcripts and non-RefSeq transcripts that pass the contamination screen are aligned locally to the genome using BLAST to identify the location(s) at which transcripts align. Global re-alignment at these locations is performed with <a href="/sutils/splign/splign.cgi?textpage=documentation">Splign</a> to refine the identification of splice sites. Alignments are then ranked and filtered based on customizable criteria (such as coverage, identity, rank). Typically, only the best-placed (rank 1) alignment for a given query is selected for use in the downstream steps.</p>
<h3 id="longSRA"><strong>Transcriptomics long read alignments</strong></h3>
<p>Transcriptomics reads from <a href="/sra/">SRA</a> generated using long read sequencing technologies such as PacBio or Oxford Nanopore are aligned to the genome using <a href="https://pubmed.ncbi.nlm.nih.gov/29750242/">Minimap2</a>. Each transcript's best-placed (rank 1) alignment is selected for use in the downstream steps, if above 85% identity.</p>
<h3 id="RNAseq"><strong>RNA-Seq read alignments</strong></h3>
<p>RNA-Seq reads for the species or closely related species are aligned to the genome. When a very large number of samples and reads (multiple billions) are available in <a href="/sra">SRA</a>, projects with samples spanning the widest range of tissues and developmental stages are chosen over others, with a preference for untreated or non-diseased samples. RNA-Seq reads are aligned to the genome with <a href="https://pubmed.ncbi.nlm.nih.gov/23104886">STAR</a>. To address the short length, redundancy and abundance of the reads, alignments with the same splice structure and the same or similar start and end points are collapsed into a single representative alignment. Information is recorded about the samples and number of reads represented by each alignment, so the level of support can be used to filter alignments and evaluate gene predictions. Alignments representing very rare introns likely to be background noise are filtered out.</p>
<p>For each SRA run aligned to the genome, RNA-Seq read coverage graphs in UCSC BigWig format are generated and made available for download on the FTP site (see link below). The number of reads mapped to annotated genes is also counted using <a href="https://pubmed.ncbi.nlm.nih.gov/24227677">Subread featureCounts</a> software, and the gene expression counts files are made available for download on the FTP site. Additionally, a file containing information about all of the SRA runs used is provided.</p>
<h3 id="proteins"><strong>Protein alignments</strong></h3>
<p>The set of proteins selected for alignment to the genome varies by species, and may include proteins from other organisms. This set generally includes:</p>
<ul>
<li>Known <a href="/refseq/about">RefSeq</a> proteins</li>
<li><a href="/genbank">GenBank</a> proteins derived from cDNAs from the taxonomically relevant <a href="/genbank">GenBank</a> divisions</li>
</ul>
<p>Highly repetitive sequences are removed from the set. Proteins are aligned locally to the genome with BLAST and re-aligned globally using <a href="/sutils/static/prosplign/prosplign.html">ProSplign</a>. Alignments are then ranked and filtered based on customizable criteria.</p>
<h3 id="gnomon"><strong>Model prediction</strong></h3>
<p>Protein, transcript, transcriptomics and RNA-Seq read alignments are passed to <a href="../gnomon">Gnomon</a> for gene prediction. <a href="../gnomon">Gnomon</a> first chains together non-conflicting alignments into putative models. In a second step, <a href="../gnomon">Gnomon</a> extends predictions missing a start or a stop codon or internal exon(s) using an HMM-based algorithm. <a href="../gnomon">Gnomon</a> additionally creates pure <em>ab initio</em> predictions where open reading frames of sufficient length but with no supporting alignment are detected.</p>
<p>This first set of predictions is further refined by alignment against a subset of the nr (non-redundant) database of protein sequences. The additional alignments are added to the initial alignments, and the chaining and <em>ab initio</em> extension steps are repeated. The results constitute the set of <a href="../gnomon">Gnomon</a> predictions.</p>
<p>Gnomon predictions may include deletions or insertions of Ns with respect to the genomic sequence. These differences are introduced to compensate for frameshifts or stop codons in the literal translation of the genome, when the aligning proteins provide evidence of an intact ORF.</p>
<h3 id="NGs"><strong>Curated RefSeq genomic sequence alignments</strong></h3>
<p>For some organisms, a set of genomic sequences is curated (<a href="/refseq/about">RefSeq</a> accessions with NG_ prefixes). These sequences represent non-transcribed pseudogenes, manually annotated gene clusters that are difficult to annotate via automated methods, or human <a href="/refseq/rsg">RefSeqGene</a> records. They are aligned to the genome, and their best placement is identified.</p>
<h3 id="bestmodel"><strong>Choosing the best models for a gene</strong></h3>
<p>The final set of annotated features comprises, in order of preference, pre-existing <a href="/refseq/about">RefSeq</a> sequences and a subset of well-supported <a href="../gnomon">Gnomon</a>-predicted models. It is built by evaluating together at each locus the known <a href="/refseq/about">RefSeq</a> transcripts, the features projected from curated <a href="/refseq/about">RefSeq</a> genomic alignments and the models predicted by <a href="../gnomon">Gnomon</a>.</p>
<p><em>1. Models based on known and curated RefSeq</em></p>
<p><a href="/refseq/about">RefSeq</a> transcripts are given precedence over overlapping <a href="../gnomon">Gnomon</a> models with the same splice pattern. Alignments of known same-species <a href="/refseq/about">RefSeq</a> transcripts or curated genomic sequences are used directly to annotate the gene, RNA and CDS features on the genome. Since the <a href="/refseq/about">RefSeq</a> sequence may not align perfectly or completely to the genomic sequence, a consequence of this rule is that the annotated product may differ from the conceptual translation of the genome. Differences between the RefSeq transcripts and the genome are provided in a note on the RefSeq genomic record (scaffold or chromosome).</p>
<p><em>2. Models based on Gnomon predictions</em></p>
<p><a href="../gnomon">Gnomon</a> predictions are included in the final set of annotations if they do not share all splice sites with a <a href="/refseq/about">RefSeq</a> transcript and if they meet certain quality thresholds including:</p>
<ul>
<li>Only fully- or partially-supported <a href="../gnomon">Gnomon</a> predictions, or pure <em>ab initio</em> <a href="../gnomon">Gnomon</a> predictions with high coverage hits to UniProtKB/SwissProt proteins are selected</li>
<li>When multiple fully-supported transcript variants are predicted for a gene, only the <a href="../gnomon">Gnomon</a> predictions supported in their entirety by a single long alignment (e.g. a full-length mRNA) or by RNA-Seq reads from a single BioSample are selected</li>
<li>Poorly-supported <a href="../gnomon">Gnomon</a> predictions conflicting with better-supported models annotated on the opposite strand are excluded from the final set of models</li>
<li><a href="../gnomon">Gnomon</a> predictions with high homology to transposable or retro-transposable elements are excluded from the final set of models</li>
</ul>
<p><em>3. Integrating RefSeq and Gnomon annotations</em></p>
<p>As a result of the model selection process, a gene may be represented by multiple splice variants, with some of them known <a href="/refseq/about">RefSeq</a> and others model <a href="/refseq/about">RefSeq</a> (originating from <a href="../gnomon">Gnomon</a> predictions).</p>
<p><a href="../gnomon">Gnomon</a> predictions selected for the final annotation set are assigned model RefSeq accessions with XM_ or XR_ prefixes for transcripts and XP_ prefixes for proteins to distinguish them from known RefSeq with NM_/NR_ and NP_ prefixes. Model RefSeq can be searched in Entrez with the query “srcdb_refseq_model[properties]” while known RefSeq sequences can be obtained with the query “srcdb_refseq_known[properties]”.</p>
<h3 id="naming"><strong>Protein naming and determination of locus type</strong></h3>
<ul>
<li>Genes represented by known or curated RefSeq sequences inherit the <a href="/gene">Gene</a> symbol, name and locus type (e.g. coding, pseudogene...) of the <a href="/refseq/about">RefSeq</a> sequence.</li>
<li>Genes represented by predicted models are named based on homology to SwissProt proteins.</li>
<li>Most <a href="../gnomon">Gnomon</a> models with insertions, deletions or frameshifts are labeled pseudogenes.</li>
<li><a href="../gnomon">Gnomon</a> models with insertions or deletions relative to the genome may be considered coding if they have a strong unique hit to the SwissProt database or appear to be orthologs of known protein-coding genes. Titles for these models are prefixed with “PREDICTED: LOW QUALITY PROTEIN” to indicate that these models and the underlying assembly sequences may contain defects.</li>
<li><a href="../gnomon">Gnomon</a> models that appear to be single-exon retrocopies of protein-coding genes may be annotated as pseudogenes.</li>
<li>When <a href="#mult_asm">multiple assemblies are annotated</a>, a partial or imperfect model may be called coding because a complete model exists at the corresponding locus on one of the other annotated assemblies.</li>
</ul>
<h3 id="gene_ontology"><strong>Gene Ontology</strong></h3>
<p>Gene Ontology (GO) terms for all annotated proteins were computed using <a href="https://pubmed.ncbi.nlm.nih.gov/24451626">InterProScan</a>, a tool that identifies protein domains and families. The GO terms were then collated by gene, and the resulting GO annotations are made available for download from the FTP site (see link below) in the <a href="http://geneontology.org/docs/go-annotation-file-gaf-format-2.1/">GAF (GO Annotation File) format</a>.</p>
<h3 id="gene_ids"><strong>Assignment of GeneIDs</strong></h3>
<p>Genes in the final set of models are assigned GeneIDs in NCBI's <a href="/gene">Gene</a> database.</p>
<ul>
<li>A gene represented by a known <a href="/refseq/about">RefSeq</a> transcript will receive the GeneID of the <a href="/refseq/about">RefSeq</a> transcript.</li>
<li>All alternative splice forms of a gene get the same GeneID.</li>
<li>As much as possible, GeneIDs are carried forward from one annotation run to the next, using the <a href="#reannot">mapping</a> of the new assembly to the previous one if the assembly was updated.</li>
<li>Gene features mapped to equivalent locations of <a href="#mult_asm">co-annotated assemblies</a> are assigned the same GeneIDs.</li>
</ul>
<h3 id="small_rnas"><strong>Annotation of small RNAs</strong></h3>
<ul>
<li>miRNAs are imported from <a href="http://www.mirbase.org">miRBase</a>, accessioned with NR_ prefixes and placed using <a href="/sutils/splign/splign.cgi?textpage=documentation">Splign</a>.</li>
<li>tRNAs are predicted with <a href="http://lowelab.ucsc.edu/tRNAscan-SE/">tRNAscan-SE</a>.</li>
<li>Starting with software version 8.0, rRNAs, snoRNAs and snRNAs are annotated by searching eukaryotic <a href="http://rfam.xfam.org/">RFAM</a> HMMs against the genome with Infernal's <em>cmsearch</em>.</li>
</ul>
<h3 id="tss"><strong>Annotation of transcription start sites (TSS)</strong></h3>
<p>Starting with software release 9.0, Cap Analysis Gene Expression (CAGE) data that are available in SRA for the species are aligned to the genome with <a href="/sutils/splign/splign.cgi?textpage=documentation">Splign</a> and used for annotating transcription start sites.</p>
<h2 id="special"><strong>Special considerations</strong></h2>
<h3 id="mult_asm"><strong>Annotation of multiple assemblies</strong></h3>
<p>When multiple assemblies of good quality are available for a given organism, annotation of all is done in coordination. To ensure that matching regions across assemblies are annotated the same way, assemblies are aligned to each other before the annotation.</p>
<ul>
<li>Assembly-assembly alignment results are used to rank the transcript and the curated genomic alignments: for a given query sequence, alignments to corresponding regions of two assemblies receive the same rank.</li>
<li>Corresponding loci of multiple assemblies are assigned the same GeneID and locus type.</li>
</ul>
<h3 id="reannot"><strong>Re-annotation</strong></h3>
<p>Organisms are periodically re-annotated when new evidence is available (e.g. RNA-Seq) or when a new assembly is released. Special attention is given to tracking of models and genes from one release of the annotation to the next. Previous and current models annotated at overlapping genomic locations are identified and the locus type and GeneID of the previous models are taken into consideration when assigning GeneIDs to the new models. If the assembly was updated between the two rounds of annotation, the assemblies are aligned to each other and the alignments used to match previous and current models in mapped regions.</p>
<h3 id="quality"><strong>Annotation quality</strong></h3>
<p>The quality of the annotation is assessed prior to publishing, based on the intrinsic characteristics of the annotated models and on the expectations for the species. Indicators of a low quality annotation may disqualify a genome from being included in RefSeq. These indicators are: high count of coding genes that lack near-full coverage by alignments of experimental evidence, high count of partial coding genes (lacking a start or stop codon, or internal exons), high count of low-quality genes with suspected frameshifts or premature stop codons, low BUSCO completeness score (see below), and, for vertebrates, low count of genes with orthologs to a reference species.</p>
<p><a href="https://busco.ezlab.org/">BUSCO</a> run in "protein" mode provides an estimate of the completeness of the gene set. The BUSCO models (single-copy marker genes) for the most fitting lineage based on NCBI Taxonomy are searched against the longest protein for each annotated coding gene. Results are reported in BUSCO notation (C:complete [S:single-copy, D:duplicated], F:fragmented, M:missing, n:number of genes used).</p>
<h2 id="products"><strong>Annotation products</strong></h2>
<ul>
<li>The products of the annotation process comprise:<ul>
<li>The scaffolds and chromosomes of the assembled genomes, with the annotation products as features.</li>
<li>The individual products (transcripts and proteins)</li>
</ul>
</li>
</ul>
<table border="1" cellpadding="1" cellspacing="1">
<thead>
<tr>
<th scope="col">Product</th>
<th scope="col">Origin of the product</th>
<th scope="col">Note for the features on the scaffolds and chromosomes*</th>
</tr>
</thead>
<tbody>
<tr>
<td>Known transcripts/proteins (NM_, NR_, NP_)</td>
<td>curated RefSeq genomic</td>
<td>
<p>
<span class="feature"><span>"</span>Derived by automated computational analysis using gene prediction method: Curated Genomic"</span>
</p>
</td>
</tr>
<tr>
<td>Known transcripts/proteins (NM_, NR_, NP_)</td>
<td>known RefSeq transcript</td>
<td>
<span class="feature"><span>"</span>Derived by automated computational analysis using gene prediction method: BestRefseq"</span>
</td>
</tr>
<tr>
<td>Model transcripts/proteins (fully or partially supported) (XM_, XR_, XP_)</td>
<td>Gnomon</td>
<td>
<span class="feature"><span>"</span>Derived by automated computational analysis using gene prediction method: Gnomon"</span>
</td>
</tr>
<tr>
<td>Model short non-coding transcripts (XR_)</td>
<td>Rfam + cmsearch</td>
<td>
<span class="feature"><span>"</span>Derived by automated computational analysis using gene prediction method: cmsearch"</span>
</td>
</tr>
<tr>
<td>tRNAs (no accession)</td>
<td>tRNAscan-SE</td>
<td>
<p>
<span class="feature"><span>"</span>tRNA features were annotated by tRNAscan-SE"</span>
</p>
</td>
</tr>
<tr>
<td>Non-transcribed pseudogenes (no accession)</td>
<td>curated RefSeq genomic</td>
<td>
<p>
<span class="feature"><span>"</span>Derived by automated computational analysis using gene prediction method: Curated Genomic"</span>
</p>
</td>
</tr>
<tr>
<td>Non-transcribed pseudogenes (no accession)</td>
<td>Gnomon</td>
<td>
<span class="feature"><span>"</span>Derived by automated computational analysis using gene prediction method: Gnomon"</span>
</td>
</tr>
<tr>
<td>Full set of Gnomon predictions (no accession)</td>
<td>Gnomon</td>
<td>NA. Not in the sequence database. Available on the <a target="_blank" href="https://ftp.ncbi.nih.gov/genomes/">FTP site</a> and as <a target="_blank" href="https://blast.ncbi.nlm.nih.gov/Blast.cgi">BLAST</a> databases</td>
</tr>
</tbody>
</table>
<p>* For predicted models, the note is also on the records of individual annotation products.</p>
<ul>
<li>Sequence records for predicted models, scaffolds and chromosomes contain the Annotation Name, which uniquely identifies the annotation. Examples:</li>
</ul>
<p>The sequence records for scaffolds, chromosomes and predicted transcripts and proteins for <strong>NCBI <em>Pongo abelii</em> Annotation Release 103</strong> contain the following comment:</p>
<p class="rteindent1">
<span>##Genome-Annotation-Data-START##<br />
Annotation Provider         :: NCBI<br />
Annotation Status           :: Full annotation<br />
Annotation Name             :: Pongo abelii Annotation Release 103<br />
Annotation Version          :: 103<br />
Annotation Pipeline         :: NCBI eukaryotic genome annotation pipeline<br />
Annotation Software Version :: 8.0<br />
Annotation Method           :: Best-placed RefSeq; Gnomon<br />
Features Annotated          :: Gene; mRNA; CDS; ncRNA<br />
##Genome-Annotation-Data-END##</span></p>
<p>The sequence records for scaffolds, chromosomes and predicted transcripts and proteins for <strong>NCBI GCF_016801865.2-RS_2022_12</strong> contain the following comment:</p>
<p class="rteindent1">
<span>##Genome-Annotation-Data-START##<br />
Annotation Provider         :: NCBI RefSeq<br />
Annotation Status           :: Full annotation<br />
Annotation Name             :: GCF_016801865.2-RS_2022_12<br />
Annotation Pipeline         :: NCBI eukaryotic genome annotation pipeline<br />
Annotation Software Version :: 10.1<br />
Annotation Method           :: Gnomon; cmsearch; tRNAscan-SE<br />
Features Annotated          :: Gene; mRNA; CDS; ncRNA<br />
##Genome-Annotation-Data-END##</span></p>
<h2 id="data"><strong>Data availability</strong></h2>
<p>The data produced by the annotation pipeline is available in various resources:</p>
<ul>
<li><a href="/nucleotide/">Nucleotide</a></li>
<li><a href="/protein">Protein</a></li>
<li><a href="https://blast.ncbi.nlm.nih.gov/Blast.cgi">BLAST</a></li>
<li><a href="/gene">Gene</a></li>
<li><a href="/genome/gdv/">Genome Data Viewer</a></li>
<li><a href="https://ftp.ncbi.nih.gov/genomes/">FTP site</a></li>
</ul>
<h2 id="references"><strong>References</strong></h2>
<ul>
<li><a href="https://busco.ezlab.org/">BUSCO</a>: Manni M et al. <a href="https://pubmed.ncbi.nlm.nih.gov/34320186/"><em>Molecular biology and evolution</em> 2021, <strong>38</strong>(10):4647-4654</a></li>
<li><a href="https://www.ebi.ac.uk/interpro/">InterProScan</a>: Jones P et al. <a href="https://pubmed.ncbi.nlm.nih.gov/24451626"><em>Bioinformatics</em> 2014. <strong>30</strong>(9):1236-1240</a></li>
<li><a href="https://github.com/lh3/minimap2">Minimap2</a>: Li H. <a href="/pubmed/29750242/"><em>Bioinformatics</em> 2018 <strong>34</strong>(18):3094-3100</a></li>
<li><a href="http://www.mirbase.org">miRBase</a>: Griffiths-Jones S. <a href="/pubmed/14681370"><em>Nucleic Acids Research</em> 2004, <strong>32</strong>(Database Issue):D109-11</a></li>
<li><a href="/refseq/about">RefSeq</a>: Pruitt KD et al. <a href="/pubmed/24259432"><em>Nucleic Acids Research</em> 2014, <strong>42</strong>(Database issue):D756-63</a></li>
<li><a href="http://www.repeatmasker.org">RepeatMasker</a><span class="ref-cit"><span id="__element-citationid3536779"></span><span class="element-citation">: Smit AFA, Hubley R, Green P. RepeatMasker Open-3.0. 1996-2004. <a href="http://www.repeatmasker.org">http://www.repeatmasker.org</a></span></span></li>
<li><a href="http://rfam.xfam.org/">Rfam</a>: Nawrocki, EP et al. <a href="/pubmed/25392425"><em>Nucleic Acids Research</em> 2015, <strong>43</strong>(Database issue):D130-7</a></li>
<li><a href="/sutils/splign/splign.cgi?textpage=documentation">Splign</a>: Kapustin Y, Souvorov A, Tatusova T, Lipman D. <a href="/pubmed/18495041"><em>Biology Direct</em> 2008, <strong>3</strong>:20</a></li>
<li><a href="https://github.com/alexdobin/STAR">STAR</a>: Dobin A, Davis CA, Schlesinger F, Drenkow J, Zaleski C, Jha S, Batut P, Chaisson M, Gingeras TR. <a href="/pubmed/23104886"><em>Bioinformatics</em> 2013, <strong>29</strong>(1): 15-21</a></li>
<li><a href="https://subread.sourceforge.net/">Subread featureCounts</a>: Liao, Y, Smyth GK, Shi, W. <a href="https://pubmed.ncbi.nlm.nih.gov/24227677"><em>Bioinformatics</em> 2014, <strong>30</strong>(7):923-930</a></li>
<li><a href="http://lowelab.ucsc.edu/tRNAscan-SE/">tRNAscan-SE</a>: Lowe, TM and Eddy, SR. <a href="/pubmed/9023104"><em>Nucleic Acids Research</em> 1997, <strong>25</strong>: 955-964</a></li>
<li><a href="/pubmed/16287941">WindowMasker</a>: Morgulis A, Gertz EM, Schäffer AA, Agarwala R. <a href="/pubmed/16287941"><em>Bioinformatics</em> 2006 <strong>22</strong>(2):134-41</a></li>
</ul>
<p><span id="shared-content-1"></span></p>
</div>
<!--/.col1-->
<div class="col2">
</div>
<!--/.col2-->
<div class="col3">
</div>
<!--/.col3-->
<div class="col4">
</div>
<!--/.col4-->
<div class="col5">
</div>
<div class="col6">
</div>
<div class="col7">
</div>
<div class="col8">
</div>
<div class="col9">
</div>
</div><!--/.content-->
</div><!--/.container-->
<div id="NCBIFooter_dynamic">
<!--<component id="NCBIBreadcrumbs"/>
<component id="NCBIHelpDesk"/>-->
<noscript><img alt="" src="/stat?jsdisabled=true&amp;ncbi_app=refseq&amp;ncbi_db=&amp;ncbi_pdid=static&amp;ncbi_phid=CE8B42C87DC853710000000001340102" /></noscript>
</div>
<div xmlns:xi="http://www.w3.org/2001/XInclude">
<div xmlns="http://www.w3.org/1999/xhtml" class="footer" id="footer" xml:base="http://127.0.0.1/sites/static/header_footer">
<section class="icon-section">
<div id="icon-section-header" class="icon-section_header">Follow NCBI</div>
<div class="grid-container container">
<div class="icon-section_container">
<a class="footer-icon" id="footer_twitter" href="https://twitter.com/ncbi" aria-label="Twitter">
<svg xmlns="http://www.w3.org/2000/svg" width="40" height="40" viewBox="0 0 40 40" fill="none">
<title>Twitter</title>
<g id="twitterx1008">
<path id="path1008" d="M6.06736 7L16.8778 20.8991L6.00001 32.2H10.2L18.6 23.1L25.668 32.2H34L22.8 17.5L31.9 7H28.4L20.7 15.4L14.401 7H6.06898H6.06736ZM9.66753 8.73423H12.9327L29.7327 30.4658H26.5697L9.66753 8.73423Z" fill="#5B616B"></path>
</g>
</svg>
</a>
<a class="footer-icon" id="footer_facebook" href="https://www.facebook.com/ncbi.nlm" aria-label="Facebook"><svg xmlns="http://www.w3.org/2000/svg" data-name="Layer 1" viewBox="0 0 300 300">
<title>Facebook</title>
<path class="cls-11" d="M210.5,115.12H171.74V97.82c0-8.14,5.39-10,9.19-10h27.14V52l-39.32-.12c-35.66,0-42.42,26.68-42.42,43.77v19.48H99.09v36.32h27.24v109h45.41v-109h35Z">
</path>
</svg></a>
<a class="footer-icon" id="footer_linkedin" href="https://www.linkedin.com/company/ncbinlm" aria-label="LinkedIn"><svg xmlns="http://www.w3.org/2000/svg" data-name="Layer 1" viewBox="0 0 300 300">
<title>LinkedIn</title>
<path class="cls-11" d="M101.64,243.37H57.79v-114h43.85Zm-22-131.54h-.26c-13.25,0-21.82-10.36-21.82-21.76,0-11.65,8.84-21.15,22.33-21.15S101.7,78.72,102,90.38C102,101.77,93.4,111.83,79.63,111.83Zm100.93,52.61A17.54,17.54,0,0,0,163,182v61.39H119.18s.51-105.23,0-114H163v13a54.33,54.33,0,0,1,34.54-12.66c26,0,44.39,18.8,44.39,55.29v58.35H198.1V182A17.54,17.54,0,0,0,180.56,164.44Z">
</path>
</svg></a>
<a class="footer-icon" id="footer_github" href="https://github.com/ncbi" aria-label="GitHub"><svg xmlns="http://www.w3.org/2000/svg" data-name="Layer 1" viewBox="0 0 300 300">
<defs>
<style>
.cls-11,
.cls-12 {
fill: #737373;
}
.cls-11 {
fill-rule: evenodd;
}
</style>
</defs>
<title>GitHub</title>
<path class="cls-11" d="M151.36,47.28a105.76,105.76,0,0,0-33.43,206.1c5.28,1,7.22-2.3,7.22-5.09,0-2.52-.09-10.85-.14-19.69-29.42,6.4-35.63-12.48-35.63-12.48-4.81-12.22-11.74-15.47-11.74-15.47-9.59-6.56.73-6.43.73-6.43,10.61.75,16.21,10.9,16.21,10.9,9.43,16.17,24.73,11.49,30.77,8.79,1-6.83,3.69-11.5,6.71-14.14C108.57,197.1,83.88,188,83.88,147.51a40.92,40.92,0,0,1,10.9-28.39c-1.1-2.66-4.72-13.42,1-28,0,0,8.88-2.84,29.09,10.84a100.26,100.26,0,0,1,53,0C198,88.3,206.9,91.14,206.9,91.14c5.76,14.56,2.14,25.32,1,28a40.87,40.87,0,0,1,10.89,28.39c0,40.62-24.74,49.56-48.29,52.18,3.79,3.28,7.17,9.71,7.17,19.58,0,14.15-.12,25.54-.12,29,0,2.82,1.9,6.11,7.26,5.07A105.76,105.76,0,0,0,151.36,47.28Z">
</path>
<path class="cls-12" d="M85.66,199.12c-.23.52-1.06.68-1.81.32s-1.2-1.06-.95-1.59,1.06-.69,1.82-.33,1.21,1.07.94,1.6Zm-1.3-1">
</path>
<path class="cls-12" d="M90,203.89c-.51.47-1.49.25-2.16-.49a1.61,1.61,0,0,1-.31-2.19c.52-.47,1.47-.25,2.17.49s.82,1.72.3,2.19Zm-1-1.08">
</path>
<path class="cls-12" d="M94.12,210c-.65.46-1.71,0-2.37-.91s-.64-2.07,0-2.52,1.7,0,2.36.89.65,2.08,0,2.54Zm0,0"></path>
<path class="cls-12" d="M99.83,215.87c-.58.64-1.82.47-2.72-.41s-1.18-2.06-.6-2.7,1.83-.46,2.74.41,1.2,2.07.58,2.7Zm0,0">
</path>
<path class="cls-12" d="M107.71,219.29c-.26.82-1.45,1.2-2.64.85s-2-1.34-1.74-2.17,1.44-1.23,2.65-.85,2,1.32,1.73,2.17Zm0,0">
</path>
<path class="cls-12" d="M116.36,219.92c0,.87-1,1.59-2.24,1.61s-2.29-.68-2.3-1.54,1-1.59,2.26-1.61,2.28.67,2.28,1.54Zm0,0">
</path>
<path class="cls-12" d="M124.42,218.55c.15.85-.73,1.72-2,1.95s-2.37-.3-2.52-1.14.73-1.75,2-2,2.37.29,2.53,1.16Zm0,0"></path>
</svg></a>
<a class="footer-icon" id="footer_blog" href="https://ncbiinsights.ncbi.nlm.nih.gov/" aria-label="Blog">
<svg xmlns="http://www.w3.org/2000/svg" id="Layer_1" data-name="Layer 1" viewBox="0 0 40 40">
<defs><style>.cls-1{fill:#737373;}</style></defs>
<title>NCBI Insights Blog</title>
<path class="cls-1" d="M14,30a4,4,0,1,1-4-4,4,4,0,0,1,4,4Zm11,3A19,19,0,0,0,7.05,15a1,1,0,0,0-1,1v3a1,1,0,0,0,.93,1A14,14,0,0,1,20,33.07,1,1,0,0,0,21,34h3a1,1,0,0,0,1-1Zm9,0A28,28,0,0,0,7,6,1,1,0,0,0,6,7v3a1,1,0,0,0,1,1A23,23,0,0,1,29,33a1,1,0,0,0,1,1h3A1,1,0,0,0,34,33Z"></path>
</svg>
</a>
</div>
</div>
</section>
<section class="container-fluid bg-primary">
<div class="container pt-5">
<div class="row mt-3">
<div class="col-lg-3 col-12">
<p><a class="text-white" href="https://www.nlm.nih.gov/socialmedia/index.html">Connect with NLM</a></p>
<ul class="list-inline social_media">
<li class="list-inline-item"><a href="https://twitter.com/NLM_NIH" aria-label="Twitter" target="_blank" rel="noopener noreferrer">
<svg xmlns="http://www.w3.org/2000/svg" width="35" height="35" viewBox="0 0 36 35" fill="none">
<title>Twitter</title>
<g id="twitterx1009" clip-path="url(#clip0_65276_3946)">
<path id="Vector_Twitter" d="M17.5006 34.6565C26.9761 34.6565 34.6575 26.9751 34.6575 17.4996C34.6575 8.02416 26.9761 0.342773 17.5006 0.342773C8.02514 0.342773 0.34375 8.02416 0.34375 17.4996C0.34375 26.9751 8.02514 34.6565 17.5006 34.6565Z" fill="#205493" stroke="white" stroke-width="1.0" stroke-miterlimit="10"></path>
<path id="path1009" d="M8.54811 8.5L16.2698 18.4279L8.50001 26.5H11.5L17.5 20L22.5486 26.5H28.5L20.5 16L27 8.5H24.5L19 14.5L14.5007 8.5H8.54927H8.54811ZM11.1197 9.73873H13.4519L25.4519 25.2613H23.1926L11.1197 9.73873Z" fill="white"></path>
</g>
<defs>
<clipPath id="clip0_65276_3946">
<rect width="35" height="35" fill="white"></rect>
</clipPath>
</defs>
</svg>
</a></li>
<li class="list-inline-item"><a href="https://www.facebook.com/nationallibraryofmedicine" aria-label="Facebook" rel="noopener noreferrer" target="_blank">
<svg xmlns="http://www.w3.org/2000/svg" width="35" height="35" viewBox="0 0 36 35" fill="none">
<title>Facebook</title>
<g id="Facebook" clip-path="url(#clip0_1717_1086)">
<path id="Vector_Facebook" d="M15.1147 29.1371C15.1147 29.0822 15.1147 29.0296 15.1147 28.9747V18.9414H11.8183C11.6719 18.9414 11.6719 18.9414 11.6719 18.8018C11.6719 17.5642 11.6719 16.3289 11.6719 15.0937C11.6719 14.9793 11.7062 14.9518 11.816 14.9518C12.8683 14.9518 13.9206 14.9518 14.9751 14.9518H15.1215V14.8329C15.1215 13.8057 15.1215 12.774 15.1215 11.7492C15.1274 10.9262 15.3148 10.1146 15.6706 9.37241C16.1301 8.38271 16.9475 7.60378 17.9582 7.19235C18.6492 6.90525 19.3923 6.76428 20.1405 6.7783C21.0029 6.79202 21.8653 6.83091 22.7278 6.86065C22.8879 6.86065 23.048 6.89496 23.2082 6.90182C23.2974 6.90182 23.3271 6.94071 23.3271 7.02993C23.3271 7.54235 23.3271 8.05477 23.3271 8.5649C23.3271 9.16882 23.3271 9.77274 23.3271 10.3767C23.3271 10.4819 23.2974 10.5139 23.1921 10.5116C22.5379 10.5116 21.8814 10.5116 21.2271 10.5116C20.9287 10.5184 20.6316 10.5528 20.3395 10.6146C20.0822 10.6619 19.8463 10.7891 19.6653 10.9779C19.4842 11.1668 19.3672 11.4078 19.3307 11.6669C19.2857 11.893 19.2612 12.1226 19.2575 12.3531C19.2575 13.1904 19.2575 14.0299 19.2575 14.8695C19.2575 14.8946 19.2575 14.9198 19.2575 14.9564H23.0229C23.1807 14.9564 23.183 14.9564 23.1624 15.1074C23.0778 15.7662 22.9885 16.425 22.9039 17.0816C22.8322 17.6321 22.7636 18.1827 22.698 18.7332C22.6729 18.9437 22.6797 18.9437 22.4693 18.9437H19.2644V28.8992C19.2644 28.9793 19.2644 29.0593 19.2644 29.1394L15.1147 29.1371Z" fill="white"></path>
<path id="Vector_2_Facebook" d="M17.5006 34.657C26.9761 34.657 34.6575 26.9756 34.6575 17.5001C34.6575 8.02465 26.9761 0.343262 17.5006 0.343262C8.02514 0.343262 0.34375 8.02465 0.34375 17.5001C0.34375 26.9756 8.02514 34.657 17.5006 34.657Z" stroke="white" stroke-width="1.0" stroke-miterlimit="10"></path>
</g>
<defs>
<clipPath id="clip0_1717_1086">
<rect width="35" height="35" fill="white"></rect>
</clipPath>
</defs>
</svg>
</a></li>
<li class="list-inline-item"><a href="https://www.youtube.com/user/NLMNIH" aria-label="Youtube" target="_blank" rel="noopener noreferrer">
<svg xmlns="http://www.w3.org/2000/svg" width="35" height="35" viewBox="0 0 36 35" fill="none">
<title>Youtube</title>
<g id="YouTube" clip-path="url(#clip0_1717_1101)">
<path id="Vector_Youtube" d="M26.2571 11.4791C25.9025 11.1589 25.5709 10.9576 24.228 10.834C22.5512 10.6785 20.2797 10.6556 18.564 10.6533H16.4365C14.7208 10.6533 12.4493 10.6785 10.7725 10.834C9.43196 10.9576 9.09798 11.1589 8.7434 11.4791C7.81464 12.321 7.6202 14.6268 7.59961 16.8938C7.59961 17.3178 7.59961 17.741 7.59961 18.1635C7.62706 20.4121 7.82837 22.686 8.7434 23.521C9.09798 23.8412 9.42967 24.0425 10.7725 24.1661C12.4493 24.3216 14.7208 24.3445 16.4365 24.3468H18.564C20.2797 24.3468 22.5512 24.3216 24.228 24.1661C25.5686 24.0425 25.9025 23.8412 26.2571 23.521C27.1722 22.6929 27.3735 20.451 27.4009 18.2206C27.4009 17.7402 27.4009 17.2599 27.4009 16.7795C27.3735 14.5491 27.1699 12.3072 26.2571 11.4791ZM15.5604 20.5311V14.652L20.561 17.5001L15.5604 20.5311Z" fill="white"></path>
<path id="Vector_2_Youtube" d="M17.5006 34.657C26.9761 34.657 34.6575 26.9756 34.6575 17.5001C34.6575 8.02465 26.9761 0.343262 17.5006 0.343262C8.02514 0.343262 0.34375 8.02465 0.34375 17.5001C0.34375 26.9756 8.02514 34.657 17.5006 34.657Z" stroke="white" stroke-width="1.0" stroke-miterlimit="10"></path>
</g>
<defs>
<clipPath id="clip0_1717_1101">
<rect width="35" height="35" fill="white"></rect>
</clipPath>
</defs>
</svg>
</a></li>
</ul>
</div>
<div class="col-lg-3 col-12">
<p class="address_footer text-white">National Library of Medicine<br />
<a href="https://www.google.com/maps/place/8600+Rockville+Pike,+Bethesda,+MD+20894/@38.9959508,-77.101021,17z/data=!3m1!4b1!4m5!3m4!1s0x89b7c95e25765ddb:0x19156f88b27635b8!8m2!3d38.9959508!4d-77.0988323" class="text-white" target="_blank" rel="noopener noreferrer">8600 Rockville Pike<br />
Bethesda, MD 20894</a></p>
</div>
<div class="col-lg-3 col-12 centered-lg">
<p><a href="https://www.nlm.nih.gov/web_policies.html" class="text-white">Web Policies</a><br />
<a href="https://www.nih.gov/institutes-nih/nih-office-director/office-communications-public-liaison/freedom-information-act-office" class="text-white">FOIA</a><br />
<a href="https://www.hhs.gov/vulnerability-disclosure-policy/index.html" class="text-white" id="vdp">HHS Vulnerability Disclosure</a></p>
</div>
<div class="col-lg-3 col-12 centered-lg">
<p><a class="supportLink text-white" href="https://support.nlm.nih.gov/">Help</a><br />
<a href="https://www.nlm.nih.gov/accessibility.html" class="text-white">Accessibility</a><br />
<a href="https://www.nlm.nih.gov/careers/careers.html" class="text-white">Careers</a></p>
</div>
</div>
<div class="row">
<div class="col-lg-12 centered-lg">
<nav class="bottom-links">
<ul class="mt-3">
<li>
<a class="text-white" href="//www.nlm.nih.gov/">NLM</a>
</li>
<li>
<a class="text-white" href="https://www.nih.gov/">NIH</a>
</li>
<li>
<a class="text-white" href="https://www.hhs.gov/">HHS</a>
</li>
<li>
<a class="text-white" href="https://www.usa.gov/">USA.gov</a>
</li>
</ul>
</nav>
</div>
</div>
</div>
</section>
<script type="text/javascript" src="/portal/portal3rc.fcgi/rlib/js/InstrumentOmnitureBaseJS/InstrumentNCBIConfigJS/InstrumentNCBIBaseJS/InstrumentPageStarterJS.js?v=1"> </script>
<script type="text/javascript" src="/portal/portal3rc.fcgi/static/js/hfjs2.js"> </script>
</div>
</div>
<!--/.footer-->
<p class="last-updated small">Last updated: 2024-04-04T20:39:01Z</p>
</div>
<!--/.page-->
</div>
<!--/.wrap-->
<span class="PAFAppResources"></span>
</div><!-- /.twelve_col -->
</div>
<!-- /.grid -->
<!-- usually for JS scripts at page bottom -->
<span class="pagefixtures"></span>
<!-- CE8BC1E97D9F05E1_0182SID /projects/refseq/refseq@1.21 portal104 v4.1.r689238 Tue, Oct 22 2024 16:10:51 -->
<span id="portal-csrf-token" style="display:none" data-token="CE8BC1E97D9F05E1_0182SID"></span>
<script type='text/javascript' src='/portal/js/portal.js'></script><script type="text/javascript" src="//static.pubmed.gov/portal/portal3rc.fcgi/4218658/js/3879255/4121861/4218656/4087685.js" snapshot="refseq"></script></body>
</html>