nih-gov/www.ncbi.nlm.nih.gov/books/n/handbook/ch14/index.html

<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">

    <head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
        <!-- AppResources meta begin -->
        <meta name="paf-app-resources" content="" />
                 <script type="text/javascript">var ncbi_startTime = new Date();</script>

        <!-- AppResources meta end -->

        <!-- TemplateResources meta begin -->
        <meta name="paf_template" content="" />

        <!-- TemplateResources meta end -->

        <!-- Logger begin -->
        <meta name="ncbi_db" content="books" /><meta name="ncbi_pdid" content="book-part" /><meta name="ncbi_acc" content="NBK21086" /><meta name="ncbi_domain" content="handbook" /><meta name="ncbi_report" content="record" /><meta name="ncbi_type" content="fulltext" /><meta name="ncbi_objectid" content="" /><meta name="ncbi_pcid" content="/NBK21086/" /><meta name="ncbi_pagename" content="Genome Assembly and Annotation Process - The NCBI Handbook - NCBI Bookshelf" /><meta name="ncbi_bookparttype" content="chapter" /><meta name="ncbi_app" content="bookshelf" />
        <!-- Logger end -->

        <title>Genome Assembly and Annotation Process - The NCBI Handbook - NCBI Bookshelf</title>

        <!-- AppResources external_resources begin -->
        <link rel="stylesheet" href="/core/jig/1.15.2/css/jig.min.css" /><script type="text/javascript" src="/core/jig/1.15.2/js/jig.min.js"></script>

        <!-- AppResources external_resources end -->

        <!-- Page meta begin -->
        <meta name="robots" content="NOINDEX,NOFOLLOW,NOARCHIVE,NOIMAGEINDEX" /><meta name="citation_inbook_title" content="The NCBI Handbook [Internet]" /><meta name="citation_title" content="Genome Assembly and Annotation Process" /><meta name="citation_publisher" content="National Center for Biotechnology Information (US)" /><meta name="citation_date" content="2003/08/13" /><meta name="citation_author" content="Paul Kitts" /><meta name="citation_fulltext_html_url" content="https://www.ncbi.nlm.nih.gov/books/NBK21086/" /><link rel="schema.DC" href="http://purl.org/DC/elements/1.0/" /><meta name="DC.Title" content="Genome Assembly and Annotation Process" /><meta name="DC.Type" content="Text" /><meta name="DC.Publisher" content="National Center for Biotechnology Information (US)" /><meta name="DC.Contributor" content="Paul Kitts" /><meta name="DC.Date" content="2003/08/13" /><meta name="DC.Identifier" content="https://www.ncbi.nlm.nih.gov/books/NBK21086/" /><meta name="description" content="NCBI may assemble a genome prior to annotation, add annotations to a genome assembled elsewhere, or simply process an annotated genome to produce RefSeqs and maps for display in Map Viewer (Chapter 20)." /><meta name="og:title" content="Genome Assembly and Annotation Process" /><meta name="og:type" content="book" /><meta name="og:description" content="NCBI may assemble a genome prior to annotation, add annotations to a genome assembled elsewhere, or simply process an annotated genome to produce RefSeqs and maps for display in Map Viewer (Chapter 20)." 
/><meta name="og:url" content="https://www.ncbi.nlm.nih.gov/books/NBK21086/" /><meta name="og:site_name" content="NCBI Bookshelf" /><meta name="og:image" content="https://www.ncbi.nlm.nih.gov/corehtml/pmc/pmcgifs/bookshelf/thumbs/th-handbook-lrg.png" /><meta name="twitter:card" content="summary" /><meta name="twitter:site" content="@ncbibooks" /><meta name="warning" content="This publication is provided for historical reference only and the information may be out of date." /><meta name="bk-non-canon-loc" content="/books/n/handbook/ch14/" /><link rel="canonical" href="https://www.ncbi.nlm.nih.gov/books/NBK21086/" /><link rel="stylesheet" href="/corehtml/pmc/css/figpopup.css" type="text/css" media="screen" /><link rel="stylesheet" href="/corehtml/pmc/css/bookshelf/2.26/css/books.min.css" type="text/css" /><link rel="stylesheet" href="/corehtml/pmc/css/bookshelf/2.26/css/books_print.min.css" type="text/css" media="print" /><style type="text/css">.main-content {background:transparent repeat-y top left;background-image:url(/corehtml/pmc/css/bookshelf/2.26/img/archive.png);background-size: auto, contain; padding:0 0 0 3em }</style><style type="text/css">p a.figpopup{display:inline !important} .bk_tt {font-family: monospace}  .first-line-outdent .bk_ref {display: inline}  .body-content h2, .body-content .h2  {border-bottom: 1px solid #97B0C8} .body-content h2.inline {border-bottom: none} a.page-toc-label , .jig-ncbismoothscroll a {text-decoration:none;border:0 !important} .temp-labeled-list  .graphic {display:inline-block !important} .temp-labeled-list  img{width:100%}</style><script type="text/javascript" src="/corehtml/pmc/js/jquery.hoverIntent.min.js"> </script><script type="text/javascript" src="/corehtml/pmc/js/common.min.js?_=3.18"> </script><script type="text/javascript" src="/corehtml/pmc/js/large-obj-scrollbars.min.js"> </script><script type="text/javascript">window.name="mainwindow";</script><script type="text/javascript" 
src="/corehtml/pmc/js/bookshelf/2.26/book-toc.min.js"> </script><script type="text/javascript" src="/corehtml/pmc/js/bookshelf/2.26/books.min.js"> </script><meta name="book-collection" content="NONE" />

        <!-- Page meta end -->
    <link rel="shortcut icon" href="//www.ncbi.nlm.nih.gov/favicon.ico" /><meta name="ncbi_phid" content="CE8E9BF57C822BA10000000000440039.m_13" />
<meta name='referrer' content='origin-when-cross-origin'/><link type="text/css" rel="stylesheet" href="//static.pubmed.gov/portal/portal3rc.fcgi/4216699/css/3852956/3985586/3808861/4121862/3974050/3917732/251717/4216701/14534/45193/4113719/3849091/3984811/3751656/4033350/3840896/3577051/3852958/4008682/4207974/4206132/4062871/12930/3964959/3854974/36029/4128070/9685/3549676/3609192/3609193/3609213/3395586.css" /><link type="text/css" rel="stylesheet" href="//static.pubmed.gov/portal/portal3rc.fcgi/4216699/css/3411343/3882866.css" media="print" /></head>
    <body class="book-part">
        <div class="grid">
            <div class="col twelve_col nomargin shadow">
                <!-- System messages like service outage or JS required; this is handled by the TemplateResources portlet -->
                <div class="sysmessages">
                    <noscript>
	<p class="nojs">
	<strong>Warning:</strong>
	The NCBI web site requires JavaScript to function.
	<a href="/guide/browsers/#enablejs" title="Learn how to enable JavaScript" target="_blank">more...</a>
	</p>
	</noscript>
                </div>
                <!--/.sysmessage-->
                <div class="wrap">
                    <div class="page">
                        <div class="top">
                            <div id="universal_header">
	<section class="usa-banner">
		<div class="usa-accordion">
			<header class="usa-banner-header">
				<div class="usa-grid usa-banner-inner">
					<img src="https://www.ncbi.nlm.nih.gov/coreutils/uswds/img/favicons/favicon-57.png" alt="U.S. flag" />
					<p>An official website of the United States government</p>
					<button class="non-usa-accordion-button usa-banner-button" aria-expanded="false" aria-controls="gov-banner-top" type="button">
						<span class="usa-banner-button-text">Here's how you know</span>
					</button>
				</div>
			</header>
			<div class="usa-banner-content usa-grid usa-accordion-content" id="gov-banner-top" aria-hidden="true">
				<div class="usa-banner-guidance-gov usa-width-one-half">
					<img class="usa-banner-icon usa-media_block-img" src="https://www.ncbi.nlm.nih.gov/coreutils/uswds/img/icon-dot-gov.svg" alt="Dot gov" />
					<div class="usa-media_block-body">
						<p>
							<strong>The .gov means it's official.</strong>
							<br />
							Federal government websites often end in .gov or .mil. Before
							sharing sensitive information, make sure you're on a federal
							government site.
						</p>
					</div>
				</div>
				<div class="usa-banner-guidance-ssl usa-width-one-half">
					<img class="usa-banner-icon usa-media_block-img" src="https://www.ncbi.nlm.nih.gov/coreutils/uswds/img/icon-https.svg" alt="Https" />
					<div class="usa-media_block-body">
						<p>
							<strong>The site is secure.</strong>
							<br />
							The <strong>https://</strong> ensures that you are connecting to the
							official website and that any information you provide is encrypted
							and transmitted securely.
						</p>
					</div>
				</div>
			</div>
		</div>
	</section>
	<div class="usa-overlay"></div>
	<header class="ncbi-header" role="banner" data-section="Header">

		<div class="usa-grid">
			<div class="usa-width-one-whole">

				<div class="ncbi-header__logo">
					<a href="/" class="logo" aria-label="NCBI Logo" data-ga-action="click_image" data-ga-label="NIH NLM Logo">
						<img src="https://www.ncbi.nlm.nih.gov/coreutils/nwds/img/logos/AgencyLogo.svg" alt="NIH NLM Logo" />
					</a>
				</div>

				<div class="ncbi-header__account">
					<a id="account_login" href="https://account.ncbi.nlm.nih.gov" class="usa-button header-button" style="display:none" data-ga-action="open_menu" data-ga-label="account_menu">Log in</a>
					<button id="account_info" class="header-button" style="display:none" aria-controls="account_popup" type="button">
						<span class="fa fa-user" aria-hidden="true">
							<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="20px" height="20px">
								<g style="fill: #fff">
									<ellipse cx="12" cy="8" rx="5" ry="6"></ellipse>
									<path d="M21.8,19.1c-0.9-1.8-2.6-3.3-4.8-4.2c-0.6-0.2-1.3-0.2-1.8,0.1c-1,0.6-2,0.9-3.2,0.9s-2.2-0.3-3.2-0.9    C8.3,14.8,7.6,14.7,7,15c-2.2,0.9-3.9,2.4-4.8,4.2C1.5,20.5,2.6,22,4.1,22h15.8C21.4,22,22.5,20.5,21.8,19.1z"></path>
								</g>
							</svg>
						</span>
						<span class="username desktop-only" aria-hidden="true" id="uname_short"></span>
						<span class="sr-only">Show account info</span>
					</button>
				</div>

				<div class="ncbi-popup-anchor">
					<div class="ncbi-popup account-popup" id="account_popup" aria-hidden="true">
						<div class="ncbi-popup-head">
							<button class="ncbi-close-button" data-ga-action="close_menu" data-ga-label="account_menu" type="button">
								<span class="fa fa-times">
									<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 48 48" width="24px" height="24px">
										<path d="M38 12.83l-2.83-2.83-11.17 11.17-11.17-11.17-2.83 2.83 11.17 11.17-11.17 11.17 2.83 2.83 11.17-11.17 11.17 11.17 2.83-2.83-11.17-11.17z"></path>
									</svg>
								</span>
								<span class="usa-sr-only">Close</span></button>
							<h4>Account</h4>
						</div>
						<div class="account-user-info">
							Logged in as:<br />
							<b><span class="username" id="uname_long">username</span></b>
						</div>
						<div class="account-links">
							<ul class="usa-unstyled-list">
								<li><a id="account_myncbi" href="/myncbi/" class="set-base-url" data-ga-action="click_menu_item" data-ga-label="account_myncbi">Dashboard</a></li>
								<li><a id="account_pubs" href="/myncbi/collections/bibliography/" class="set-base-url" data-ga-action="click_menu_item" data-ga-label="account_pubs">Publications</a></li>
								<li><a id="account_settings" href="/account/settings/" class="set-base-url" data-ga-action="click_menu_item" data-ga-label="account_settings">Account settings</a></li>
								<li><a id="account_logout" href="/account/signout/" class="set-base-url" data-ga-action="click_menu_item" data-ga-label="account_logout">Log out</a></li>
							</ul>
						</div>
					</div>
				</div>

			</div>
		</div>
	</header>
	<div role="navigation" aria-label="access keys">
		<a id="nws_header_accesskey_0" href="https://www.ncbi.nlm.nih.gov/guide/browsers/#ncbi_accesskeys" class="usa-sr-only" accesskey="0" tabindex="-1">Access keys</a>
		<a id="nws_header_accesskey_1" href="https://www.ncbi.nlm.nih.gov" class="usa-sr-only" accesskey="1" tabindex="-1">NCBI Homepage</a>
		<a id="nws_header_accesskey_2" href="/myncbi/" class="set-base-url usa-sr-only" accesskey="2" tabindex="-1">MyNCBI Homepage</a>
		<a id="nws_header_accesskey_3" href="#maincontent" class="usa-sr-only" accesskey="3" tabindex="-1">Main Content</a>
		<a id="nws_header_accesskey_4" href="#" class="usa-sr-only" accesskey="4" tabindex="-1">Main Navigation</a>
	</div>
	<section data-section="Alerts">
		<div class="ncbi-alerts-placeholder"></div>
	</section>
</div>
                            <div class="header">
    <div class="res_logo"><h1 class="res_name"><a href="/books/" title="Bookshelf home">Bookshelf</a></h1><h2 class="res_tagline"></h2></div>
    <div class="search"><form method="get" action="/books/"><div class="search_form"><label for="database" class="offscreen_noflow">Search database</label><select id="database"><optgroup label="Recent"><option value="books" selected="selected" data-ac_dict="bookshelf-search">Books</option><option value="nuccore">Nucleotide</option><option value="gquery">All Databases</option><option value="sra" class="last">SRA</option></optgroup><optgroup label="All"><option value="gquery">All Databases</option><option value="assembly">Assembly</option><option value="biocollections">Biocollections</option><option value="bioproject">BioProject</option><option value="biosample">BioSample</option><option value="books" data-ac_dict="bookshelf-search">Books</option><option value="clinvar">ClinVar</option><option value="cdd">Conserved Domains</option><option value="gap">dbGaP</option><option value="dbvar">dbVar</option><option value="gene">Gene</option><option value="genome">Genome</option><option value="gds">GEO DataSets</option><option value="geoprofiles">GEO Profiles</option><option value="gtr">GTR</option><option value="ipg">Identical Protein Groups</option><option value="medgen">MedGen</option><option value="mesh">MeSH</option><option value="nlmcatalog">NLM Catalog</option><option value="nuccore">Nucleotide</option><option value="omim">OMIM</option><option value="pmc">PMC</option><option value="protein">Protein</option><option value="proteinclusters">Protein Clusters</option><option value="protfam">Protein Family Models</option><option value="pcassay">PubChem BioAssay</option><option value="pccompound">PubChem Compound</option><option value="pcsubstance">PubChem Substance</option><option value="pubmed">PubMed</option><option value="snp">SNP</option><option value="sra">SRA</option><option value="structure">Structure</option><option value="taxonomy">Taxonomy</option><option value="toolkit">ToolKit</option><option value="toolkitall">ToolKitAll</option><option 
value="toolkitbookgh">ToolKitBookgh</option></optgroup></select><div class="nowrap"><label for="term" class="offscreen_noflow" accesskey="/">Search term</label><div class="nowrap"><input type="text" name="term" id="term" title="Search Books. Use up and down arrows to choose an item from the autocomplete." value="" class="jig-ncbiclearbutton jig-ncbiautocomplete" data-jigconfig="dictionary:'bookshelf-search',disableUrl:'NcbiSearchBarAutoComplCtrl'" autocomplete="off" data-sbconfig="ds:'no',pjs:'no',afs:'no'" /></div><button id="search" type="submit" class="button_search nowrap" cmd="go">Search</button></div></div></form><ul class="searchlinks inline_list"><li>
                        <a href="/books/browse/">Browse Titles</a>
                    </li><li>
                        <a href="/books/advanced/">Advanced</a>
                    </li><li class="help">
                        <a href="/books/NBK3833/">Help</a>
                    </li><li class="disclaimer">
                        <a target="_blank" data-ga-category="literature_resources" data-ga-action="link_click" data-ga-label="disclaimer_link" href="https://www.ncbi.nlm.nih.gov/books/about/disclaimer/">Disclaimer</a>
                    </li></ul></div>
</div>


                        <!--<component id="Page" label="headcontent"/>-->

                        </div>
                        <div class="content">
                            <!-- site messages -->
                            <!-- Custom content 1 -->
<div class="col1">

</div>

<div class="container">
    <div id="maincontent" class="content eight_col col">
        <!-- Custom content in the left column above book nav -->
        <div class="col2">

        </div>

        <!-- Book content -->


        <!-- Custom content between navigation and content -->
        <div class="col3">

        </div>

        <div class="document">
            <div class="pre-content"><div><div class="bk_prnt"><p class="small">NCBI Bookshelf. A service of the National Library of Medicine, National Institutes of Health.</p><p>McEntyre J, Ostell J, editors. The NCBI Handbook [Internet]. Bethesda (MD): National Center for Biotechnology Information (US); 2002-. </p></div><div class="bk_msg_box bk_bttm_mrgn clearfix bk_noprnt"><div class="iconblock clearfix"><a class="img_link icnblk_img" title="Table of Contents Page" href="/books/n/handbook2e/"><img class="source-thumb" src="/corehtml/pmc/pmcgifs/bookshelf/thumbs/th-handbook2e-lrg.png" alt="Cover" height="100px" width="80px" /></a><div class="icnblk_cntnt"><ul class="messages"><li class="info icon"><span class="icon"><a href="/books/n/handbook2e/">See "The NCBI Handbook, 2nd Edition"</a></span></li></ul></div></div></div><div class="messagearea bk_noprnt" style="margin-bottom:1.3846em "><ul class="messages"><li class="warn icon"><span class="icon">This publication is provided for historical reference only and the information may be out of date.</span></li></ul></div><div class="bk_prnt"><p style="color:red;"><strong>This publication is provided for historical reference only and the information may be out of date.</strong></p></div><div class="iconblock clearfix whole_rhythm no_top_margin bk_noprnt"><a class="img_link icnblk_img" title="Table of Contents Page" href="/books/n/handbook/"><img class="source-thumb" src="/corehtml/pmc/pmcgifs/bookshelf/thumbs/th-handbook-lrg.png" alt="Cover of The NCBI Handbook" height="100px" width="80px" /></a><div class="icnblk_cntnt eight_col"><h2>The NCBI Handbook [Internet].</h2><a data-jig="ncbitoggler" href="#__NBK21086_dtls__">Show details</a><div style="display:none" class="ui-widget" id="__NBK21086_dtls__"><div>McEntyre J, Ostell J, editors.</div><div>Bethesda (MD): <a href="https://www.ncbi.nlm.nih.gov/" ref="pagearea=page-banner&amp;targetsite=external&amp;targetcat=link&amp;targettype=publisher">National Center for 
Biotechnology Information (US)</a>; 2002-.</div></div><div class="half_rhythm"><ul class="inline_list"><li style="margin-right:1em"><a class="bk_cntns" href="/books/n/handbook/">Contents</a></li></ul></div></div><div class="icnblk_cntnt two_col"><div class="pagination bk_noprnt"><a class="active page_link prev" href="/books/n/handbook/ch13/" title="Previous page in this title">&lt; Prev</a><a class="active page_link next" href="/books/n/handbook/Part3.bxml/" title="Next page in this title">Next &gt;</a></div></div></div></div></div>
            <div class="main-content lit-style" itemscope="itemscope" itemtype="http://schema.org/CreativeWork"><div class="meta-content fm-sec"><h1 id="_NBK21086_"><span class="label">Chapter 14</span><span class="title" itemprop="name">Genome Assembly and Annotation Process</span></h1><p class="contrib-group"><span itemprop="author">Paul Kitts</span>.</p><p class="small">Created: <span itemprop="datePublished">October 9, 2002</span>; Last Update: <span itemprop="dateModified">August 13, 2003</span>.</p><p><em>Estimated reading time: 35 minutes</em></p></div><div class="jig-ncbiinpagenav body-content whole_rhythm" data-jigconfig="allHeadingLevels: ['h2'],smoothScroll: false" itemprop="text"><div id="_abs_rndgid_" itemprop="description"><h2 id="__abs_rndgid__">Summary</h2><p>The primary data produced by genome sequencing projects are often highly fragmented and sparsely annotated. This is especially true for the <a href="http://www.genome.gov/page.cfm?pageID=10001772" ref="pagearea=abstract&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">Human Genome Project</a> as a result of its policy of releasing sequence data to the public sequence databases every day (<a class="bk_pop" href="#A1536">1</a>, <a class="bk_pop" href="#A1537">2</a>). So that individual researchers do not have to piece together extended segments of a genome and then relate the sequence to genetic maps and known genes, <a class="def" href="/books/n/handbook/A1237/def-item/app116/">NCBI</a> provides annotated assemblies of public genome sequence data. NCBI assimilates data of various types, from numerous sources, to provide an integrated view of a genome, making it easier for researchers to spot informative relationships that might not have been apparent from looking at the primary data. 
The annotated genomes can be explored using <a class="def" href="/books/n/handbook/A1237/def-item/app99/">Map Viewer</a> (<a href="/books/n/handbook/ch20/">Chapter 20</a>) to display different types of data side-by-side and to follow links between related pieces of data.</p><p>This chapter describes the series of steps, the &#x0201c;pipeline&#x0201d;, that produces <a class="def" href="/books/n/handbook/A1237/def-item/app116/">NCBI</a>'s annotated genome assembly from data deposited in the public sequence databases. A variant of the annotation process developed for the human genome is used to annotate the mouse genome, and similar procedures will be applied to other genomes (<a href="/books/NBK21086/box/A1742/?report=objectonly" target="object" rid-ob="figobA1742">Box 1</a>).</p><div class="iconblock whole_rhythm clearfix ten_col boxed-text" id="figA1742"><a href="/books/NBK21086/box/A1742/?report=objectonly" target="object" title="Box 1" class="img_link icnblk_img" rid-ob="figobA1742"><img class="small-thumb" src="/corehtml/pmc/css/bookshelf/2.26/img/box-icon.gif" alt="Box Icon" /></a><div class="icnblk_cntnt"><h4 id="A1742"><a href="/books/NBK21086/box/A1742/?report=objectonly" target="object" rid-ob="figobA1742">Box 1</a></h4><p class="float-caption no_bottom_margin">Annotation of other genomes. </p></div></div><p><a class="def" href="/books/n/handbook/A1237/def-item/app116/">NCBI</a> constantly strives to improve the accuracy of its human genome assembly and annotation, to make the data displays more informative, and to enhance the utility of our access tools. Each run through the assembly and annotation procedure, together with feedback from outside groups and individual users, is used to improve the process, refine the parameters for individual steps, and add new features. Consequently, the details of the assembly and annotation process change from one run to the next. 
This chapter, therefore, describes the overall human genome assembly and annotation process and provides short descriptions of the key steps, but it does not detail specific procedures or parameters. However, sufficient detail is provided to enable users of our assembly and annotations to become familiar with the complexities and possible limitations of the data we provide.</p></div><div id="A1441"><h2 id="_A1441_">Overview of the Genome Assembly and Annotation Process</h2><p>
<a class="figpopup" href="/books/NBK21086/figure/A1442/?report=objectonly" target="object" rid-figpopup="figA1442" rid-ob="figobA1442">Figure 1</a> shows how the main steps in the human genome assembly and annotation process are organized and also shows the most significant interdependencies between the steps. The pipeline is not linear, because whenever possible, steps are performed in parallel to reduce the overall time taken to produce an annotated assembly from a new set of data. Some of the steps are run incrementally on a timetable that is independent from that of the main pipeline to produce a new assembly more quickly.
</p><div class="iconblock whole_rhythm clearfix ten_col fig" id="figA1442" co-legend-rid="figlgndA1442"><a href="/books/NBK21086/figure/A1442/?report=objectonly" target="object" title="Figure 1" class="img_link icnblk_img figpopup" rid-figpopup="figA1442" rid-ob="figobA1442"><img class="small-thumb" src="/books/NBK21086/bin/ch14f1.gif" src-large="/books/NBK21086/bin/ch14f1.jpg" alt="Figure 1. The human genome assembly and annotation process." /></a><div class="icnblk_cntnt" id="figlgndA1442"><h4 id="A1442"><a href="/books/NBK21086/figure/A1442/?report=objectonly" target="object" rid-ob="figobA1442">Figure 1</a></h4><p class="float-caption no_bottom_margin">The human genome assembly and annotation process. </p></div></div><div id="A1443"><h3>Data Freeze</h3><p>New sequence data that could be used to improve the genome assembly and annotation become available on a daily basis. Since the assembly and annotation process takes several weeks to complete, the data are &#x0201c;frozen&#x0201d; at the start of the <a class="def" href="/books/n/handbook/A1237/def-item/app209/">build</a> process by making a copy of all of the data available for use at that time. Freezing the data provides a stable set of inputs for the remainder of the build process. Additional or revised data that become available during the period taken to complete the process are not used until the next build.</p></div><div id="A1444"><h3>The Build Cycle</h3><p>A <a class="def" href="/books/n/handbook/A1237/def-item/app209/">build</a> begins with a <a class="def" href="/books/n/handbook/A1237/def-item/app210/">freeze</a> of the input data and ends with the public release of an annotated assembly of genomic sequences (<a class="figpopup" href="/books/NBK21086/figure/A1442/?report=objectonly" target="object" rid-figpopup="figA1442" rid-ob="figobA1442">Figure 1</a>). 
There are usually a few months between builds so that the latest build can be evaluated and improvements can be made.</p></div><div id="A1445"><h3>Steps Run Incrementally</h3><p>The early steps of the genome assembly and annotation pipeline involve many computationally intensive processes, including masking the repetitive sequences and aligning each genomic sequence to the other genomic sequences, mRNAs, and Expressed Sequence Tags (<a href="/books/n/handbook/A1237/#app46">EST</a>s). Running these steps incrementally minimizes the time between starting a new <a class="def" href="/books/n/handbook/A1237/def-item/app209/">build</a> and being ready to start the assembly and annotation steps. Approximately once a week, independent from the build cycle, new or updated sequences that have been deposited in <a class="def" href="/books/n/handbook/A1237/def-item/app62/">GenBank</a> are retrieved for processing. Periodically, old versions of the sequences are purged from the set of accumulated data files.</p><p>The manual refinement of the set of assembled genomic <a href="/books/n/handbook/A1237/#app30">contig</a>s produced entirely from <a href="/books/n/handbook/A1237/#app55">finished</a> sequence is another time-consuming step that is carried out incrementally, approximately once a week.</p></div><div id="A1446"><h3>Steps Run Irregularly</h3><p>Because some data change infrequently, some relatively quick steps are executed on time frames that are not tied to the <a class="def" href="/books/n/handbook/A1237/def-item/app209/">build</a> cycle. 
For example, the list of special cases used to override the automatic process is updated whenever the need becomes apparent.</p></div></div><div id="A1447"><h2 id="_A1447_">The Input Data</h2><p>The main inputs for the genome assembly and annotation process are genomic sequences, transcript sequences, and Sequence Tagged Site (<a class="def" href="/books/n/handbook/A1237/def-item/app173/">STS</a>) maps.</p><div id="A1448"><h3>Human Genomic Sequences</h3><div id="A1449"><h4>Genomic Sequences Used for Assembly</h4><p>Genomic sequences from the following five data sets are processed for use in the assembly:</p><p>
<b>High-Throughput Genomic Sequences.</b> Human high-throughput genomic sequences (<a href="/books/n/handbook/A1237/#app74">HTGS</a>s) are retrieved using the <a href="/books/n/handbook/A1237/#app45">Entrez</a> query system (<a href="/books/n/handbook/ch15/">Chapter 15</a>). The query used returns sequences for all entries that contain the HTG keyword, regardless of whether the sequence is <a href="/books/n/handbook/A1237/#app55">finished</a> or is in any of the unfinished <a href="/books/n/handbook/A1237/#app39">draft</a> phases.</p><p>
<b>Finished Chromosome Sequences.</b> The center that coordinates the sequencing of a finished chromosome submits a specification describing how to <a class="def" href="/books/n/handbook/A1237/def-item/app209/">build</a> the sequence of that chromosome from its component clone sequences; this specification is deposited in a <a href="http://cvsweb.sanger.ac.uk/cgi-bin/cvsweb.cgi/genome-map/data/?cvsroot=Ensembl" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">data repository</a> at the European Bioinformatics Institute (EBI). The sequences specified for any finished chromosomes are retrieved from <a class="def" href="/books/n/handbook/A1237/def-item/app62/">GenBank</a> and included in the set of genomic sequences to be processed for an assembly.</p><p>
<b>Genomic Sequences from the Tiling Paths of Individual Chromosomes.</b> The human genome sequencing centers use a variety of experimental evidence to compile an <a href="http://genome-archive.cse.ucsc.edu/goldenPath/chromReports/" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">ordered list</a> of clones they believe provides the best coverage for each chromosome. At least once every 2 months, the sequencing centers submit an updated <a class="def" href="/books/n/handbook/A1237/def-item/app213/">minimal tiling path</a> for each chromosome to a <a href="http://cvs.sanger.ac.uk/cgi-bin/cvsweb.cgi/genome-map/data/?cvsroot=Ensembl" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">data repository</a> at the EBI. These tiling path files (TPFs) include Accession numbers if sequence for a clone is available. The tiling path repository is checked each day for the Accession numbers from any tiling path that has been updated. Any secondary Accession numbers are replaced with the corresponding primary Accession numbers, and any invalid Accession numbers are flagged to prevent those sequences from being used for assembly. The latest version of the sequence for each <a class="def" href="/books/n/handbook/A1237/def-item/app208/">Accession number</a> in the most recent clone tiling paths is retrieved from <a class="def" href="/books/n/handbook/A1237/def-item/app62/">GenBank</a> and included in the set of genomic sequences to be processed for assembly.</p><p>
<b>Assembled Blocks of Contiguous Finished Genomic Sequence.</b> As the sequences for individual clones are finished, they are merged with overlapping finished sequences to form contigs (<a class="bk_pop" href="#A1538">3</a>). The primary source for identifying neighboring clones is the clone <a class="def" href="/books/n/handbook/A1237/def-item/app182/">tiling path</a> for each chromosome. Additional information is obtained from some <a class="def" href="/books/n/handbook/A1237/def-item/app62/">GenBank</a> entries that contain annotation specifying the neighboring clones. <a href="/books/n/handbook/A1237/#app9">BLAST</a> (<a class="bk_pop" href="#A1539">4</a>) is used to align the sequences of candidate pairs of clones, and a merged sequence is produced automatically if the expected overlap is confirmed by the sequence alignment. When the automatic processes do not find an expected overlap, there is a manual review to find the correct overlap, refining the clone order if necessary. The most recent set of finished contigs is processed for assembly.</p><p>
<b>Additional Genomic Sequences.</b> A few other specific human genomic sequences are added to the assembly set because they contain genes that may not be represented in the genomic sequences from the other sources.</p></div><div id="A1450"><h4>Genomic Sequences Used for Ordering and Orienting</h4><p>Sequences known to come from both ends of the same cloned genomic fragment provide valuable linking information that helps to order and orient sequence contigs in the assembly step (<a class="bk_pop" href="#A1540">5</a>, <a class="bk_pop" href="#A1541">6</a>). The <a href="http://snp.cshl.org" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">SNP Consortium</a> sequenced the ends of the inserts in several million plasmid clones containing small (0.8&#x02013;6 Kbp) fragments of human genomic <a class="def" href="/books/n/handbook/A1237/def-item/app37/">DNA</a>. In many cases, both ends of the same insert were sequenced (Table 4 in Ref. <a class="bk_pop" href="#A1542">7</a>), thereby providing a set of plasmid paired-end sequences.</p></div><div id="A1451"><h4>Genomic Sequences Used for Annotation</h4><p>
<b>Curated Genomic Regions.</b> The Reference Sequence project (<a href="/books/n/handbook/ch18/">Chapter 18</a>; Refs. <a class="bk_pop" href="#A1543">8</a>, <a class="bk_pop" href="#A1544">9</a>) provides reviewed annotated sequences for a number of genomic regions that are difficult to annotate correctly by automated processes (e.g., immunoglobulin gene regions). These Reference Sequences (RefSeqs) are aligned to the assembled genome so that the curated annotation can be transferred to the assembled genomic sequence. RefSeqs for known pseudogenes are also aligned, not only to enable transfer of the correct annotation but, more importantly, to prevent prediction of erroneous model transcripts and proteins.</p><p>
<b><a class="def" href="/books/n/handbook/A1237/def-item/app6/">BAC</a> End Sequences.</b> Sequences from the ends of human genomic inserts in Bacterial Artificial Chromosome (<a href="/books/n/handbook/A1237/#app6">BAC</a>) clones are used to help map the location of specific clones onto the assembled genome sequence. The BAC end sequences are obtained from <a href="/dbGSS/" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">dbGSS</a> (see <a href="/books/n/handbook/ch1/">Chapter 1</a>). The clone names are extracted and converted to a <a href="/genome/clone/nomenclature.shtml" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">standardized format</a> to facilitate linking of the BAC end sequences with mapping data and additional sequences for the same clone, when these are available.</p></div><div id="A1452"><h4>Genomic Sequences Used for Alignment</h4><p>Human genomic sequences that are not used for either assembly or annotation are processed so that their relationships to the assembled genome can be displayed in <a class="def" href="/books/n/handbook/A1237/def-item/app99/">Map Viewer</a> (<a href="/books/n/handbook/ch20/">Chapter 20</a>). Most genomic sequences deposited in <a class="def" href="/books/n/handbook/A1237/def-item/app62/">GenBank</a> by individual scientists will not be HTG and therefore will not be used for assembly; however, they are used for alignment. The exceptions are a few non-HTG sequences that are used for assembly, because they are included in the clone <a class="def" href="/books/n/handbook/A1237/def-item/app182/">tiling path</a> of an individual chromosome or in an assembled block of <a class="def" href="/books/n/handbook/A1237/def-item/app55/">finished sequence</a>. 
Any sequences intended for assembly but not used, either because they are redundant or are rejected by one of the quality screens in the assembly process, are also used for alignment.</p></div></div><div id="A1453"><h3>Human Transcript Sequences</h3><p>Human transcript sequences are used to help order and orient genomic fragments in the assembly step, for feature annotation and also to produce maps that show the locations of the transcripts on the assembled genome. Transcripts used include: (<i>a</i>) human <a class="def" href="/books/n/handbook/A1237/def-item/app114/">mRNA</a> RefSeqs (<a class="bk_pop" href="#A1543">8</a>, <a class="bk_pop" href="#A1544">9</a>), except model transcripts produced from previous rounds of genome annotation; (<i>b</i>) human mRNA sequences deposited in <a class="def" href="/books/n/handbook/A1237/def-item/app62/">GenBank</a> by individual scientists, except those mRNAs produced after a translocation or other rearrangement of the genome; and (<i>c</i>) a nonredundant set of <a class="def" href="/books/n/handbook/A1237/def-item/app46/">EST</a> sequences from the <a class="def" href="/books/n/handbook/A1237/def-item/app9/">BLAST</a> <a href="ftp://ftp.ncbi.nih.gov/blast/db" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=ftp">FTP</a> site. Additional information relating to these EST sequences is obtained from <a href="/UniGene/index.html" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">UniGene</a> (<a href="/books/n/handbook/ch21/">Chapter 21</a>).</p></div><div id="A1454"><h3>Transcripts from Other Organisms</h3><p>Transcripts from other organisms may be aligned to the genome being processed. These data may reveal the location of potential genes not identified by other means. 
<a class="def" href="/books/n/handbook/A1237/def-item/app155/">RefSeq</a> mRNAs, <a class="def" href="/books/n/handbook/A1237/def-item/app62/">GenBank</a> mRNAs, and ESTs, obtained from the same sources that provide the human transcripts, are used. Their alignments are processed for display in <a class="def" href="/books/n/handbook/A1237/def-item/app99/">Map Viewer</a> but are not used in the assembly step or for feature annotation.</p></div><div id="A1455"><h3>Sequence Tagged Site (STS) Maps</h3><p>Genetic linkage maps, radiation hybrid (<a href="/books/n/handbook/A1237/#app157">RH</a>) maps, and a <a href="/books/n/handbook/A1237/#app201">YAC</a> map are used to help avoid assembling genomic contigs incorrectly and to help place the contigs along the chromosomes. The positions of <a href="/books/n/handbook/A1237/#app173">STS</a> markers on various maps (<a class="figpopup" href="/books/NBK21086/table/A1456/?report=objectonly" target="object" rid-figpopup="figA1456" rid-ob="figobA1456">Table 1</a>) are transformed into a common format that allows us to compare the maps to each other during the assembly process. Additional maps are processed so that they can be displayed in <a href="/PMGifs/Genomes/humansearch.html" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">Map Viewer</a>.</p><div class="iconblock whole_rhythm clearfix ten_col table-wrap" id="figA1456"><a href="/books/NBK21086/table/A1456/?report=objectonly" target="object" title="Table 1" class="img_link icnblk_img figpopup" rid-figpopup="figA1456" rid-ob="figobA1456"><img class="small-thumb" src="/books/NBK21086/table/A1456/?report=thumb" src-large="/books/NBK21086/table/A1456/?report=previmg" alt="Table 1. STS maps used for assembly or display." 
/></a><div class="icnblk_cntnt"><h4 id="A1456"><a href="/books/NBK21086/table/A1456/?report=objectonly" target="object" rid-ob="figobA1456">Table 1</a></h4><p class="float-caption no_bottom_margin">STS maps used for assembly or display. </p></div></div><p>The maps listed in <a class="figpopup" href="/books/NBK21086/table/A1456/?report=objectonly" target="object" rid-figpopup="figA1456" rid-ob="figobA1456">Table 1</a> are static and are not updated with additional markers. Any new <a class="def" href="/books/n/handbook/A1237/def-item/app173/">STS</a> maps are added to our data set soon after they are released.</p></div><div id="A1457"><h3>Special Cases</h3><p>Our own review of previous genome assemblies or feedback from users sometimes identifies particular cases in which bad data or overlooked data prevent the automated processes from producing the best possible assembly of a particular segment of the genome. To help guide the assembly process, a list of such special cases is maintained. The list is used to provide supplemental data that override the automatic processes that assign a particular input genomic sequence to a chromosome or determine whether it is used for assembly.</p></div></div><div id="A1458"><h2 id="_A1458_">Preparation of the Input Sequences</h2><p>The raw input genomic sequences are screened for contaminants, the repetitive sequences are masked, and the draft genomic sequences are split into fragments in preparation for alignment to other sequences. The input transcript sequences are also screened for contaminants before they are aligned to the genomic sequences. The <a class="def" href="/books/n/handbook/A1237/def-item/app173/">STS</a> content of the input genomic sequences is determined.</p><div id="A1459"><h3>Preparation of Genomic Sequences</h3><div id="A1460"><h4>Removing Contaminants</h4><p>Draft-quality HTGSs sometimes contain segments of sequence derived from foreign sources, most commonly the cloning vector or bacterial host. 
Finished sequences are usually, but not always, free of such contaminants. Common contaminants introduce artificial blocks of <a class="def" href="/books/n/handbook/A1237/def-item/app73/">homologous</a> sequence that can give rise to misleading alignments between two unrelated genomic sequences. <a href="/books/n/handbook/A1237/#app103">MegaBLAST</a> (<a class="bk_pop" href="#A1545">10</a>) is used to compare the raw genomic sequences to a database of contaminant sequences (including the <a href="/VecScreen/UniVec.html" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">UniVec</a> database of vector sequences, the <i>Escherichia coli</i> genome, bacterial insertion sequences, and bacteriophage). Any foreign segments are removed from draft-quality sequence or masked in <a class="def" href="/books/n/handbook/A1237/def-item/app55/">finished sequence</a> to prevent them from participating in alignments.</p></div><div id="A1461"><h4>Masking of Repetitive Sequences</h4><p>Sequences that occur in many copies in the genome will align to many different clones. Such repetitive sequences include interspersed repeats (SINEs, LINEs, LTR elements, and <a class="def" href="/books/n/handbook/A1237/def-item/app37/">DNA</a> transposons), satellite sequences, and low-complexity sequences (<a class="bk_pop" href="#A1542">7</a>, <a class="bk_pop" href="#A1546">11</a>, <a class="bk_pop" href="#A1547">12</a>). Matches between repetitive sequences on unrelated clones make it difficult to identify alignments that indicate a genuine overlap between clones. To eliminate the confounding matches that are based only on repetitive sequences, the genomic sequences are run through <a href="http://ftp.genome.washington.edu/cgi-bin/RepeatMasker" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">RepeatMasker</a> to identify known repeats. 
Repeats are masked by converting the sequence to lowercase letters so that they do not initiate alignments.</p></div><div id="A1462"><h4>Fragmentation of Draft Sequences</h4><p>Draft HTGSs consist of a set of sequence contigs derived from a particular clone artificially linked together to form a single sequence. The masked, draft genomic sequences are split at the gaps between their constituent contigs to create separate sequence fragments that can be aligned independently. Vector sequences and other contaminants are also removed at this stage by trimming or further splitting the sequence fragments.</p></div><div id="A1463"><h4>Determination of STS Content</h4><p>Any <a class="def" href="/books/n/handbook/A1237/def-item/app173/">STS</a> markers contained within the input genomic sequences are identified by <a href="/genome/sts/epcr.cgi" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">e-PCR</a> (<a class="bk_pop" href="#A1548">13</a>) using the <a href="/entrez/query.fcgi?db=unists" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">UniSTS</a> database. The resulting data are used primarily to relate the genomic sequences to independently derived STS maps (genetic, radiation hybrid) but are also used to identify some foreign sequences.</p></div><div id="A1464"><h4>Filtering</h4><p>Sequences from other clones being sequenced at the same institution can occasionally cross-contaminate draft HTGSs. The contaminating sequences may come from another clone from the same organism or from another organism. 
The raw genomic sequences are screened in several ways to detect cross-contamination: (<i>a</i>) they are compared with the genome sequences from completely sequenced organisms using <a class="def" href="/books/n/handbook/A1237/def-item/app103/">MegaBLAST</a> (<a class="bk_pop" href="#A1545">10</a>); (<i>b</i>) they are screened for the presence of organism-specific interspersed repeats using <a href="http://ftp.genome.washington.edu/cgi-bin/RepeatMasker" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">RepeatMasker</a>; and (<i>c</i>) they are screened for the presence of mapped <a class="def" href="/books/n/handbook/A1237/def-item/app173/">STS</a> markers from other organisms using <a class="def" href="/books/n/handbook/A1237/def-item/app47/">e-PCR</a> (<a class="bk_pop" href="#A1548">13</a>). Any input sequence that contains foreign sequences, repeats, or markers is flagged for removal from the data set used for assembly. Draft sequences longer than the maximum insert length expected for a genomic clone are also rejected because it is likely they are contaminated with sequences from at least one other clone.</p><p>At this stage, draft sequences composed of fragments that are too small to contribute significantly to the assembly or that are tagged with the <a class="def" href="/books/n/handbook/A1237/def-item/app211/">HTGS_CANCELLED</a> keyword are also flagged for removal. Another filter rejects sequences annotated as being from another organism or as being <a class="def" href="/books/n/handbook/A1237/def-item/app158/">RNA</a>, erroneously included in the input sequences.</p></div><div id="A1465"><h4>Chromosome Assignment</h4><p>To improve assembly of the genomic sequences, the input genomic sequences are assigned to a specific chromosome before attempting to merge the sequences. Genomic sequences that appear on any of the chromosome tiling paths are automatically assigned to the designated chromosome. 
Other genomic sequences are assigned to a chromosome based on: (<i>a</i>) annotation on the submitted <a class="def" href="/books/n/handbook/A1237/def-item/app62/">GenBank</a> record; (<i>b</i>) the presence of multiple <a class="def" href="/books/n/handbook/A1237/def-item/app173/">STS</a> markers that have been mapped to the same chromosome; (<i>c</i>) fluorescence <i>in situ</i> hybridization (<a href="/books/n/handbook/A1237/#app56">FISH</a>) mapping (<a class="bk_pop" href="#A1549">14</a>, <a class="bk_pop" href="#A1550">15</a>); or (<i>d</i>) personal communication from a scientist with specialized knowledge. If there is no assignment, or the assignments are conflicted, the sequences are treated as unassigned and assembled without constraint by chromosome.</p></div></div><div id="A1466"><h3>Filtering of Transcript Sequences</h3><p>Transcript sequences that contain sequences derived from vectors or other common contaminants can produce artificial alignments to the assembled genomic sequence. The input transcript sequences are therefore compared with a database of contaminants using <a class="def" href="/books/n/handbook/A1237/def-item/app103/">MegaBLAST</a> (<a class="bk_pop" href="#A1545">10</a>), as described for genomic sequences. Any transcripts with significant matches to the sequence of a contaminant are excluded from the set of transcript sequences used for genome assembly or annotation.</p><p><a class="def" href="/books/n/handbook/A1237/def-item/app114/">mRNA</a> sequences shorter than 300 bases are excluded from the set of sequences that are aligned to the genomic sequences because they are too small to contribute significantly to genome assembly or annotation. 
Also excluded are any mRNA sequences flagged because they do not represent the true sequence of a transcript, e.g., those that are chimeric or contain genomic sequences.</p></div></div><div id="A1467"><h2 id="_A1467_">Alignment of Sequences to the Input Genomic Sequences</h2><p>Alignment of the input genomic sequences to each other and to various other sequences is essential for both genome assembly and genome annotation. All relevant sequences are initially aligned to the unassembled genomic sequences because this means that the computationally intensive alignment processes can be run incrementally at an early stage in the pipeline. If necessary, these alignments are remapped to the sequence of the assembled genome at a later stage by a process that requires relatively little computation.</p><div id="A1468"><h3>Alignment of Genomic Sequences to Each Other</h3><p>Assembly of the genomic sequences from individual clones into longer contiguous sequences (contigs) requires knowledge of which sequences overlap. The overlaps between genomic sequences are evaluated by aligning the sequences from individual genomic clones to each other. After masking of repeats, decontamination and fragmentation, each fragment of genomic sequence is aligned pairwise to all of the other fragments using <a class="def" href="/books/n/handbook/A1237/def-item/app103/">MegaBLAST</a> (<a class="bk_pop" href="#A1545">10</a>). Alignments that are sufficiently long and of sufficiently high percentage identity are saved for consideration in the assembly step.</p></div><div id="A1469"><h3>Alignment of Clone End Sequences to the Genomic Sequences</h3><p>
The pairs of short genomic sequences derived from the ends of plasmid clones help to order and orient sequence fragments in the assembly step. These clone end sequences are aligned to the processed genomic sequences, as described for <i>Alignment of Genomic Sequences to Each Other</i>.</p></div><div id="A1470"><h3>Alignment of Transcripts to the Genomic Sequences</h3><p>Annotation of genes requires knowledge of where the sequences for known transcripts align to the assembled genomic sequences. <a class="def" href="/books/n/handbook/A1237/def-item/app155/">RefSeq</a> <a class="def" href="/books/n/handbook/A1237/def-item/app158/">RNA</a> sequences, <a class="def" href="/books/n/handbook/A1237/def-item/app114/">mRNA</a> sequences from <a class="def" href="/books/n/handbook/A1237/def-item/app62/">GenBank</a>, and <a class="def" href="/books/n/handbook/A1237/def-item/app46/">EST</a> sequences from dbEST are aligned to the processed genomic sequences, as described for <i>Alignment of Genomic Sequences to Each Other</i>. Later, the alignments are remapped to the assembled genomic sequence.</p></div><div id="A1471"><h3>Alignment of Curated Genomic Regions to the Genomic Sequences</h3><p>Curated genomic regions provide accurate annotation for regions of the genome that are difficult to annotate correctly by automated processes. Sequences from curated genomic regions are initially aligned to the unassembled genomic sequences and later remapped to the sequence of the assembled genome, as described for <i>Alignment of Transcripts to the Genomic Sequences</i>.</p></div><div id="A1472"><h3>Alignment of Translated Genomic Sequences to Proteins</h3><p>Homologies between the polypeptides encoded by the genomic sequences and known proteins/conserved protein domains provide hints for the gene prediction process. 
The repeat-masked genomic sequences are compared with a nonredundant database of vertebrate proteins and with the <a class="def" href="/books/n/handbook/A1237/def-item/app116/">NCBI</a> Conserved Domain Database (<a class="def" href="/books/n/handbook/A1237/def-item/app20/">CDD</a>; Ref. <a class="bk_pop" href="#A1551">16</a>) using different versions of <a class="def" href="/books/n/handbook/A1237/def-item/app9/">BLAST</a> (<a class="bk_pop" href="#A1539">4</a>) (BLASTX and <a href="/Structure/cdd/wrpsb.cgi" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">RPS-BLAST</a>, respectively). Significant alignments are saved for use in the gene prediction step.</p></div></div><div id="A1473"><h2 id="_A1473_">Genome Assembly</h2><p>The input genomic sequences are assembled into a series of genomic sequence contigs. These are then ordered, oriented with respect to each other, and placed along each chromosome with appropriately sized gaps inserted between adjacent contigs. The resulting genome assembly thus consists of a set of genomic sequence contigs and a specification for how to arrange the sequence contigs along each chromosome.</p><div id="A1474"><h3>Finished Chromosomes</h3><p>A chromosome sequence is considered <a href="http://www.genome.gov/page.cfm?pageID=10000923" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">finished</a> when any gaps that remain cannot be closed using current cloning and sequencing technology. In practice, therefore, the sequence for a finished chromosome usually consists of a small number of genomic sequence contigs. These are assembled from their component clone sequences according to the <a href="http://cvsweb.sanger.ac.uk/cgi-bin/cvsweb.cgi/genome-map/data/?cvsroot=Ensembl" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">specification</a> provided by the center responsible for sequencing that chromosome. 
This specification also prescribes the order, orientation, and estimated sizes for the gaps between contigs.</p></div><div id="A1475"><h3>Unfinished Chromosomes</h3><p>Genomic sequence contigs for unfinished chromosomes are assembled and laid out based largely on the clone <a href="http://genome-archive.cse.ucsc.edu/goldenPath/chromReports/" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">tiling path</a>. However, the tiling paths do not specify the orientation of the clone sequences or how they should be joined; therefore, data on the alignment of the input genomic sequences to each other and to other sequences are also used to guide the assembly. Genomic sequences that augment the initial set of genomic contigs based on the tiling path clones are also incorporated.</p><div id="A1476"><h4>Resolution of Conflicts in the Chromosome Tiling Paths</h4><p>Before the tiling paths are used in the genome assembly, the order of the finished clone sequences included in the tiling paths is compared with the specifications used to assemble the curated contigs of <a class="def" href="/books/n/handbook/A1237/def-item/app55/">finished sequence</a>. Discrepancies are resolved before proceeding with the assembly. 
Sequence from any clone should appear at just one place in the assembled genome; therefore, if a clone is listed more than once in the tiling paths, only the location with the best evidence is used in the assembly step.</p></div><div id="A1477"><h4>Genomic Sequences Excluded from the Assembly Step</h4><p>Clone sequences that consist only of unassembled reads (HTGS_PHASE0) or that were flagged because of suspected cross-contamination or another problem detected in the pre-assembly screens are not used in the assembly step.</p></div><div id="A1478"><h4>Assembly of the Genomic Sequence Contigs</h4><p>Adjacent, finished clone sequences from the chromosome <a class="def" href="/books/n/handbook/A1237/def-item/app182/">tiling path</a> that have good sequence overlap are merged. Tiling path draft sequences that are adjacent to and overlap the finished clone sequences or other draft clone sequences are added to extend the initial genomic sequence contigs. After that, genomic sequences from clones not on any chromosome tiling path are added, provided they have good overlaps with the assembled tiling path clones. Genomic sequences from additional clones may be added if they provide the sequence for a known gene that is missing from the existing genomic sequence contigs. Finally, the individual fragments of draft sequences are ordered and oriented.</p><p><b>Assembly of Finished Sequences from Tiling Path Clones.</b> The quality of any overlaps between finished clone sequences that are adjacent in the clone <a class="def" href="/books/n/handbook/A1237/def-item/app182/">tiling path</a> is assessed using the alignments between pairs of genomic sequences that were produced in advance. Sequences that have high-quality overlaps, or that are known from annotation or other data to abut, are merged to form a genomic sequence <a class="def" href="/books/n/handbook/A1237/def-item/app30/">contig</a>. 
Clone sequences that have no good overlaps are retained as separate contigs.</p><p><b>Addition of Draft Sequences from Tiling Path Clones.</b> The procedure used for merging <a class="def" href="/books/n/handbook/A1237/def-item/app39/">draft sequence</a> from <a class="def" href="/books/n/handbook/A1237/def-item/app182/">tiling path</a> clones is similar to that described for merging finished sequences, except that the minimum-overlap quality required for merging is different. An overlap involving a draft sequence can contain more mismatches, but must be longer, than an overlap between two finished sequences. Preference is given to finished sequences so that a <a class="def" href="/books/n/handbook/A1237/def-item/app30/">contig</a> made by merging finished and draft sequences will contain the <a class="def" href="/books/n/handbook/A1237/def-item/app55/">finished sequence</a> for the overlapping portion. Draft clone sequences that have no good overlaps are retained as separate contigs.</p><p><b>Addition of Sequences from Other Genomic Clones.</b> Genomic sequences from clones that are not on any chromosome <a class="def" href="/books/n/handbook/A1237/def-item/app182/">tiling path</a> are used to close, or extend into, gaps in the backbone of genomic contigs assembled based on the tiling paths. Genomic sequences that are fully contained within the existing genomic contigs are not used. 
Any remaining genomic sequences that were either assigned to the relevant chromosome or could not be assigned to any chromosome are evaluated, and sequences that have good-quality overlaps with genomic contigs are merged in to extend a <a class="def" href="/books/n/handbook/A1237/def-item/app30/">contig</a> or to join two adjacent contigs, if two additional conditions are met: (1) the <a class="def" href="/books/n/handbook/A1237/def-item/app59/">gap</a> must be sufficiently large to accommodate the additional sequence; and (2) the Sequence Tagged Site (<a class="def" href="/books/n/handbook/A1237/def-item/app173/">STS</a>) marker content of the additional clone sequence must be compatible with that of the flanking clone sequences when compared with various STS maps.</p><p>After all of the chromosomes have been assembled, any remaining genomic clone sequence that contains a known gene not present in the other contigs is added to the assembly as a separate <a class="def" href="/books/n/handbook/A1237/def-item/app30/">contig</a>.</p><p><b>Ordering and Orienting Draft Sequence Fragments.</b> The order and orientation of the fragments of HTGS_PHASE1 <a class="def" href="/books/n/handbook/A1237/def-item/app39/">draft sequence</a> need to be defined before sequence made from contigs that include this category of draft sequence can be completed. Some fragments may be ordered and oriented by overlaps with sequences from adjacent clones. Many more can be defined by aligning them with mRNAs, ESTs, or plasmid paired-end sequences. Any fragments whose order and orientation remain undefined are placed in the nearest open <a class="def" href="/books/n/handbook/A1237/def-item/app59/">gap</a> and given an arbitrary orientation. 
Fragments of draft sequence are connected to flanking sequences and to each other by runs of 100 unknown bases (Ns), which represent an arbitrarily sized gap in the sequence.</p></div><div id="A1483"><h4>Placement of the Genomic Contigs</h4><p>After the genomic sequence contigs are assembled, they are oriented and placed in order along each chromosome with appropriately sized gaps inserted between adjacent contigs. The chromosome <a href="http://genome-archive.cse.ucsc.edu/goldenPath/chromReports/" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">tiling paths</a> specify the order of the clone sequences and the sizes for some gaps. Therefore, the order and orientation of most of the genomic sequence contigs are derived from the tiling paths. Many of the remaining contigs are placed by comparison of the <a class="def" href="/books/n/handbook/A1237/def-item/app173/">STS</a> marker content of the contigs, as determined by <a class="def" href="/books/n/handbook/A1237/def-item/app47/">e-PCR</a> (<a class="bk_pop" href="#A1548">13</a>) using the <a href="/entrez/query.fcgi?db=unists" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">UniSTS</a> database, to various STS maps. There are some contigs that can be assigned to a specific chromosome but cannot be placed along that chromosome. Others cannot even be assigned to a specific chromosome and therefore remain unplaced within the genome assembly.</p><p>Gaps between the clone contigs laid out in the chromosome tiling paths are arbitrarily set at 50 Kbp, and 3 Mbp for the centromere, unless another <a class="def" href="/books/n/handbook/A1237/def-item/app59/">gap</a> size is specified in the <a class="def" href="/books/n/handbook/A1237/def-item/app182/">tiling path</a>. 
Any remaining gaps between genomic sequence contigs are arbitrarily set to 10 Kbp.</p></div></div><div id="A1484"><h3>Preparing a Provisional Genome Assembly</h3><p>A set of sequences and data files is produced to represent the provisional assembly. This set includes: (<i>a</i>) sequences for each genomic <a class="def" href="/books/n/handbook/A1237/def-item/app30/">contig</a> in <a class="def" href="/books/n/handbook/A1237/def-item/app53/">FASTA</a> format; (<i>b</i>) specifications describing how to assemble each genomic contig from its components; and (<i>c</i>) a specification for how to arrange the contigs along each chromosome. A raw <a class="def" href="/books/n/handbook/A1237/def-item/app155/">RefSeq</a> entry is also made for each genomic sequence contig.</p></div><div id="A1485"><h3>Quality Control</h3><p>The provisional assembly is checked for consistency with the chromosome tiling paths and various <a class="def" href="/books/n/handbook/A1237/def-item/app173/">STS</a> maps. The order in which the component clone sequences appear in the assembled chromosomes is compared with their order in the tiling paths on which the assembly was based. The STS marker order along each chromosome in the provisional assembly, as determined by <a class="def" href="/books/n/handbook/A1237/def-item/app47/">e-PCR</a> (<a class="bk_pop" href="#A1548">13</a>) using the <a class="def" href="/books/n/handbook/A1237/def-item/app188/">UniSTS</a> database, is checked for consistency with a set of STS maps. Haussler et al. at the University of California at Santa Cruz (UCSC) also perform a set of independent quality checks on the provisional assembly. In addition to comparing the assembly to the chromosome tiling paths and to various STS maps, they also look for potentially misassembled contigs using alignments of <a class="def" href="/books/n/handbook/A1237/def-item/app6/">BAC</a> end sequences. 
Any serious errors in the assembly may be corrected by repeating the assembly steps using different parameters or by manually editing the assembly.</p></div></div><div id="A1486"><h2 id="_A1486_">Annotation of Genes</h2><p>Identification of genes within the genome assembly reveals the functional significance of particular stretches of genomic sequence. Genes are found using three complementary approaches: (<i>a</i>) known genes are placed primarily by aligning mRNAs to the assembled genomic contigs; (<i>b</i>) additional genes are located based on alignment of ESTs to the assembled genomic contigs; and (<i>c</i>) previously unknown genes are predicted using hints provided by protein homologies. Whenever possible, predicted genes are identified by homology between the protein they encode and known protein sequences.</p><div id="A1487"><h3>Generation of Transcript-based Gene Models</h3><p>Alignments between known transcripts and the assembled genomic sequences are processed to produce gene models. Each gene model consists of an ordered series of exons. 
The transcripts defining each gene model are used as evidence to support that model.</p><div id="A1488"><h4>Alignment of Transcripts to the Assembled Genome</h4><p>The alignments between <a class="def" href="/books/n/handbook/A1237/def-item/app155/">RefSeq</a> <a class="def" href="/books/n/handbook/A1237/def-item/app158/">RNA</a> sequences, <a class="def" href="/books/n/handbook/A1237/def-item/app114/">mRNA</a> and <a class="def" href="/books/n/handbook/A1237/def-item/app46/">EST</a> sequences from <a class="def" href="/books/n/handbook/A1237/def-item/app62/">GenBank</a> and the component genomic sequences are remapped to produce alignments of these transcripts to the assembled genomic contigs.</p></div><div id="A1489"><h4>Production of Candidate Gene Models</h4><p>A candidate gene model is produced from each set of alignments between a particular transcript and one strand of a particular genomic <a class="def" href="/books/n/handbook/A1237/def-item/app30/">contig</a> as follows: (1) putative exons are identified by looking for <a class="def" href="/books/n/handbook/A1237/def-item/app114/">mRNA</a> <a class="def" href="/books/n/handbook/A1237/def-item/app170/">splice sites</a> near the ends of those alignments that satisfy minimum length and percentage identity criteria; (2) a mutually compatible set of exons for the model is selected by applying rules, such as restrictions on the size of an <a class="def" href="/books/n/handbook/A1237/def-item/app86/">intron</a>, that define plausible <a class="def" href="/books/n/handbook/A1237/def-item/app50/">exon</a>&#x02013;intron structures; and (3) <a class="def" href="/books/n/handbook/A1237/def-item/app9/">BLAST</a> (<a class="bk_pop" href="#A1539">4</a>) may be used to produce additional alignments to try to identify exons that were missed because they were too short to be represented in the initial set of transcript alignments. 
Candidate gene models are only retained if good-quality alignments between their exons and the defining transcript cover either more than half the length of the transcript or more than 1 Kbp.</p></div><div id="A1490"><h4>Selection of the Best RefSeq RNA-based Gene Models</h4><p>Each <a class="def" href="/books/n/handbook/A1237/def-item/app155/">RefSeq</a> <a class="def" href="/books/n/handbook/A1237/def-item/app158/">RNA</a> represents a distinct transcript produced from a particular gene (<a href="/books/n/handbook/ch18/">Chapter 18</a>; Refs. <a class="bk_pop" href="#A1543">8</a>, <a class="bk_pop" href="#A1544">9</a>). Hence, there should not be more than one gene model corresponding to any given RefSeq RNA. Therefore, all gene models based on a particular RefSeq RNA are compared, and the best one is selected. Because the RefSeq RNA is taken to be the best representation of a particular transcript, this gene model is preserved without any further modification. Any extra models may represent paralogs; therefore, they are included with the <a class="def" href="/books/n/handbook/A1237/def-item/app114/">mRNA</a>- and <a class="def" href="/books/n/handbook/A1237/def-item/app46/">EST</a>-based models for further processing. Between builds, RefSeq RNAs are refined based on a review of related gene models and transcript alignments produced during the genome annotation process.</p></div><div id="A1491"><h4>Exon Refinement</h4><p>Many gene models may be produced for the same gene because the input data set frequently contains multiple <a class="def" href="/books/n/handbook/A1237/def-item/app46/">EST</a> or <a class="def" href="/books/n/handbook/A1237/def-item/app114/">mRNA</a> sequences representing the same transcript. This redundancy is used to refine the <a class="def" href="/books/n/handbook/A1237/def-item/app170/">splice sites</a> defining a particular <a class="def" href="/books/n/handbook/A1237/def-item/app50/">exon</a>. 
Similar exons are clustered, and splice sites may be adjusted in some models to match those used by the majority of models containing the same exon. Inconsistent models may be discarded at this stage, unless they have sufficient support to be retained as likely splice variants.</p></div><div id="A1492"><h4>Chaining of Transcript-based Gene Models</h4><p>Many of the mRNAs and most of the ESTs used to generate the initial gene models provide sequence for only part of the native transcript. Overlapping gene models that are compatible with each other are combined into an extended model. This chaining step produces models more likely to represent the full gene.</p></div></div><div id="A1493"><h3><i>Ab Initio</i> Gene Prediction</h3><p>
<a href="http://genes.mit.edu/genomescan/" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">GenomeScan</a>, an <i>ab initio</i> gene prediction program, is used to provide models for genes inferred from the genomic sequence using hints provided by protein homologies (<a class="bk_pop" href="#A1552">17</a>). The genes predicted by GenomeScan are combined with the transcript-based gene models, but they are also retained as a distinct set of models that can be viewed or searched separately.</p><div id="A1494"><h4>Dividing the Genomic Sequences into Segments</h4><p><a class="def" href="/books/n/handbook/A1237/def-item/app64/">GenomeScan</a> produces better results when long genomic sequences are broken into shorter segments at putative gene boundaries. The locations of gene models based on <a class="def" href="/books/n/handbook/A1237/def-item/app155/">RefSeq</a> <a class="def" href="/books/n/handbook/A1237/def-item/app158/">RNA</a> alignments are, therefore, used to divide the assembled genomic contigs into segments. Repetitive sequences are masked by remapping the repeats found in the component genomic sequences.</p></div><div id="A1495"><h4>Producing Protein Hints for GenomeScan</h4><p><a class="def" href="/books/n/handbook/A1237/def-item/app64/">GenomeScan</a> can use data on protein homologies to improve its gene predictions (<a class="bk_pop" href="#A1552">17</a>). The locations of genomic sequences that potentially code for polypeptides with homology to other proteins are obtained from three sources. Significant alignments between translated genomic segments and vertebrate proteins are obtained by filtering and remapping the precomputed alignments. Significant alignments between translated genomic sequences and conserved protein domains are obtained in the same manner. A third set of alignments comes from running GenomeScan without any hints. 
The proteins predicted by this initial run are aligned to proteins from <a href="http://www.expasy.ch/sprot/" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">SWISS-PROT</a> (<a class="bk_pop" href="#A1553">18</a>) and <a class="def" href="/books/n/handbook/A1237/def-item/app116/">NCBI</a> <a class="def" href="/books/n/handbook/A1237/def-item/app155/">RefSeq</a> proteins (<a class="bk_pop" href="#A1543">8</a>, <a class="bk_pop" href="#A1544">9</a>) using <a class="def" href="/books/n/handbook/A1237/def-item/app11/">blastp</a> (<a class="bk_pop" href="#A1539">4</a>). The eukaryotic protein with the best match is then aligned to the genomic sequence segments using tblastn (<a class="bk_pop" href="#A1539">4</a>). These three sets of data are converted into the format required by GenomeScan and merged to produce a single set of protein hints.</p></div><div id="A1496"><h4>Predicting Genes Using GenomeScan</h4><p>Each segment of genomic sequence is processed by <a class="def" href="/books/n/handbook/A1237/def-item/app64/">GenomeScan</a> using the combined set of protein homology-based hints as an additional input. This produces one model containing all of the predicted exons for each putative gene. Models with coding sequences shorter than 90 amino acids are discarded. Each remaining model is aligned to proteins from <a class="def" href="/books/n/handbook/A1237/def-item/app175/">SWISS-PROT</a> and <a class="def" href="/books/n/handbook/A1237/def-item/app116/">NCBI</a> <a class="def" href="/books/n/handbook/A1237/def-item/app155/">RefSeq</a> proteins using <a class="def" href="/books/n/handbook/A1237/def-item/app11/">blastp</a>. 
The eukaryotic protein with the best match to any model is used as evidence for that model and to provide a clue as to the possible function of that model.</p></div></div><div id="A1497"><h3>Consolidation of Gene Models</h3><p>Consolidation of the transcript-based gene models and the predicted gene models forms a single set of models. Models are clustered into genes if they share one or more exons or if <a class="def" href="/books/n/handbook/A1237/def-item/app97/">Entrez Gene</a> (<a href="/books/n/handbook/ch19/">Chapter 19</a>; Refs. <a class="bk_pop" href="#A1543">8</a>, <a class="bk_pop" href="#A1544">9</a>) indicates that the transcripts used as evidence for the models come from the same gene. If a model is entirely contained within a longer model, it is redundant and, therefore, eliminated. Sets of identical models are reduced to a single representative model linked to all of the supporting evidence. For sets of very similar models, a single model is picked as a representative, giving preference to models based on <a class="def" href="/books/n/handbook/A1237/def-item/app155/">RefSeq</a> RNAs or on <a class="def" href="/books/n/handbook/A1237/def-item/app62/">GenBank</a> mRNAs. Predicted gene models that significantly overlap transcript-based models but that are not sufficiently similar to consolidate are discarded.</p></div><div id="A1498"><h3>Pruning of Gene Models</h3><p>Some gene models are discarded because: superior gene annotation is available from a curated genomic region, they are likely to represent pseudogenes, or they are incompatible with other gene models.</p><div id="A1499"><h4>Gene Models Superseded by Curated Genomic Regions</h4><p>The manually reviewed annotations from curated genomic region RefSeqs are used in preference to any corresponding gene models generated by automated processing. 
The curated genomic regions are aligned to the assembled genomic contigs by remapping the alignments between these RefSeqs and the component genomic sequences. Any gene model that significantly overlaps a segment of the assembled sequence that corresponds to a curated genomic region is discarded.</p></div><div id="A1500"><h4>Gene Models Likely to Be Pseudogenes</h4><p>When transcripts from a particular gene are aligned to the genomic sequences, they will align not only to the active copy of the gene but also to any segment of the genome containing a <a class="def" href="/books/n/handbook/A1237/def-item/app147/">pseudogene</a> derived from the active gene. Because model transcripts or model proteins that represent nontranscribed pseudogenes are undesirable, an attempt is made to identify and remove such models.</p><p>Whenever possible, alignments of RefSeqs for pseudogenes, either curated genomic regions or RNAs, are used to annotate pseudogenes. Some additional models derived from pseudogenes that are not yet represented by RefSeqs are eliminated by the following mechanism. All models based on the same supporting <a class="def" href="/books/n/handbook/A1237/def-item/app114/">mRNA</a> are compared with respect to the percent identity of the alignments and the number of exons. Only the model with the strongest evidence is retained.</p></div><div id="A1501"><h4>Conflicting Gene Models</h4><p>When two gene models are found to have an extensive overlap, then in general only the model with the stronger evidence is retained. However, models based on RefSeqs are always retained. 
Whereas any model not based on a <a class="def" href="/books/n/handbook/A1237/def-item/app155/">RefSeq</a> is discarded if it overlaps a model that is RefSeq based, two RefSeq-based models that overlap are both retained.</p></div></div><div id="A1502"><h3>Location of Model Coding Regions</h3><p>Initially, the longest open reading frame from each gene model is annotated as the protein coding sequence. This annotation can be revised if evidence associated with that model provides support for an alternative coding region. The protein coding sequence from any transcript used as evidence for a gene model is compared with the longest open reading frame in that model using <a class="def" href="/books/n/handbook/A1237/def-item/app9/">BLAST</a> (<a class="bk_pop" href="#A1539">4</a>). If the two do not match, the conflict is noted, and the annotation is revised if there is evidence to support an alternative coding region. For example, the coding sequence from the transcript evidence may indicate that an alternate <a class="def" href="/books/n/handbook/A1237/def-item/app207/">translation start site</a> is used, or that the model contains a premature termination <a class="def" href="/books/n/handbook/A1237/def-item/app206/">codon</a>. Models with coding regions less than 90 amino acids long are discarded, unless they are based on a <a class="def" href="/books/n/handbook/A1237/def-item/app155/">RefSeq</a>.</p></div><div id="A1503"><h3>Relating Gene Models to Known Genes, Transcripts, and Proteins</h3><p>The set of gene models produced by the preceding steps is a mixture of models for predicted genes and for known genes. To help identify models representing known genes, the model transcripts are compared with known transcripts. 
To help name the predicted genes, the proteins encoded by the models are also compared with known proteins.</p><div id="A1504"><h4>Relating the Model Transcripts to Known Transcripts</h4><p>To provide continuity from <a class="def" href="/books/n/handbook/A1237/def-item/app209/">build</a> to build and to identify genes based on their predicted transcripts, <a class="def" href="/books/n/handbook/A1237/def-item/app103/">MegaBLAST</a> (<a class="bk_pop" href="#A1545">10</a>) is used to compare model RNAs to: (<i>a</i>) <a class="def" href="/books/n/handbook/A1237/def-item/app155/">RefSeq</a> RNAs; (<i>b</i>) mRNAs from <a class="def" href="/books/n/handbook/A1237/def-item/app62/">GenBank</a>; and (<i>c</i>) model RNAs from the previous build. These comparisons are reported as reciprocal best hits if: (<i>a</i>) they produce a significant hit; (<i>b</i>) no other model has a better hit to that particular <a class="def" href="/books/n/handbook/A1237/def-item/app158/">RNA</a>; and (<i>c</i>) no other RNA has a better hit to that particular model.</p></div><div id="A1505"><h4>Relating the Model Proteins to Known Proteins</h4><p>The eukaryotic proteins with the best match to each protein predicted by the annotation process are used to identify the best model for a possible gene and to assign a name to gene models that are novel. 
The proteins encoded by the models are aligned to proteins from <a class="def" href="/books/n/handbook/A1237/def-item/app175/">SWISS-PROT</a> (<a class="bk_pop" href="#A1553">18</a>), <a class="def" href="/books/n/handbook/A1237/def-item/app116/">NCBI</a> <a class="def" href="/books/n/handbook/A1237/def-item/app155/">RefSeq</a> proteins (<a class="bk_pop" href="#A1543">8</a>, <a class="bk_pop" href="#A1544">9</a>), and the NCBI non-redundant protein <a href="/blast/html/blastcgihelp.html#protein_databases" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">database</a> using <a class="def" href="/books/n/handbook/A1237/def-item/app11/">blastp</a> (<a class="bk_pop" href="#A1539">4</a>). The name of the eukaryotic protein with the best match, its sequence identifier, and match score are recorded for each predicted protein with a significant hit.</p></div></div><div id="A1506"><h3>Assigning Gene Identifiers to Models</h3><p>Gene models are attributed to known genes whenever the correspondence is clear. If a model <a class="def" href="/books/n/handbook/A1237/def-item/app158/">RNA</a> has a reciprocal best hit with a known RNA, then the annotation of the known RNA is used to identify the gene. The first models to be assigned to genes are those that have reciprocal best hits with <a class="def" href="/books/n/handbook/A1237/def-item/app155/">RefSeq</a> RNAs. This is followed by assignment of those models that have reciprocal best hits to models from the previous <a class="def" href="/books/n/handbook/A1237/def-item/app209/">build</a> or to <a class="def" href="/books/n/handbook/A1237/def-item/app62/">GenBank</a> mRNAs. 
Gene data for models that match a <a class="def" href="/books/n/handbook/A1237/def-item/app114/">mRNA</a> not yet represented by a RefSeq are obtained from <a class="def" href="/books/n/handbook/A1237/def-item/app116/">NCBI</a> gene-specific databases (currently <a class="def" href="/books/n/handbook/A1237/def-item/app97/">Entrez Gene</a>, <a href="/books/n/handbook/ch19/">Chapter 19</a>). If the mRNA is associated with an entry in one of these databases, then the information attached to that gene record (e.g., symbols, names, and database cross-references) is used in the annotation. If the correspondence with known genes is ambiguous, as may occur if there are undocumented paralogs, then an interim gene identifier is assigned.</p></div><div id="A1511"><h3>Selection of Transcript Models to Represent Each Gene</h3><p>Multiple models based on alternative transcripts for some genes may be produced. In most of these cases, one transcript model is selected to represent the product of the gene for annotation purposes. Any homology between eukaryotic proteins and proteins encoded by the models guides the choice between alternative models. Multiple transcripts are annotated only if the models are based on <a class="def" href="/books/n/handbook/A1237/def-item/app155/">RefSeq</a> mRNAs representing alternative transcripts from the same gene.</p><p>Although alternative transcript models are not annotated, the alignments between the transcripts that represent alternative splicing and genomic contigs are processed for display in <a class="def" href="/books/n/handbook/A1237/def-item/app99/">Map Viewer</a>, Evidence Viewer, and Model Maker (see <a href="/books/n/handbook/ch20/">Chapter 20</a>).</p></div><div id="A1512"><h3>Naming of Gene Products</h3><p>The transcripts and protein products of any models that have been assigned to a known gene are given the product names that appear in the LocusLink entry for that gene. 
The gene products from other genes are named based on any significant homology to other eukaryotic proteins, provided that the matching protein has a meaningful name (i.e., names such as &#x0201c;Hypothetical...&#x0201d; are ignored).</p></div><div id="A1513"><h3>Annotation of the Assembled Genomic Contigs</h3><p>The genomic <a class="def" href="/books/n/handbook/A1237/def-item/app30/">contig</a> RefSeqs are annotated with features that provide information about the location of genes, mRNAs, and coding regions. Features from curated genomic region RefSeqs are copied to the contigs based on the alignment between the curated sequence and the corresponding contig. Protein domains from the Conserved Domain Database (<a class="def" href="/books/n/handbook/A1237/def-item/app20/">CDD</a>; Ref. <a class="bk_pop" href="#A1551">16</a>) are identified using reverse position-specific <a class="def" href="/books/n/handbook/A1237/def-item/app9/">BLAST</a> (<a class="def" href="/books/n/handbook/A1237/def-item/app159/">RPS-BLAST</a>; Ref. <a class="bk_pop" href="#A1539">4</a>), and their locations are annotated. A description of the evidence supporting those RNAs and proteins that are not curated RefSeqs, i.e., those that are models, is also recorded.</p></div></div><div id="A1514"><h2 id="_A1514_">Annotation of Other Features</h2><p>Reference sequences produced by the genome assembly process are annotated with features that provide landmarks valuable for making connections between maps based on different coordinate systems and for associating genes with diseases.</p><div id="A1515"><h3>Annotation of STSs</h3><p>Placement of STSs on the genome assembly allows sequence-based data to be integrated with non-sequence-based maps that contain <a class="def" href="/books/n/handbook/A1237/def-item/app173/">STS</a> markers, such as genetic and radiation hybrid maps. 
STSs are identified by using <a class="def" href="/books/n/handbook/A1237/def-item/app47/">e-PCR</a> (<a class="bk_pop" href="#A1548">13</a>) to find sequences that match the STS primer pairs from <a class="def" href="/books/n/handbook/A1237/def-item/app188/">UniSTS</a>, the spacing of which is consistent with the reported PCR product size. The number of times that each STS appears in the assembled genome is recorded so that only those STSs that appear at only one or two locations in the assembled genome are annotated.</p></div><div id="A1516"><h3>Annotation of Clones</h3><p>Placement on the genome assembly of clones that have been mapped to cytogenetic bands by <a class="def" href="/books/n/handbook/A1237/def-item/app56/">FISH</a> provides the means to determine the correspondence between the sequence and cytogenetic coordinate systems (<a class="bk_pop" href="#A1549">14</a>, <a class="bk_pop" href="#A1550">15</a>). Knowing this correspondence allows the integration of sequence-based data with cytogenetic data. For human, only those clones mapped by fluorescence <i>in situ</i> hybridization (FISH) by the human <a class="def" href="/books/n/handbook/A1237/def-item/app6/">BAC</a> Resource Consortium (see the <a href="/genome/cyto/hbrc.shtml" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">Human BAC Resource</a>) are annotated. Clones are placed using three types of sequence tags. Clones that have sequence for the genomic insert, either draft or finished, with a <a class="def" href="/books/n/handbook/A1237/def-item/app62/">GenBank</a> <a class="def" href="/books/n/handbook/A1237/def-item/app208/">Accession number</a> are localized by remapping the alignment between the clone sequence and other genomic clones to the assembled genomic contigs. Similarly, clones that have BAC end sequences are localized by remapping the alignment between the BAC end sequences and genomic clone sequences to the assembled genomic contigs. 
Clones that have <a class="def" href="/books/n/handbook/A1237/def-item/app173/">STS</a> markers confirmed by <a class="def" href="/books/n/handbook/A1237/def-item/app129/">PCR</a> or hybridization experiments are mapped using the locations in the assembled contigs of STS markers that were identified by <a class="def" href="/books/n/handbook/A1237/def-item/app47/">e-PCR</a>. The number of places that each clone appears in the assembled genome is recorded so that only those clones that either have a unique placement in the assembled genome or are placed twice on the same chromosome are annotated.</p></div><div id="A1517"><h3>Annotation of Sequence Variation</h3><p>Placement of Single Nucleotide Polymorphisms (SNPs) and other variations on the genome provides numerous landmarks that are valuable for associating genes with diseases (<a href="/books/n/handbook/ch5/">Chapter 5</a>). Variations from <a href="/SNP" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">dbSNP</a> (<a class="bk_pop" href="#A1554">19</a>) are placed in their genomic contexts using the sequences that flank the variation. Flanking sequences are first run through <a href="http://ftp.genome.washington.edu/cgi-bin/RepeatMasker" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">RepeatMasker</a> to mask repetitive sequences and then aligned to the assembled genomic sequence contigs using <a class="def" href="/books/n/handbook/A1237/def-item/app103/">MegaBLAST</a> (<a class="bk_pop" href="#A1545">10</a>). 
The resulting matches are classified as either high or low confidence, depending on the quality of the alignment, and the number of matches for each <a class="def" href="/books/n/handbook/A1237/def-item/app168/">SNP</a> is recorded so that only those SNPs that map to one or two locations in the assembled genome are annotated.</p></div></div><div id="A1519"><h2 id="_A1519_">Product Data Sets</h2><p>The products of our assembly and annotation process are made available to the public as RefSeqs of assembled chromosome sequences, genomic sequence contigs, model transcripts, and model proteins. RefSeqs are produced in alternative formats so that they can be retrieved by <a class="def" href="/books/n/handbook/A1237/def-item/app45/">Entrez</a>, <a class="def" href="/books/n/handbook/A1237/def-item/app9/">BLAST</a>, or <a class="def" href="/books/n/handbook/A1237/def-item/app58/">FTP</a>.</p><div id="A1520"><h3>RefSeqs</h3><p>A fully annotated RefSeq entry is made for each genomic sequence <a class="def" href="/books/n/handbook/A1237/def-item/app30/">contig</a>. Separate <a class="def" href="/books/n/handbook/A1237/def-item/app155/">RefSeq</a> model <a class="def" href="/books/n/handbook/A1237/def-item/app158/">RNA</a> and protein entries are also made for any of the transcripts and coding regions annotated on genomic contigs not identified as existing RefSeqs. Finally, a RefSeq entry is made for each chromosome by combining the annotated sequences of the genomic contigs in the appropriate order and with the appropriate spacing. The resulting RefSeqs can be retrieved through <a class="def" href="/books/n/handbook/A1237/def-item/app45/">Entrez</a>.</p></div><div id="A1521"><h3>BLAST Databases</h3><p>The assembled genomic <a class="def" href="/books/n/handbook/A1237/def-item/app30/">contig</a> RefSeqs are formatted as a <a class="def" href="/books/n/handbook/A1237/def-item/app9/">BLAST</a> database (<a href="/books/n/handbook/ch16/">Chapter 16</a>). 
Separate BLAST databases are also produced from the set of transcripts and the set of proteins annotated on the assembled genome. These databases include both known and model RefSeqs. In addition, separate BLAST databases are produced from the complete sets of transcripts and proteins predicted by <a class="def" href="/books/n/handbook/A1237/def-item/app64/">GenomeScan</a>.</p></div><div id="A1522"><h3>Data Files for FTP</h3><p>The annotated genomic sequence <a class="def" href="/books/n/handbook/A1237/def-item/app30/">contig</a>, model transcript, and model protein RefSeqs are saved in <a class="def" href="/books/n/handbook/A1237/def-item/app62/">GenBank</a> flatfile and <a class="def" href="/books/n/handbook/A1237/def-item/app5/">ASN.1</a> formats. The same sets of sequences that are used to make <a class="def" href="/books/n/handbook/A1237/def-item/app9/">BLAST</a> databases are also saved in <a class="def" href="/books/n/handbook/A1237/def-item/app53/">FASTA</a> format. All of these data files, together with files that specify the construction of the genomic contigs and their arrangement along the chromosomes, are made available for download by <a href="ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=ftp">FTP</a>.</p></div></div><div id="A1523"><h2 id="_A1523_">Production of Maps That Display Genome Features</h2><p>We produce many maps showing the locations of various features annotated on our genome assembly. Maps containing whatever combination of features that interests the user can be selected and displayed side-by-side using <a href="/PMGifs/Genomes/MapViewerHelp.html" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">Map Viewer</a> (<a href="/books/n/handbook/ch20/">Chapter 20</a>). 
Detailed descriptions of the maps available for each genome are available in the relevant Genome Map Viewer <a href="/PMGifs/Genomes/humansearch.html" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">help document</a>.</p><div id="A1524"><h3>Preparation of Map Data</h3><p>Basic map data are prepared for each map to identify each feature, delineate its position on the chromosome, and specify how it is to be displayed. For many maps, supplemental data are prepared to provide more information about each feature. <a class="def" href="/books/n/handbook/A1237/def-item/app99/">Map Viewer</a> displays this map-specific supplemental information when users select a particular map as the Master Map (<a href="/books/n/handbook/ch20/">Chapter 20</a>).</p><div id="A1525"><h4>Maps Based on Sequence Coordinates</h4><p>Maps that display those features annotated on the genomic sequence contigs (genes, STSs, clones, and SNPs) are generated by translating the positions of the features on the contigs into chromosome coordinates. Contig coordinates are translated into chromosome coordinates using the positions of the contigs along each chromosome, as determined in the genome assembly step. Using this same method, alignments between various sequences and the genomic contigs are translated into chromosome coordinates to produce additional maps that show the locations of the aligned sequences on the chromosomes. Maps generated from sequence alignments include maps that show the genomic positions of <a class="def" href="/books/n/handbook/A1237/def-item/app114/">mRNA</a> plus <a class="def" href="/books/n/handbook/A1237/def-item/app46/">EST</a> sequences, or genomic sequences from <a class="def" href="/books/n/handbook/A1237/def-item/app62/">GenBank</a>. 
The specifications used to <a class="def" href="/books/n/handbook/A1237/def-item/app209/">build</a> each genomic <a class="def" href="/books/n/handbook/A1237/def-item/app30/">contig</a> are also translated into chromosome coordinates to produce one map that shows the component sequences used to assemble each contig and another that simply shows the finished and draft sections of the contigs.</p></div><div id="A1526"><h4>Maps Based on Other Coordinate Systems</h4><p>Cytogenetic maps, genetic linkage maps, and radiation hybrid maps use different coordinate systems that are not based on sequence. To generate data for these types of maps, the locations of the map elements are listed in the coordinate system appropriate to each map. <a class="def" href="/books/n/handbook/A1237/def-item/app99/">Map Viewer</a> can scale maps defined in different coordinate systems so that they can be displayed side by side.</p></div></div><div id="A1527"><h3>Making the Map Data Available for Use</h3><p>All of the map data for the new genome assembly are loaded into the <a class="def" href="/books/n/handbook/A1237/def-item/app99/">Map Viewer</a> database. Next, the objects in the new maps are indexed so that users can search for and then display specific features (<a href="/books/n/handbook/ch20/">Chapter 20</a>). The data from the Map Viewer database are exported to produce a set of map data files that is made available via <a href="ftp://ftp.ncbi.nih.gov/genomes/H_sapiens/maps/" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=ftp">FTP</a>.</p></div></div><div id="A1528"><h2 id="_A1528_">Public Release of Assembly and Models</h2><p>To ensure that a consistent view of the annotated genome assembly is presented, the release of databases and <a class="def" href="/books/n/handbook/A1237/def-item/app58/">FTP</a> files is coordinated. 
When everything is ready for release, the <a class="def" href="/books/n/handbook/A1237/def-item/app99/">Map Viewer</a> <a href="/mapview/map_search.cgi" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">display</a> is switched to the new <a class="def" href="/books/n/handbook/A1237/def-item/app209/">build</a>, the <a class="def" href="/books/n/handbook/A1237/def-item/app9/">BLAST</a> <a href="/genome/seq/page.cgi?F=HsBlast.html&#x00026;org=Hs" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">databases</a> are swapped, and the files on the <a href="ftp://ftp.ncbi.nlm.nih.gov/genomes/H_sapiens" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=ftp">FTP</a> site are replaced. Several associated databases are then refreshed, including LocusLink, dbSNP, and <a class="def" href="/books/n/handbook/A1237/def-item/app188/">UniSTS</a>, so that the data they contain reflect the new build. Finally, the Web pages that provide <a href="/genome/guide/human/HsStats.html" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">statistics</a> for the build and record <a href="/genome/guide/human/release_notes.html" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">changes</a> to the genome assembly and annotation process are updated.</p></div><div id="A1529"><h2 id="_A1529_">Integration with Other Resources</h2><p>The products of the genome assembly and annotation process are linked extensively to various <a class="def" href="/books/n/handbook/A1237/def-item/app116/">NCBI</a> resources. 
These links provide different views of the data and more information for researchers as they follow a particular line of investigation.</p><div id="A1530"><h3>Links between Map Viewer and Other Resources</h3><p>The maps displayed by <a class="def" href="/books/n/handbook/A1237/def-item/app99/">Map Viewer</a> have embedded links between map objects and relevant <a class="def" href="/books/n/handbook/A1237/def-item/app116/">NCBI</a> resources (<a class="figpopup" href="/books/NBK21086/table/A1531/?report=objectonly" target="object" rid-figpopup="figA1531" rid-ob="figobA1531">Table 2</a>). Many of these resources also have reciprocal links back to Map Viewer, allowing, for example, a gene in LocusLink to be displayed in its genomic context.</p><div class="iconblock whole_rhythm clearfix ten_col table-wrap" id="figA1531"><a href="/books/NBK21086/table/A1531/?report=objectonly" target="object" title="Table 2" class="img_link icnblk_img figpopup" rid-figpopup="figA1531" rid-ob="figobA1531"><img class="small-thumb" src="/books/NBK21086/table/A1531/?report=thumb" src-large="/books/NBK21086/table/A1531/?report=previmg" alt="Table 2. Links from Map Viewer objects to other NCBI resources." /></a><div class="icnblk_cntnt"><h4 id="A1531"><a href="/books/NBK21086/table/A1531/?report=objectonly" target="object" rid-ob="figobA1531">Table 2</a></h4><p class="float-caption no_bottom_margin">Links from Map Viewer objects to other NCBI resources. </p></div></div></div><div id="A1532"><h3>Links between Reference Sequences and Other Resources</h3><p>During the production of RefSeqs, links between the annotated features (clones, genes, SNPs, and STSs) and the relevant resources listed in <a class="figpopup" href="/books/NBK21086/table/A1531/?report=objectonly" target="object" rid-figpopup="figA1531" rid-ob="figobA1531">Table 2</a> are created. 
Links are also made between the genomic <a class="def" href="/books/n/handbook/A1237/def-item/app30/">contig</a> RefSeqs and the RefSeqs for the model transcripts and proteins that they encode.</p></div><div id="A1533"><h3>Integration with BLAST</h3><p>A customized <a class="def" href="/books/n/handbook/A1237/def-item/app9/">BLAST</a> <a href="/genome/seq/page.cgi?F=HsBlast.html&#x00026;ORG=Hs" ref="pagearea=body&amp;targetsite=external&amp;targetcat=link&amp;targettype=uri">Web page</a> allows the comparison of any sequence to a BLAST database of model transcript, model protein, or genomic <a class="def" href="/books/n/handbook/A1237/def-item/app30/">contig</a> RefSeqs. Users can choose to view any hits that result from such a search on a diagram showing the chromosomal location of the hits, with each hit linked to a <a class="def" href="/books/n/handbook/A1237/def-item/app99/">Map Viewer</a> display of the region encompassing the sequence alignment.</p></div></div><div id="A1534"><h2 id="_A1534_">Contributors</h2><p>Richa Agarwala, Jonathan Baker, Hsiu-Chuan Chen, Vyacheslav Chetvernin, Deanna Church, Cliff Clausen, Dmitry Dernovoy, Olga Ermolaeva, Wratko Hlavina, Wonhee Jang, Philip Johnson, Jonathan Kans, Paul Kitts, Alex Lash, David Lipman, Donna Maglott, Jim Ostell, Keith Oxenrider, Kim Pruitt, Sergei Resenchuk, Victor Sapojnikov, Greg Schuler, Steve Sherry, Andrei Shkeda, Alexandre Souvorov, Tugba Suzek, Tatiana Tatusova, Lukas Wagner, and Sarah Wheelan</p></div><div id="A1535"><h2 id="_A1535_">References</h2><dl class="temp-labeled-list"><dt>1.</dt><dd><div class="bk_ref" id="A1536">Bentley DR . Genomic sequence information should be released immediately and freely in the public domain. <span><span class="ref-journal">Science. 
</span>1996;<span class="ref-vol">274</span>:533–534.</span> [<a href="https://pubmed.ncbi.nlm.nih.gov/8928006" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pubmed">PubMed<span class="bk_prnt">: 8928006</span></a>]</div></dd><dt>2.</dt><dd><div class="bk_ref" id="A1537">Guyer M . Statement on the rapid release of genomic DNA sequence. <span><span class="ref-journal">Genome Res. </span>1998;<span class="ref-vol">8</span>:413.</span> [<a href="https://pubmed.ncbi.nlm.nih.gov/9582183" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pubmed">PubMed<span class="bk_prnt">: 9582183</span></a>]</div></dd><dt>3.</dt><dd><div class="bk_ref" id="A1538">Jang W , Chen HC , Sicotte H , Schuler GD . Making effective use of human genomic sequence data. <span><span class="ref-journal">Trends Genet. </span>1999;<span class="ref-vol">15</span>:284–286.</span> [<a href="https://pubmed.ncbi.nlm.nih.gov/10390628" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pubmed">PubMed<span class="bk_prnt">: 10390628</span></a>]</div></dd><dt>4.</dt><dd><div class="bk_ref" id="A1539">Altschul SF , Madden TL , Schaffer AA , Zhang J , Zhang Z , Miller W , Lipman DJ . Gapped BLAST and PSI-BLAST: a new generation of protein database search programs. <span><span class="ref-journal">Nucleic Acids Res. </span>1997;<span class="ref-vol">25</span>:3389–3402.</span> [<a href="/pmc/articles/PMC146917/" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pmc">PMC free article<span class="bk_prnt">: PMC146917</span></a>] [<a href="https://pubmed.ncbi.nlm.nih.gov/9254694" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pubmed">PubMed<span class="bk_prnt">: 9254694</span></a>]</div></dd><dt>5.</dt><dd><div class="bk_ref" id="A1540">Zhao S , Malek J , Mahairas G , Fu L , Nierman W , Venter JC , Adams MD . Human BAC ends quality assessment and sequence analyses. 
<span><span class="ref-journal">Genomics. </span>2000;<span class="ref-vol">63</span>:321–332.</span> [<a href="https://pubmed.ncbi.nlm.nih.gov/10704280" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pubmed">PubMed<span class="bk_prnt">: 10704280</span></a>]</div></dd><dt>6.</dt><dd><div class="bk_ref" id="A1541">Mahairas GG , Wallace JC , Smith K , Swartzell S , Holzman T , Keller A , Shaker R , Furlong J , Young J , Zhao S , Adams MD , Hood L . Sequence-tagged connectors: a sequence approach to mapping and scanning the human genome. <span><span class="ref-journal">Proc Natl Acad Sci U S A. </span>1999;<span class="ref-vol">96</span>:9739–9744.</span> [<a href="/pmc/articles/PMC22280/" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pmc">PMC free article<span class="bk_prnt">: PMC22280</span></a>] [<a href="https://pubmed.ncbi.nlm.nih.gov/10449764" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pubmed">PubMed<span class="bk_prnt">: 10449764</span></a>]</div></dd><dt>7.</dt><dd><div class="bk_ref" id="A1542">Lander ES , Linton LM , Birren B , Nusbaum C , Zody MC , Baldwin J , Devon K , Dewar K , Doyle M , FitzHugh W , Funke R , Gage D , Harris K , Heaford A , Howland J , Kann L , Lehoczky J , LeVine R , McEwan P , McKernan K , Meldrim J , Mesirov JP , Miranda C , Morris W , Naylor J , Raymond C , Rosetti M , Santos R , Sheridan A , Sougnez C , Stange-Thomann N , Stojanovic N , Subramanian A , Wyman D , Rogers J , Sulston J , Ainscough R , Beck S , Bentley D , Burton J , Clee C , Carter N , Coulson A , Deadman R , Deloukas P , Dunham A , Dunham I , Durbin R , French L , Grafham D , Gregory S , Hubbard T , Humphray S , Hunt A , Jones M , Lloyd C , McMurray A , Matthews L , Mercer S , Milne S , Mullikin JC , Mungall A , Plumb R , Ross M , Shownkeen R , Sims S , Waterston RH , Wilson RK , Hillier LW , McPherson JD , Marra MA , Mardis ER , Fulton LA , Chinwalla AT , Pepin KH 
, Gish WR , Chissoe SL , Wendl MC , Delehaunty KD , Miner TL , Delehaunty A , Kramer JB , Cook LL , Fulton RS , Johnson DL , Minx PJ , Clifton SW , Hawkins T , Branscomb E , Predki P , Richardson P , Wenning S , Slezak T , Doggett N , Cheng JF , Olsen A , Lucas S , Elkin C , Uberbacher E , Frazier M , Gibbs RA , Muzny DM , Scherer SE , Bouck JB , Sodergren EJ , Worley KC , Rives CM , Gorrell JH , Metzker ML , Naylor SL , Kucherlapati RS , Nelson DL , Weinstock GM , Sakaki Y , Fujiyama A , Hattori M , Yada T , Toyoda A , Itoh T , Kawagoe C , Watanabe H , Totoki Y , Taylor T , Weissenbach J , Heilig R , Saurin W , Artiguenave F , Brottier P , Bruls T , Pelletier E , Robert C , Wincker P , Smith DR , Doucette-Stamm L , Rubenfield M , Weinstock K , Lee HM , Dubois J , Rosenthal A , Platzer M , Nyakatura G , Taudien S , Rump A , Yang H , Yu J , Wang J , Huang G , Gu J , Hood L , Rowen L , Madan A , Qin S , Davis RW , Federspiel NA , Abola AP , Proctor MJ , Myers RM , Schmutz J , Dickson M , Grimwood J , Cox DR , Olson MV , Kaul R , Raymond C , Shimizu N , Kawasaki K , Minoshima S , Evans GA , Athanasiou M , Schultz R , Roe BA , Chen F , Pan H , Ramser J , Lehrach H , Reinhardt R , McCombie WR , de la Bastide M , Dedhia N , Blocker H , Hornischer K , Nordsiek G , Agarwala R , Aravind L , Bailey JA , Bateman A , Batzoglou S , Birney E , Bork P , Brown DG , Burge CB , Cerutti L , Chen HC , Church D , Clamp M , Copley RR , Doerks T , Eddy SR , Eichler EE , Furey TS , Galagan J , Gilbert JG , Harmon C , Hayashizaki Y , Haussler D , Hermjakob H , Hokamp K , Jang W , Johnson LS , Jones TA , Kasif S , Kaspryzk A , Kennedy S , Kent WJ , Kitts P , Koonin EV , Korf I , Kulp D , Lancet D , Lowe TM , McLysaght A , Mikkelsen T , Moran JV , Mulder N , Pollara VJ , Ponting CP , Schuler G , Schultz J , Slater G , Smit AF , Stupka E , Szustakowski J , Thierry-Mieg D , Thierry-Mieg J , Wagner L , Wallis J , Wheeler R , Williams A , Wolf YI , Wolfe KH , Yang SP , Yeh RF , Collins F , Guyer 
MS , Peterson J , Felsenfeld A , Wetterstrand KA , Patrinos A , Morgan MJ , Szustakowki J , de Jong P , Catanese JJ , Osoegawa K , Shizuya H , Choi S , Chen YJ . Initial sequencing and analysis of the human genome. <span><span class="ref-journal">Nature. </span>2001;<span class="ref-vol">409</span>:860–921.</span> [<a href="https://pubmed.ncbi.nlm.nih.gov/11237011" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pubmed">PubMed<span class="bk_prnt">: 11237011</span></a>]</div></dd><dt>8.</dt><dd><div class="bk_ref" id="A1543">Pruitt KD , Katz KS , Sicotte H , Maglott DR . Introducing RefSeq and LocusLink: curated human genome resources at the NCBI. <span><span class="ref-journal">Trends Genet. </span>2000;<span class="ref-vol">16</span>:44–47.</span> [<a href="https://pubmed.ncbi.nlm.nih.gov/10637631" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pubmed">PubMed<span class="bk_prnt">: 10637631</span></a>]</div></dd><dt>9.</dt><dd><div class="bk_ref" id="A1544">Pruitt KD , Maglott DR . RefSeq and LocusLink: NCBI gene-centered resources. <span><span class="ref-journal">Nucleic Acids Res. </span>2001;<span class="ref-vol">29</span>:137–140.</span> [<a href="/pmc/articles/PMC29787/" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pmc">PMC free article<span class="bk_prnt">: PMC29787</span></a>] [<a href="https://pubmed.ncbi.nlm.nih.gov/11125071" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pubmed">PubMed<span class="bk_prnt">: 11125071</span></a>]</div></dd><dt>10.</dt><dd><div class="bk_ref" id="A1545">Zhang Z , Schwartz S , Wagner L , Miller W . A GREEDY algorithm for aligning DNA sequences. <span><span class="ref-journal">J Comput Biol. 
</span>2000;<span class="ref-vol">7</span>:203–214.</span> [<a href="https://pubmed.ncbi.nlm.nih.gov/10890397" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pubmed">PubMed<span class="bk_prnt">: 10890397</span></a>]</div></dd><dt>11.</dt><dd><div class="bk_ref" id="A1546">Jurka J . Repeats in genomic DNA: mining and meaning. <span><span class="ref-journal">Curr Opin Struct Biol. </span>1998;<span class="ref-vol">8</span>:333–337.</span> [<a href="https://pubmed.ncbi.nlm.nih.gov/9666329" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pubmed">PubMed<span class="bk_prnt">: 9666329</span></a>]</div></dd><dt>12.</dt><dd><div class="bk_ref" id="A1547">Smit AF . Interspersed repeats and other mementos of transposable elements in mammalian genomes. <span><span class="ref-journal">Curr Opin Genet Dev. </span>1999;<span class="ref-vol">9</span>:657–663.</span> [<a href="https://pubmed.ncbi.nlm.nih.gov/10607616" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pubmed">PubMed<span class="bk_prnt">: 10607616</span></a>]</div></dd><dt>13.</dt><dd><div class="bk_ref" id="A1548">Schuler GD . Sequence mapping by electronic PCR. <span><span class="ref-journal">Genome Res. </span>1997;<span class="ref-vol">7</span>:541–550.</span> [<a href="/pmc/articles/PMC310656/" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pmc">PMC free article<span class="bk_prnt">: PMC310656</span></a>] [<a href="https://pubmed.ncbi.nlm.nih.gov/9149949" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pubmed">PubMed<span class="bk_prnt">: 9149949</span></a>]</div></dd><dt>14.</dt><dd><div class="bk_ref" id="A1549">Kirsch IR , Green ED , Yonescu R , Strausberg R , Carter N , Bentley D , Leversha MA , Dunham I , Braden VV , Hilgenfeld E , Schuler G , Lash AE , Shen GL , Martelli M , Kuehl WM , Klausner RD , Ried T . 
A systematic, high-resolution linkage of the cytogenetic and physical maps of the human genome. <span><span class="ref-journal">Nat Genet. </span>2000;<span class="ref-vol">24</span>:339–340.</span> [<a href="https://pubmed.ncbi.nlm.nih.gov/10742091" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pubmed">PubMed<span class="bk_prnt">: 10742091</span></a>]</div></dd><dt>15.</dt><dd><div class="bk_ref" id="A1550">Cheung VG , Nowak N , Jang W , Kirsch IR , Zhao S , Chen XN , Furey TS , Kim UJ , Kuo WL , Olivier M , Conroy J , Kasprzyk A , Massa H , Yonescu R , Sait S , Thoreen C , Snijders A , Lemyre E , Bailey JA , Bruzel A , Burrill WD , Clegg SM , Collins S , Dhami P , Friedman C , Han CS , Herrick S , Lee J , Ligon AH , Lowry S , Morley M , Narasimhan S , Osoegawa K , Peng Z , Plajzer-Frick I , Quade BJ , Scott D , Sirotkin K , Thorpe AA , Gray JW , Hudson J , Pinkel D , Ried T , Rowen L , Shen-Ong GL , Strausberg RL , Birney E , Callen DF , Cheng JF , Cox DR , Doggett NA , Carter NP , Eichler EE , Haussler D , Korenberg JR , Morton CC , Albertson D , Schuler G , de Jong PJ , Trask BJ . Integration of cytogenetic landmarks into the draft sequence of the human genome. <span><span class="ref-journal">Nature. </span>2001;<span class="ref-vol">409</span>:953–958.</span> [<a href="/pmc/articles/PMC7845515/" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pmc">PMC free article<span class="bk_prnt">: PMC7845515</span></a>] [<a href="https://pubmed.ncbi.nlm.nih.gov/11237021" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pubmed">PubMed<span class="bk_prnt">: 11237021</span></a>]</div></dd><dt>16.</dt><dd><div class="bk_ref" id="A1551">Marchler-Bauer A , Panchenko AR , Shoemaker BA , Thiessen PA , Geer LY , Bryant SH . CDD: a database of conserved domain alignments with links to domain three-dimensional structure. <span><span class="ref-journal">Nucleic Acids Res. 
</span>2002;<span class="ref-vol">30</span>:281–283.</span> [<a href="/pmc/articles/PMC99109/" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pmc">PMC free article<span class="bk_prnt">: PMC99109</span></a>] [<a href="https://pubmed.ncbi.nlm.nih.gov/11752315" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pubmed">PubMed<span class="bk_prnt">: 11752315</span></a>]</div></dd><dt>17.</dt><dd><div class="bk_ref" id="A1552">Yeh RF , Lim LP , Burge CB . Computational inference of homologous gene structures in the human genome. <span><span class="ref-journal">Genome Res. </span>2001;<span class="ref-vol">11</span>:803–816.</span> [<a href="/pmc/articles/PMC311055/" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pmc">PMC free article<span class="bk_prnt">: PMC311055</span></a>] [<a href="https://pubmed.ncbi.nlm.nih.gov/11337476" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pubmed">PubMed<span class="bk_prnt">: 11337476</span></a>]</div></dd><dt>18.</dt><dd><div class="bk_ref" id="A1553">Bairoch A , Apweiler R . The SWISS-PROT protein sequence data bank and its supplement TrEMBL in 1998. <span><span class="ref-journal">Nucleic Acids Res. </span>1998;<span class="ref-vol">26</span>:38–42.</span> [<a href="/pmc/articles/PMC147215/" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pmc">PMC free article<span class="bk_prnt">: PMC147215</span></a>] [<a href="https://pubmed.ncbi.nlm.nih.gov/9399796" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pubmed">PubMed<span class="bk_prnt">: 9399796</span></a>]</div></dd><dt>19.</dt><dd><div class="bk_ref" id="A1554">Sherry ST , Ward M , Sirotkin K . dbSNP&#x02014;database for single nucleotide polymorphisms and other classes of minor genetic variation. <span><span class="ref-journal">Genome Res. 
</span>1999;<span class="ref-vol">9</span>:677–679.</span> [<a href="https://pubmed.ncbi.nlm.nih.gov/10447503" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pubmed">PubMed<span class="bk_prnt">: 10447503</span></a>]</div></dd><dt>20.</dt><dd><div class="bk_ref" id="A1555">Dib C , Faure S , Fizames C , Samson D , Drouot N , Vignal A , Millasseau P , Marc S , Hazan J , Seboun E , Lathrop M , Gyapay G , Morissette J , Weissenbach J . A comprehensive genetic map of the human genome based on 5,264 microsatellites. <span><span class="ref-journal">Nature. </span>1996;<span class="ref-vol">380</span>:152–154.</span> [<a href="https://pubmed.ncbi.nlm.nih.gov/8600387" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pubmed">PubMed<span class="bk_prnt">: 8600387</span></a>]</div></dd><dt>21.</dt><dd><div class="bk_ref" id="A1556">Broman KW , Murray JC , Sheffield VC , White RL , Weber JL . Comprehensive human genetic maps: individual and sex-specific variation in recombination. <span><span class="ref-journal">Am J Hum Genet. </span>1998;<span class="ref-vol">63</span>:861–869.</span> [<a href="/pmc/articles/PMC1377399/" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pmc">PMC free article<span class="bk_prnt">: PMC1377399</span></a>] [<a href="https://pubmed.ncbi.nlm.nih.gov/9718341" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pubmed">PubMed<span class="bk_prnt">: 9718341</span></a>]</div></dd><dt>22.</dt><dd><div class="bk_ref" id="A1557">Kong A , Gudbjartsson DF , Sainz J , Jonsdottir GM , Gudjonsson SA , Richardsson B , Sigurdardottir S , Barnard J , Hallbeck B , Masson G , Shlien A , Palsson ST , Frigge ML , Thorgeirsson TE , Gulcher JR , Stefansson K . A high-resolution recombination map of the human genome. <span><span class="ref-journal">Nat Genet. 
</span>2002;<span class="ref-vol">31</span>:241–247.</span> [<a href="https://pubmed.ncbi.nlm.nih.gov/12053178" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pubmed">PubMed<span class="bk_prnt">: 12053178</span></a>]</div></dd><dt>23.</dt><dd><div class="bk_ref" id="A1558">Schuler GD , Boguski MS , Stewart EA , Stein LD , Gyapay G , Rice K , White RE , Rodriguez-Tome P , Aggarwal A , Bajorek E , Bentolila S , Birren BB , Butler A , Castle AB , Chiannilkulchai N , Chu A , Clee C , Cowles S , Day PJ , Dibling T , Drouot N , Dunham I , Duprat S , East C , Hudson TJ . et al. A gene map of the human genome. <span><span class="ref-journal">Science. </span>1996;<span class="ref-vol">274</span>:540–546.</span> [<a href="https://pubmed.ncbi.nlm.nih.gov/8849440" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pubmed">PubMed<span class="bk_prnt">: 8849440</span></a>]</div></dd><dt>24.</dt><dd><div class="bk_ref" id="A1559">Deloukas P , Schuler GD , Gyapay G , Beasley EM , Soderlund C , Rodriguez-Tome P , Hui L , Matise TC , McKusick KB , Beckmann JS , Bentolila S , Bihoreau M , Birren BB , Browne J , Butler A , Castle AB , Chiannilkulchai N , Clee C , Day PJ , Dehejia A , Dibling T , Drouot N , Duprat S , Fizames C , Bentley DR . et al. A physical map of 30,000 human genes. <span><span class="ref-journal">Science. </span>1998;<span class="ref-vol">282</span>:744–746.</span> [<a href="https://pubmed.ncbi.nlm.nih.gov/9784132" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pubmed">PubMed<span class="bk_prnt">: 9784132</span></a>]</div></dd><dt>25.</dt><dd><div class="bk_ref" id="A1560">Agarwala R , Applegate DL , Maglott D , Schuler GD , Schaffer AA . A fast and scalable radiation hybrid map construction and integration strategy. <span><span class="ref-journal">Genome Res. 
</span>2000;<span class="ref-vol">10</span>:350–364.</span> [<a href="/pmc/articles/PMC311427/" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pmc">PMC free article<span class="bk_prnt">: PMC311427</span></a>] [<a href="https://pubmed.ncbi.nlm.nih.gov/10720576" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pubmed">PubMed<span class="bk_prnt">: 10720576</span></a>]</div></dd><dt>26.</dt><dd><div class="bk_ref" id="A1561">Olivier M , Aggarwal A , Allen J , Almendras AA , Bajorek ES , Beasley EM , Brady SD , Bushard JM , Bustos VI , Chu A , Chung TR , De Witte A , Denys ME , Dominguez R , Fang NY , Foster BD , Freudenberg RW , Hadley D , Hamilton LR , Jeffrey TJ , Kelly L , Lazzeroni L , Levy MR , Lewis SC , Liu X , Lopez FJ , Louie B , Marquis JP , Martinez RA , Matsuura MK , Misherghi NS , Norton JA , Olshen A , Perkins SM , Perou AJ , Piercy C , Piercy M , Qin F , Reif T , Sheppard K , Shokoohi V , Smick GA , Sun WL , Stewart EA , Fernando J , Tejeda, Tran NM , Trejo T , Vo NT , Yan SC , Zierten DL , Zhao S , Sachidanandam R , Trask BJ , Myers RM , Cox DR . A high-resolution radiation hybrid map of the human genome draft sequence. <span><span class="ref-journal">Science. </span>2001;<span class="ref-vol">291</span>:1298–1302.</span> [<a href="https://pubmed.ncbi.nlm.nih.gov/11181994" ref="pagearea=cite-ref&amp;targetsite=entrez&amp;targetcat=link&amp;targettype=pubmed">PubMed<span class="bk_prnt">: 11181994</span></a>]</div></dd></dl></div><div id="bk_toc_contnr"></div></div></div>
            <div class="post-content"><div><div class="half_rhythm"><a href="/books/about/copyright/">Copyright Notice</a></div><div class="small"><span class="label">Bookshelf ID: NBK21086</span></div><div style="margin-top:2em" class="bk_noprnt"><a class="bk_cntns" href="/books/n/handbook/">Contents</a><div class="pagination bk_noprnt"><a class="active page_link prev" href="/books/n/handbook/ch13/" title="Previous page in this title">&lt; Prev</a><a class="active page_link next" href="/books/n/handbook/Part3.bxml/" title="Next page in this title">Next &gt;</a></div></div></div></div>

        </div>

        <!-- Custom content below content -->
        <div class="col4">

        </div>


        <!-- Book content -->

        <!-- Custom content below bottom nav -->
        <div class="col5">

        </div>
    </div>

    <div id="rightcolumn" class="four_col col last">
        <!-- Custom content above discovery portlets -->
        <div class="col6">
            <div id="ncbi_share_book"><a href="#" class="ncbi_share" data-ncbi_share_config="popup:false,shorten:true" ref="id=NBK21086&amp;db=books">Share</a></div>

        </div>
        <div xmlns:np="http://ncbi.gov/portal/XSLT/namespace" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"></div><div class="portlet"><div class="portlet_head"><div class="portlet_title"><h3><span>Views</span></h3></div><a name="Shutter" sid="1" href="#" class="portlet_shutter" title="Show/hide content" remembercollapsed="true" pgsec_name="PDF_download" id="Shutter"></a></div><div class="portlet_content"><ul xmlns:np="http://ncbi.gov/portal/XSLT/namespace" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" class="simple-list"><li><a href="/books/NBK21086/?report=reader">PubReader</a></li><li><a href="/books/NBK21086/?report=printable">Print View</a></li><li><a data-jig="ncbidialog" href="#_ncbi_dlg_citbx_NBK21086" data-jigconfig="width:400,modal:true">Cite this Page</a><div id="_ncbi_dlg_citbx_NBK21086" style="display:none" title="Cite this Page"><div class="bk_tt">Kitts P. Genome Assembly and Annotation Process. 2002 Oct 9 [Updated 2003 Aug 13]. In: McEntyre J, Ostell J, editors. The NCBI Handbook [Internet]. Bethesda (MD): National Center for Biotechnology Information (US); 2002-.  
Chapter 14.<span class="bk_cite_avail"></span></div></div></li><li><a href="/books/NBK21086/pdf/Bookshelf_NBK21086.pdf">PDF version of this page</a> (249K)</li><li><a href="/books/n/handbook/pdf/">PDF version of this title</a> (7.2M)</li><li><a href="#" class="toggle-glossary-link" title="Enable/disable links to the glossary">Disable Glossary Links</a></li></ul></div></div><div class="portlet"><div class="portlet_head"><div class="portlet_title"><h3><span>In this Page</span></h3></div><a name="Shutter" sid="1" href="#" class="portlet_shutter" title="Show/hide content" remembercollapsed="true" pgsec_name="page-toc" id="Shutter"></a></div><div class="portlet_content"><ul xmlns:np="http://ncbi.gov/portal/XSLT/namespace" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" class="simple-list"><li><a href="#_abs_rndgid_" ref="log$=inpage&amp;link_id=inpage">Summary</a></li><li><a href="#A1441" ref="log$=inpage&amp;link_id=inpage">Overview of the Genome Assembly and Annotation Process</a></li><li><a href="#A1447" ref="log$=inpage&amp;link_id=inpage">The Input Data</a></li><li><a href="#A1458" ref="log$=inpage&amp;link_id=inpage">Preparation of the Input Sequences</a></li><li><a href="#A1467" ref="log$=inpage&amp;link_id=inpage">Alignment of Sequences to the Input Genomic Sequences</a></li><li><a href="#A1473" ref="log$=inpage&amp;link_id=inpage">Genome Assembly</a></li><li><a href="#A1486" ref="log$=inpage&amp;link_id=inpage">Annotation of Genes</a></li><li><a href="#A1514" ref="log$=inpage&amp;link_id=inpage">Annotation of Other Features</a></li><li><a href="#A1519" ref="log$=inpage&amp;link_id=inpage">Product Data Sets</a></li><li><a href="#A1523" ref="log$=inpage&amp;link_id=inpage">Production of Maps That Display Genome Features</a></li><li><a href="#A1528" ref="log$=inpage&amp;link_id=inpage">Public Release of Assembly and Models</a></li><li><a href="#A1529" ref="log$=inpage&amp;link_id=inpage">Integration with Other Resources</a></li><li><a href="#A1534" 
ref="log$=inpage&amp;link_id=inpage">Contributors</a></li><li><a href="#A1535" ref="log$=inpage&amp;link_id=inpage">References</a></li></ul></div></div><div class="portlet"><div class="portlet_head"><div class="portlet_title"><h3><span>Recent Activity</span></h3></div><a name="Shutter" sid="1" href="#" class="portlet_shutter" title="Show/hide content" remembercollapsed="true" pgsec_name="recent_activity" id="Shutter"></a></div><div class="portlet_content"><div xmlns:np="http://ncbi.gov/portal/XSLT/namespace" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" id="HTDisplay" class=""><div class="action"><a href="javascript:historyDisplayState('ClearHT')">Clear</a><a href="javascript:historyDisplayState('HTOff')" class="HTOn">Turn Off</a><a href="javascript:historyDisplayState('HTOn')" class="HTOff">Turn On</a></div><ul id="activity"><li class="ra_rcd ralinkpopper two_line"><a class="htb ralinkpopperctrl" ref="log$=activity&amp;linkpos=1" href="/portal/utils/pageresolver.fcgi?recordid=67c825c7b70fbb196005df65">Genome Assembly and Annotation Process - The NCBI Handbook</a><div class="ralinkpop offscreen_noflow">Genome Assembly and Annotation Process - The NCBI Handbook<div class="brieflinkpopdesc"></div></div><div class="tertiary"></div></li><li class="ra_rcd ralinkpopper two_line"><a class="htb ralinkpopperctrl" ref="log$=activity&amp;linkpos=2" href="/portal/utils/pageresolver.fcgi?recordid=67c825c6d5edb449bf4326fc">The Processing of Biological Sequence Data at NCBI - The NCBI Handbook</a><div class="ralinkpop offscreen_noflow">The Processing of Biological Sequence Data at NCBI - The NCBI Handbook<div class="brieflinkpopdesc"></div></div><div class="tertiary"></div></li><li class="ra_rcd ralinkpopper two_line"><a class="htb ralinkpopperctrl" ref="log$=activity&amp;linkpos=3" href="/portal/utils/pageresolver.fcgi?recordid=67c825c5d5edb449bf432137">Sequin: A Sequence Submission and Editing Tool - The NCBI Handbook</a><div class="ralinkpop offscreen_noflow">Sequin: A 
Sequence Submission and Editing Tool - The NCBI Handbook<div class="brieflinkpopdesc"></div></div><div class="tertiary"></div></li><li class="ra_rcd ralinkpopper two_line"><a class="htb ralinkpopperctrl" ref="log$=activity&amp;linkpos=4" href="/portal/utils/pageresolver.fcgi?recordid=67c825c4d5edb449bf431d15">Data Flow and Processing - The NCBI Handbook</a><div class="ralinkpop offscreen_noflow">Data Flow and Processing - The NCBI Handbook<div class="brieflinkpopdesc"></div></div><div class="tertiary"></div></li><li class="ra_rcd ralinkpopper two_line"><a class="htb ralinkpopperctrl" ref="log$=activity&amp;linkpos=5" href="/portal/utils/pageresolver.fcgi?recordid=67c825c3d5edb449bf4315a7">The Major Histocompatibility Complex Database, dbMHC - The NCBI Handbook</a><div class="ralinkpop offscreen_noflow">The Major Histocompatibility Complex Database, dbMHC - The NCBI Handbook<div class="brieflinkpopdesc"></div></div><div class="tertiary"></div></li></ul><p class="HTOn">Your browsing activity is empty.</p><p class="HTOff">Activity recording is turned off.</p><p id="turnOn" class="HTOff"><a href="javascript:historyDisplayState('HTOn')">Turn recording back on</a></p><a class="seemore" href="/sites/myncbi/recentactivity">See more...</a></div></div></div>

        <!-- Custom content below discovery portlets -->
        <div class="col7">

        </div>
    </div>
</div>

<!-- Custom content after all -->
<div class="col8">

</div>
<div class="col9">

</div>

<script type="text/javascript" src="/corehtml/pmc/js/jquery.scrollTo-1.4.2.js"></script>
<script type="text/javascript">
    // Skip-link wiring: for every element with class "skiplink", look up the
    // element its href selects, make that element focusable, and on click
    // scroll to it and then move keyboard focus there instead of following
    // the link.
    (function($){
        $('.skiplink').each(function(){
            var $link = $(this);
            // The href value is handed to jQuery as a selector
            // (presumably an in-page "#id" fragment - confirm against the skip-link markup).
            var $target = $($link.attr('href'));

            // tabindex="-1" so the target can receive focus programmatically
            $target.attr('tabindex', '-1');
            $target.addClass('skiptarget');

            $link.on('click', function(evt){
                evt.preventDefault();
                // Focus is moved only from the plugin's onAfter callback.
                $.scrollTo($target, 0, {
                    onAfter: function(){
                        $target.focus();
                    }
                });
            });
        });
    })(jQuery);
</script>
                        </div>
                        <div class="bottom">

                            <div id="NCBIFooter_dynamic">
    <!--<component id="Breadcrumbs" label="breadcrumbs"/>
    <component id="Breadcrumbs" label="helpdesk"/>-->

</div>

                            <div class="footer" id="footer">
	<!-- "Follow NCBI" social links. Each link takes its accessible name from aria-label, so the
	     inline SVG icon inside it is decorative and is hidden from assistive technology.
	     Note: style rules declared inside an inline SVG apply to the whole document; the Facebook
	     and LinkedIn icons below have no style of their own and rely on the .cls-11 rule declared
	     in the sibling Twitter and GitHub SVGs. -->
	<section class="icon-section" aria-labelledby="icon-section-header">
		<div id="icon-section-header" class="icon-section_header">Follow NCBI</div>
		<div class="grid-container container">
			<div class="icon-section_container">
				<a class="footer-icon" id="footer_twitter" href="https://twitter.com/ncbi" aria-label="Twitter"><svg xmlns="http://www.w3.org/2000/svg" data-name="Layer 1" viewBox="0 0 300 300" aria-hidden="true" focusable="false">
					<defs>
						<style>
							.cls-11 {
							fill: #737373;
							}
						</style>
					</defs>
					<title>Twitter</title>
					<path class="cls-11" d="M250.11,105.48c-7,3.14-13,3.25-19.27.14,8.12-4.86,8.49-8.27,11.43-17.46a78.8,78.8,0,0,1-25,9.55,39.35,39.35,0,0,0-67,35.85,111.6,111.6,0,0,1-81-41.08A39.37,39.37,0,0,0,81.47,145a39.08,39.08,0,0,1-17.8-4.92c0,.17,0,.33,0,.5a39.32,39.32,0,0,0,31.53,38.54,39.26,39.26,0,0,1-17.75.68,39.37,39.37,0,0,0,36.72,27.3A79.07,79.07,0,0,1,56,223.34,111.31,111.31,0,0,0,116.22,241c72.3,0,111.83-59.9,111.83-111.84,0-1.71,0-3.4-.1-5.09C235.62,118.54,244.84,113.37,250.11,105.48Z">
					</path>
				</svg></a>
				<a class="footer-icon" id="footer_facebook" href="https://www.facebook.com/ncbi.nlm" aria-label="Facebook"><svg xmlns="http://www.w3.org/2000/svg" data-name="Layer 1" viewBox="0 0 300 300" aria-hidden="true" focusable="false">
					<title>Facebook</title>
					<path class="cls-11" d="M210.5,115.12H171.74V97.82c0-8.14,5.39-10,9.19-10h27.14V52l-39.32-.12c-35.66,0-42.42,26.68-42.42,43.77v19.48H99.09v36.32h27.24v109h45.41v-109h35Z">
					</path>
				</svg></a>
				<a class="footer-icon" id="footer_linkedin" href="https://www.linkedin.com/company/ncbinlm" aria-label="LinkedIn"><svg xmlns="http://www.w3.org/2000/svg" data-name="Layer 1" viewBox="0 0 300 300" aria-hidden="true" focusable="false">
					<title>LinkedIn</title>
					<path class="cls-11" d="M101.64,243.37H57.79v-114h43.85Zm-22-131.54h-.26c-13.25,0-21.82-10.36-21.82-21.76,0-11.65,8.84-21.15,22.33-21.15S101.7,78.72,102,90.38C102,101.77,93.4,111.83,79.63,111.83Zm100.93,52.61A17.54,17.54,0,0,0,163,182v61.39H119.18s.51-105.23,0-114H163v13a54.33,54.33,0,0,1,34.54-12.66c26,0,44.39,18.8,44.39,55.29v58.35H198.1V182A17.54,17.54,0,0,0,180.56,164.44Z">
					</path>
				</svg></a>
				<a class="footer-icon" id="footer_github" href="https://github.com/ncbi" aria-label="GitHub"><svg xmlns="http://www.w3.org/2000/svg" data-name="Layer 1" viewBox="0 0 300 300" aria-hidden="true" focusable="false">
					<defs>
						<style>
							.cls-11,
							.cls-12 {
							fill: #737373;
							}

							.cls-11 {
							fill-rule: evenodd;
							}
						</style>
					</defs>
					<title>GitHub</title>
					<path class="cls-11" d="M151.36,47.28a105.76,105.76,0,0,0-33.43,206.1c5.28,1,7.22-2.3,7.22-5.09,0-2.52-.09-10.85-.14-19.69-29.42,6.4-35.63-12.48-35.63-12.48-4.81-12.22-11.74-15.47-11.74-15.47-9.59-6.56.73-6.43.73-6.43,10.61.75,16.21,10.9,16.21,10.9,9.43,16.17,24.73,11.49,30.77,8.79,1-6.83,3.69-11.5,6.71-14.14C108.57,197.1,83.88,188,83.88,147.51a40.92,40.92,0,0,1,10.9-28.39c-1.1-2.66-4.72-13.42,1-28,0,0,8.88-2.84,29.09,10.84a100.26,100.26,0,0,1,53,0C198,88.3,206.9,91.14,206.9,91.14c5.76,14.56,2.14,25.32,1,28a40.87,40.87,0,0,1,10.89,28.39c0,40.62-24.74,49.56-48.29,52.18,3.79,3.28,7.17,9.71,7.17,19.58,0,14.15-.12,25.54-.12,29,0,2.82,1.9,6.11,7.26,5.07A105.76,105.76,0,0,0,151.36,47.28Z">
					</path>
					<path class="cls-12" d="M85.66,199.12c-.23.52-1.06.68-1.81.32s-1.2-1.06-.95-1.59,1.06-.69,1.82-.33,1.21,1.07.94,1.6Zm-1.3-1">
					</path>
					<path class="cls-12" d="M90,203.89c-.51.47-1.49.25-2.16-.49a1.61,1.61,0,0,1-.31-2.19c.52-.47,1.47-.25,2.17.49s.82,1.72.3,2.19Zm-1-1.08">
					</path>
					<path class="cls-12" d="M94.12,210c-.65.46-1.71,0-2.37-.91s-.64-2.07,0-2.52,1.7,0,2.36.89.65,2.08,0,2.54Zm0,0"></path>
					<path class="cls-12" d="M99.83,215.87c-.58.64-1.82.47-2.72-.41s-1.18-2.06-.6-2.7,1.83-.46,2.74.41,1.2,2.07.58,2.7Zm0,0">
					</path>
					<path class="cls-12" d="M107.71,219.29c-.26.82-1.45,1.2-2.64.85s-2-1.34-1.74-2.17,1.44-1.23,2.65-.85,2,1.32,1.73,2.17Zm0,0">
					</path>
					<path class="cls-12" d="M116.36,219.92c0,.87-1,1.59-2.24,1.61s-2.29-.68-2.3-1.54,1-1.59,2.26-1.61,2.28.67,2.28,1.54Zm0,0">
					</path>
					<path class="cls-12" d="M124.42,218.55c.15.85-.73,1.72-2,1.95s-2.37-.3-2.52-1.14.73-1.75,2-2,2.37.29,2.53,1.16Zm0,0"></path>
				</svg></a>
				<a class="footer-icon" id="footer_blog" href="https://ncbiinsights.ncbi.nlm.nih.gov/" aria-label="Blog">
					<svg xmlns="http://www.w3.org/2000/svg" id="Layer_1" data-name="Layer 1" viewBox="0 0 40 40" aria-hidden="true" focusable="false">
						<defs><style>.cls-1{fill:#737373;}</style></defs>
						<title>NCBI Insights Blog</title>
						<path class="cls-1" d="M14,30a4,4,0,1,1-4-4,4,4,0,0,1,4,4Zm11,3A19,19,0,0,0,7.05,15a1,1,0,0,0-1,1v3a1,1,0,0,0,.93,1A14,14,0,0,1,20,33.07,1,1,0,0,0,21,34h3a1,1,0,0,0,1-1Zm9,0A28,28,0,0,0,7,6,1,1,0,0,0,6,7v3a1,1,0,0,0,1,1A23,23,0,0,1,29,33a1,1,0,0,0,1,1h3A1,1,0,0,0,34,33Z"></path>
					</svg>
				</a>
			</div>
		</div>
	</section>

	<!-- NLM footer: "Connect with NLM" social links, postal address, policy and help links, and a
	     row of links to NLM, NIH, HHS and USA.gov. The social links are named by aria-label, so
	     their inline SVG icons are decorative and hidden from assistive technology. -->
	<section class="container-fluid bg-primary">
		<div class="container pt-5">
			<div class="row mt-3">
				<div class="col-lg-3 col-12">
					<p><a class="text-white" href="https://www.nlm.nih.gov/socialmedia/index.html">Connect with NLM</a></p>
					<ul class="list-inline social_media">
						<li class="list-inline-item"><a href="https://twitter.com/NLM_NIH" aria-label="Twitter" target="_blank" rel="noopener noreferrer"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" x="0px" y="0px" viewBox="0 0 249 249" style="enable-background:new 0 0 249 249;" xml:space="preserve" aria-hidden="true" focusable="false">
                      <style type="text/css">
                        .st20 {
                          fill: #FFFFFF;
                        }

                        .st30 {
                          fill: none;
                          stroke: #FFFFFF;
                          stroke-width: 8;
                          stroke-miterlimit: 10;
                        }
                      </style>
                      <title>Twitter</title>
                      <g>
                        <g>
                          <g>
                            <path class="st20" d="M192.9,88.1c-5,2.2-9.2,2.3-13.6,0.1c5.7-3.4,6-5.8,8.1-12.3c-5.4,3.2-11.4,5.5-17.6,6.7 c-10.5-11.2-28.1-11.7-39.2-1.2c-7.2,6.8-10.2,16.9-8,26.5c-22.3-1.1-43.1-11.7-57.2-29C58,91.6,61.8,107.9,74,116 c-4.4-0.1-8.7-1.3-12.6-3.4c0,0.1,0,0.2,0,0.4c0,13.2,9.3,24.6,22.3,27.2c-4.1,1.1-8.4,1.3-12.5,0.5c3.6,11.3,14,19,25.9,19.3 c-11.6,9.1-26.4,13.2-41.1,11.5c12.7,8.1,27.4,12.5,42.5,12.5c51,0,78.9-42.2,78.9-78.9c0-1.2,0-2.4-0.1-3.6 C182.7,97.4,189.2,93.7,192.9,88.1z"></path>
                          </g>
                        </g>
                        <circle class="st30" cx="124.4" cy="128.8" r="108.2"></circle>
                      </g>
                    </svg></a></li>
						<li class="list-inline-item"><a href="https://www.facebook.com/nationallibraryofmedicine" aria-label="Facebook" rel="noopener noreferrer" target="_blank">
							<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" x="0px" y="0px" viewBox="0 0 249 249" style="enable-background:new 0 0 249 249;" xml:space="preserve" aria-hidden="true" focusable="false">
                      <style type="text/css">
                        .st10 {
                          fill: #FFFFFF;
                        }

                        .st110 {
                          fill: none;
                          stroke: #FFFFFF;
                          stroke-width: 8;
                          stroke-miterlimit: 10;
                        }
                      </style>
                      <title>Facebook</title>
                      <g>
                        <g>
                          <path class="st10" d="M159,99.1h-24V88.4c0-5,3.3-6.2,5.7-6.2h16.8V60l-24.4-0.1c-22.1,0-26.2,16.5-26.2,27.1v12.1H90v22.5h16.9 v67.5H135v-67.5h21.7L159,99.1z"></path>
                        </g>
                      </g>
                      <circle class="st110" cx="123.6" cy="123.2" r="108.2"></circle>
                    </svg>
						</a></li>
						<li class="list-inline-item"><a href="https://www.youtube.com/user/NLMNIH" aria-label="YouTube" target="_blank" rel="noopener noreferrer"><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" version="1.1" x="0px" y="0px" viewBox="0 0 249 249" style="enable-background:new 0 0 249 249;" xml:space="preserve" aria-hidden="true" focusable="false">
                      <title>YouTube</title>
                      <style type="text/css">
                        .st4 {
                          fill: none;
                          stroke: #FFFFFF;
                          stroke-width: 8;
                          stroke-miterlimit: 10;
                        }

                        .st5 {
                          fill: #FFFFFF;
                        }
                      </style>
                      <circle class="st4" cx="124.2" cy="123.4" r="108.2"></circle>
                      <g transform="translate(0,-952.36218)">
                        <path class="st5" d="M88.4,1037.4c-10.4,0-18.7,8.3-18.7,18.7v40.1c0,10.4,8.3,18.7,18.7,18.7h72.1c10.4,0,18.7-8.3,18.7-18.7 v-40.1c0-10.4-8.3-18.7-18.7-18.7H88.4z M115.2,1058.8l29.4,17.4l-29.4,17.4V1058.8z"></path>
                      </g>
                    </svg></a></li>
					</ul>
				</div>
				<div class="col-lg-3 col-12">
					<p class="address_footer text-white">National Library of Medicine<br />
						<a href="https://www.google.com/maps/place/8600+Rockville+Pike,+Bethesda,+MD+20894/@38.9959508,-77.101021,17z/data=!3m1!4b1!4m5!3m4!1s0x89b7c95e25765ddb:0x19156f88b27635b8!8m2!3d38.9959508!4d-77.0988323" class="text-white" target="_blank" rel="noopener noreferrer">8600 Rockville Pike<br />
							Bethesda, MD 20894</a></p>
				</div>
				<div class="col-lg-3 col-12 centered-lg">
					<p><a href="https://www.nlm.nih.gov/web_policies.html" class="text-white">Web Policies</a><br />
						<a href="https://www.nih.gov/institutes-nih/nih-office-director/office-communications-public-liaison/freedom-information-act-office" class="text-white">FOIA</a><br />
						<a href="https://www.hhs.gov/vulnerability-disclosure-policy/index.html" class="text-white" id="vdp">HHS Vulnerability Disclosure</a></p>
				</div>
				<div class="col-lg-3 col-12 centered-lg">
					<p><a class="supportLink text-white" href="https://support.nlm.nih.gov/">Help</a><br />
						<a href="https://www.nlm.nih.gov/accessibility.html" class="text-white">Accessibility</a><br />
						<a href="https://www.nlm.nih.gov/careers/careers.html" class="text-white">Careers</a></p>
				</div>
			</div>
			<div class="row">
				<div class="col-lg-12 centered-lg">
					<!-- Labelled so this navigation landmark can be told apart from any other on the page. -->
					<nav class="bottom-links" aria-label="Government links">
						<ul class="mt-3">
							<li>
								<!-- Explicit https scheme (was protocol-relative), matching the sibling links. -->
								<a class="text-white" href="https://www.nlm.nih.gov/">NLM</a>
							</li>
							<li>
								<a class="text-white" href="https://www.nih.gov/">NIH</a>
							</li>
							<li>
								<a class="text-white" href="https://www.hhs.gov/">HHS</a>
							</li>
							<li>
								<a class="text-white" href="https://www.usa.gov/">USA.gov</a>
							</li>
						</ul>
					</nav>
				</div>
			</div>
		</div>
	</section>
	<script type="text/javascript" src="/portal/portal3rc.fcgi/rlib/js/InstrumentOmnitureBaseJS/InstrumentNCBIConfigJS/InstrumentNCBIBaseJS/InstrumentPageStarterJS.js?v=1"> </script>
	<script type="text/javascript" src="/portal/portal3rc.fcgi/static/js/hfjs2.js"> </script>
</div>
                        </div>
                    </div>
                    <!--/.page-->
                </div>
                <!--/.wrap-->
            </div><!-- /.twelve_col -->
        </div>
        <!-- /.grid -->

        <span class="PAFAppResources"></span>

        <!-- BESelector tab -->


        <!-- Fallback for browsers with JavaScript disabled: requests /stat with jsdisabled=true and
             the same ncbi_* values the page declares in its meta tags. Spaces in the page name are
             percent-encoded so the URL is valid; the image carries no content, hence the empty alt. -->
        <noscript><img alt="" src="/stat?jsdisabled=true&amp;ncbi_db=books&amp;ncbi_pdid=book-part&amp;ncbi_acc=NBK21086&amp;ncbi_domain=handbook&amp;ncbi_report=record&amp;ncbi_type=fulltext&amp;ncbi_objectid=&amp;ncbi_pcid=/NBK21086/&amp;ncbi_pagename=Genome%20Assembly%20and%20Annotation%20Process%20-%20The%20NCBI%20Handbook%20-%20NCBI%20Bookshelf&amp;ncbi_bookparttype=chapter&amp;ncbi_app=bookshelf" /></noscript>


        <!-- usually for JS scripts at page bottom -->
        <!--<component id="PageFixtures" label="styles"></component>-->


<!-- CE8B5AF87C7FFCB1_0191SID /projects/books/PBooks@9.11 portal107 v4.1.r689238 Tue, Oct 22 2024 16:10:51 -->
<span id="portal-csrf-token" style="display:none" data-token="CE8B5AF87C7FFCB1_0191SID"></span>

<!-- Portal JavaScript bundle, loaded with an explicit https scheme instead of a protocol-relative URL. -->
<script type="text/javascript" src="https://static.pubmed.gov/portal/portal3rc.fcgi/4216699/js/3879255/4121861/3501987/4008961/3893018/3821238/4062932/4209313/4212053/4076480/3921943/3400083/3426610.js" snapshot="books"></script></body>
</html>