325 lines
60 KiB
Text
325 lines
60 KiB
Text
<!DOCTYPE html>
|
|
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" class="no-js no-jr">
|
|
<head>
|
|
<!-- For pinger, set start time and add meta elements. -->
|
|
<script type="text/javascript">var ncbi_startTime = new Date();</script>
|
|
|
|
<!-- Logger begin -->
|
|
<meta name="ncbi_db" content="books">
|
|
<meta name="ncbi_pdid" content="book-part">
|
|
<meta name="ncbi_acc" content="NBK21082">
|
|
<meta name="ncbi_domain" content="handbook">
|
|
<meta name="ncbi_report" content="reader">
|
|
<meta name="ncbi_type" content="fulltext">
|
|
<meta name="ncbi_objectid" content="">
|
|
<meta name="ncbi_pcid" content="/NBK21082/?report=reader">
|
|
<meta name="ncbi_pagename" content="The Processing of Biological Sequence Data at NCBI - The NCBI Handbook - NCBI Bookshelf">
|
|
<meta name="ncbi_bookparttype" content="chapter">
|
|
<meta name="ncbi_app" content="bookshelf">
|
|
<!-- Logger end -->
|
|
|
|
<!--component id="Page" label="meta"/-->
|
|
<script type="text/javascript" src="/corehtml/pmc/jatsreader/ptpmc_3.22/js/jr.boots.min.js"> </script><title>The Processing of Biological Sequence Data at NCBI - The NCBI Handbook - NCBI Bookshelf</title>
|
|
<meta charset="utf-8">
|
|
<meta name="apple-mobile-web-app-capable" content="no">
|
|
<meta name="viewport" content="initial-scale=1,minimum-scale=1,maximum-scale=1,user-scalable=no">
|
|
<meta name="jr-col-layout" content="auto">
|
|
<meta name="jr-prev-unit" content="/books/n/handbook/ch12/?report=reader">
|
|
<meta name="jr-next-unit" content="/books/n/handbook/ch14/?report=reader">
|
|
<meta name="bk-toc-url" content="/books/n/handbook/?report=toc">
|
|
<meta name="robots" content="NOINDEX,NOFOLLOW,NOARCHIVE,NOIMAGEINDEX">
|
|
<meta name="citation_inbook_title" content="The NCBI Handbook [Internet]">
|
|
<meta name="citation_title" content="The Processing of Biological Sequence Data at NCBI">
|
|
<meta name="citation_publisher" content="National Center for Biotechnology Information (US)">
|
|
<meta name="citation_date" content="2006/03/14">
|
|
<meta name="citation_author" content="Karl Sirotkin">
|
|
<meta name="citation_author" content="Tatiana Tatusova">
|
|
<meta name="citation_author" content="Eugene Yaschenko">
|
|
<meta name="citation_author" content="Mark Cavanaugh">
|
|
<meta name="citation_fulltext_html_url" content="https://www.ncbi.nlm.nih.gov/books/NBK21082/">
|
|
<link rel="schema.DC" href="http://purl.org/DC/elements/1.0/">
|
|
<meta name="DC.Title" content="The Processing of Biological Sequence Data at NCBI">
|
|
<meta name="DC.Type" content="Text">
|
|
<meta name="DC.Publisher" content="National Center for Biotechnology Information (US)">
|
|
<meta name="DC.Contributor" content="Karl Sirotkin">
|
|
<meta name="DC.Contributor" content="Tatiana Tatusova">
|
|
<meta name="DC.Contributor" content="Eugene Yaschenko">
|
|
<meta name="DC.Contributor" content="Mark Cavanaugh">
|
|
<meta name="DC.Date" content="2006/03/14">
|
|
<meta name="DC.Identifier" content="https://www.ncbi.nlm.nih.gov/books/NBK21082/">
|
|
<meta name="description" content="The biological sequence information that builds the foundation of NCBI's databases and curated resources comes from many sources. How are these data managed and processed once they reach NCBI? This chapter discusses the flow of sequence data, from the management of data submission to the generation of publicly available data products.">
|
|
<meta name="og:title" content="The Processing of Biological Sequence Data at NCBI">
|
|
<meta name="og:type" content="book">
|
|
<meta name="og:description" content="The biological sequence information that builds the foundation of NCBI's databases and curated resources comes from many sources. How are these data managed and processed once they reach NCBI? This chapter discusses the flow of sequence data, from the management of data submission to the generation of publicly available data products.">
|
|
<meta name="og:url" content="https://www.ncbi.nlm.nih.gov/books/NBK21082/">
|
|
<meta name="og:site_name" content="NCBI Bookshelf">
|
|
<meta name="og:image" content="https://www.ncbi.nlm.nih.gov/corehtml/pmc/pmcgifs/bookshelf/thumbs/th-handbook-lrg.png">
|
|
<meta name="twitter:card" content="summary">
|
|
<meta name="twitter:site" content="@ncbibooks">
|
|
<meta name="warning" content="This publication is provided for historical reference only and the information may be out of date.">
|
|
<meta name="bk-non-canon-loc" content="/books/n/handbook/ch13/?report=reader">
|
|
<link rel="canonical" href="https://www.ncbi.nlm.nih.gov/books/NBK21082/">
|
|
<link href="https://fonts.googleapis.com/css?family=Archivo+Narrow:400,700,400italic,700italic&subset=latin" rel="stylesheet" type="text/css">
|
|
<link rel="stylesheet" href="/corehtml/pmc/jatsreader/ptpmc_3.22/css/libs.min.css">
|
|
<link rel="stylesheet" href="/corehtml/pmc/jatsreader/ptpmc_3.22/css/jr.min.css">
|
|
<meta name="format-detection" content="telephone=no">
|
|
<link rel="stylesheet" href="/corehtml/pmc/css/bookshelf/2.26/css/books.min.css" type="text/css">
|
|
<link rel="stylesheet" href="/corehtml/pmc/css/bookshelf/2.26/css/books_print.min.css" type="text/css" media="print">
|
|
<link rel="stylesheet" href="/corehtml/pmc/css/bookshelf/2.26/css/books_reader.min.css" type="text/css">
|
|
<style type="text/css">.main-content {background:transparent repeat-y top left;background-image:url(/corehtml/pmc/css/bookshelf/2.26/img/archive.png);background-size: auto, contain; padding:0 0 0 3em }</style>
|
|
<style type="text/css">p a.figpopup{display:inline !important} .bk_tt {font-family: monospace} .first-line-outdent .bk_ref {display: inline} .body-content h2, .body-content .h2 {border-bottom: 1px solid #97B0C8} .body-content h2.inline {border-bottom: none} a.page-toc-label , .jig-ncbismoothscroll a {text-decoration:none;border:0 !important} .temp-labeled-list .graphic {display:inline-block !important} .temp-labeled-list img{width:100%}</style>
|
|
|
|
<link rel="shortcut icon" href="//www.ncbi.nlm.nih.gov/favicon.ico">
|
|
<meta name="ncbi_phid" content="CE8E60E27D5C01C10000000000CF00B6.m_5">
|
|
<meta name='referrer' content='origin-when-cross-origin'/><link type="text/css" rel="stylesheet" href="//static.pubmed.gov/portal/portal3rc.fcgi/4216699/css/3852956/3849091.css"></head>
|
|
<body>
|
|
<!-- Book content! -->
|
|
|
|
|
|
<div id="jr" data-jr-path="/corehtml/pmc/jatsreader/ptpmc_3.22/"><div class="jr-unsupported"><table class="modal"><tr><td><span class="attn inline-block"></span><br />Your browser does not support the NLM PubReader view.<br />Go to <a href="/pmc/about/pr-browsers/">this page</a> to see a list of supported browsers<br />or return to the <br /><a href="/books/NBK21082/?report=classic">regular view</a>.</td></tr></table></div><div id="jr-ui" class="hidden"><nav id="jr-head"><div class="flexh tb"><div id="jr-tb1"><a id="jr-links-sw" class="hidden" title="Links"><svg xmlns="http://www.w3.org/2000/svg" version="1.1" x="0px" y="0px" viewBox="0 0 70.6 85.3" style="enable-background:new 0 0 70.6 85.3;vertical-align:middle" xml:space="preserve" width="24" height="24">
|
|
<style type="text/css">.st0{fill:#939598;}</style>
|
|
<g>
|
|
<path class="st0" d="M36,0C12.8,2.2-22.4,14.6,19.6,32.5C40.7,41.4-30.6,14,35.9,9.8"></path>
|
|
<path class="st0" d="M34.5,85.3c23.2-2.2,58.4-14.6,16.4-32.5c-21.1-8.9,50.2,18.5-16.3,22.7"></path>
|
|
<path class="st0" d="M34.7,37.1c66.5-4.2-4.8-31.6,16.3-22.7c42.1,17.9,6.9,30.3-16.4,32.5h1.7c-66.2,4.4,4.8,31.6-16.3,22.7 c-42.1-17.9-6.9-30.3,16.4-32.5"></path>
|
|
</g>
|
|
</svg> Books</a></div><div class="jr-rhead f1 flexh"><div class="head"><a href="/books/n/handbook/ch12/?report=reader"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100" preserveAspectRatio="none"><path d="M75,30 c-80,60 -80,0 0,60 c-30,-60 -30,0 0,-60"></path><text x="20" y="28" textLength="60" style="font-size:25px">Prev</text></svg></a></div><div class="body"><div class="t">Chapter 13, The Processing of Biological Sequence Data at NCBI</div><div class="j">The NCBI Handbook [Internet]</div></div><div class="tail"><a href="/books/n/handbook/ch14/?report=reader"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100" preserveAspectRatio="none"><path d="M25,30c80,60 80,0 0,60 c30,-60 30,0 0,-60"></path><text x="20" y="28" textLength="60" style="font-size:25px">Next</text></svg></a></div></div><div id="jr-tb2"><a id="jr-bkhelp-sw" class="btn wsprkl hidden" title="Help with NLM PubReader">?</a><a id="jr-help-sw" class="btn wsprkl hidden" title="Settings and typography in NLM PubReader"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" preserveAspectRatio="none"><path d="M462,283.742v-55.485l-29.981-10.662c-11.431-4.065-20.628-12.794-25.274-24.001 c-0.002-0.004-0.004-0.009-0.006-0.013c-4.659-11.235-4.333-23.918,0.889-34.903l13.653-28.724l-39.234-39.234l-28.72,13.652 c-10.979,5.219-23.68,5.546-34.908,0.889c-0.005-0.002-0.01-0.003-0.014-0.005c-11.215-4.65-19.933-13.834-24-25.273L283.741,50 h-55.484l-10.662,29.981c-4.065,11.431-12.794,20.627-24.001,25.274c-0.005,0.002-0.009,0.004-0.014,0.005 c-11.235,4.66-23.919,4.333-34.905-0.889l-28.723-13.653l-39.234,39.234l13.653,28.721c5.219,10.979,5.545,23.681,0.889,34.91 c-0.002,0.004-0.004,0.009-0.006,0.013c-4.649,11.214-13.834,19.931-25.271,23.998L50,228.257v55.485l29.98,10.661 c11.431,4.065,20.627,12.794,25.274,24c0.002,0.005,0.003,0.01,0.005,0.014c4.66,11.236,4.334,23.921-0.888,34.906l-13.654,28.723 
l39.234,39.234l28.721-13.652c10.979-5.219,23.681-5.546,34.909-0.889c0.005,0.002,0.01,0.004,0.014,0.006 c11.214,4.649,19.93,13.833,23.998,25.271L228.257,462h55.484l10.595-29.79c4.103-11.538,12.908-20.824,24.216-25.525 c0.005-0.002,0.009-0.004,0.014-0.006c11.127-4.628,23.694-4.311,34.578,0.863l28.902,13.738l39.234-39.234l-13.66-28.737 c-5.214-10.969-5.539-23.659-0.886-34.877c0.002-0.005,0.004-0.009,0.006-0.014c4.654-11.225,13.848-19.949,25.297-24.021 L462,283.742z M256,331.546c-41.724,0-75.548-33.823-75.548-75.546s33.824-75.547,75.548-75.547 c41.723,0,75.546,33.824,75.546,75.547S297.723,331.546,256,331.546z"></path></svg></a><a id="jr-fip-sw" class="btn wsprkl hidden" title="Find"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 550 600" preserveAspectRatio="none"><path fill="none" stroke="#000" stroke-width="36" stroke-linecap="round" style="fill:#FFF" d="m320,350a153,153 0 1,0-2,2l170,170m-91-117 110,110-26,26-110-110"></path></svg></a><a id="jr-rtoc-sw" class="btn wsprkl hidden" title="Table of Contents"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100" preserveAspectRatio="none"><path d="M20,20h10v8H20V20zM36,20h44v8H36V20zM20,37.33h10v8H20V37.33zM36,37.33h44v8H36V37.33zM20,54.66h10v8H20V54.66zM36,54.66h44v8H36V54.66zM20,72h10v8 H20V72zM36,72h44v8H36V72z"></path></svg></a></div></div></nav><nav id="jr-dash" class="noselect"><nav id="jr-dash" class="noselect"><div id="jr-pi" class="hidden"><a id="jr-pi-prev" class="hidden" title="Previous page"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100" preserveAspectRatio="none"><path d="M75,30 c-80,60 -80,0 0,60 c-30,-60 -30,0 0,-60"></path><text x="20" y="28" textLength="60" style="font-size:25px">Prev</text></svg></a><div class="pginfo">Page <i class="jr-pg-pn">0</i> of <i class="jr-pg-lp">0</i></div><a id="jr-pi-next" class="hidden" title="Next page"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100" preserveAspectRatio="none"><path d="M25,30c80,60 80,0 0,60 c30,-60 30,0 
0,-60"></path><text x="20" y="28" textLength="60" style="font-size:25px">Next</text></svg></a></div><div id="jr-is-tb"><a id="jr-is-sw" class="btn wsprkl hidden" title="Switch between Figures/Tables strip and Progress bar"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100" preserveAspectRatio="none"><rect x="10" y="40" width="20" height="20"></rect><rect x="40" y="40" width="20" height="20"></rect><rect x="70" y="40" width="20" height="20"></rect></svg></a></div><nav id="jr-istrip" class="istrip hidden"><a id="jr-is-prev" href="#" class="hidden" title="Previous"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100" preserveAspectRatio="none"><path d="M80,40 60,65 80,90 70,90 50,65 70,40z M50,40 30,65 50,90 40,90 20,65 40,40z"></path><text x="35" y="25" textLength="60" style="font-size:25px">Prev</text></svg></a><a id="jr-is-next" href="#" class="hidden" title="Next"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100" preserveAspectRatio="none"><path d="M20,40 40,65 20,90 30,90 50,65 30,40z M50,40 70,65 50,90 60,90 80,65 60,40z"></path><text x="15" y="25" textLength="60" style="font-size:25px">Next</text></svg></a></nav><nav id="jr-progress"></nav></nav></nav><aside id="jr-links-p" class="hidden flexv"><div class="tb sk-htbar flexh"><div><a class="jr-p-close btn wsprkl">Done</a></div><div class="title-text f1">NCBI Bookshelf</div></div><div class="cnt lol f1"><a href="/books/">Home</a><a href="/books/browse/">Browse All Titles</a><a class="btn share" target="_blank" rel="noopener noreferrer" href="https://www.facebook.com/sharer/sharer.php?u=https://www.ncbi.nlm.nih.gov/books/NBK21082/"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 33 33" style="vertical-align:middle" width="24" height="24" preserveAspectRatio="none"><g><path d="M 17.996,32L 12,32 L 12,16 l-4,0 l0-5.514 l 4-0.002l-0.006-3.248C 11.993,2.737, 13.213,0, 18.512,0l 4.412,0 l0,5.515 l-2.757,0 c-2.063,0-2.163,0.77-2.163,2.209l-0.008,2.76l 4.959,0 l-0.585,5.514L 18,16L 
17.996,32z"></path></g></svg> Share on Facebook</a><a class="btn share" target="_blank" rel="noopener noreferrer" href="https://twitter.com/intent/tweet?url=https://www.ncbi.nlm.nih.gov/books/NBK21082/&text=The%20Processing%20of%20Biological%20Sequence%20Data%20at%20NCBI"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 33 33" style="vertical-align:middle" width="24" height="24"><g><path d="M 32,6.076c-1.177,0.522-2.443,0.875-3.771,1.034c 1.355-0.813, 2.396-2.099, 2.887-3.632 c-1.269,0.752-2.674,1.299-4.169,1.593c-1.198-1.276-2.904-2.073-4.792-2.073c-3.626,0-6.565,2.939-6.565,6.565 c0,0.515, 0.058,1.016, 0.17,1.496c-5.456-0.274-10.294-2.888-13.532-6.86c-0.565,0.97-0.889,2.097-0.889,3.301 c0,2.278, 1.159,4.287, 2.921,5.465c-1.076-0.034-2.088-0.329-2.974-0.821c-0.001,0.027-0.001,0.055-0.001,0.083 c0,3.181, 2.263,5.834, 5.266,6.438c-0.551,0.15-1.131,0.23-1.73,0.23c-0.423,0-0.834-0.041-1.235-0.118 c 0.836,2.608, 3.26,4.506, 6.133,4.559c-2.247,1.761-5.078,2.81-8.154,2.81c-0.53,0-1.052-0.031-1.566-0.092 c 2.905,1.863, 6.356,2.95, 10.064,2.95c 12.076,0, 18.679-10.004, 18.679-18.68c0-0.285-0.006-0.568-0.019-0.849 C 30.007,8.548, 31.12,7.392, 32,6.076z"></path></g></svg> Share on Twitter</a></div></aside><aside id="jr-rtoc-p" class="hidden flexv"><div class="tb sk-htbar flexh"><div><a class="jr-p-close btn wsprkl">Done</a></div><div class="title-text f1">Table of Content</div></div><div class="cnt lol f1"><a href="/books/n/handbook/?report=reader">Title Information</a><a href="/books/n/handbook/toc/?report=reader">Table of Contents Page</a></div></aside><aside id="jr-help-p" class="hidden flexv"><div class="tb sk-htbar flexh"><div><a class="jr-p-close btn wsprkl">Done</a></div><div class="title-text f1">Settings</div></div><div class="cnt f1"><div id="jr-typo-p" class="typo"><div><a class="sf btn wsprkl">A-</a><a class="lf btn wsprkl">A+</a></div><div><a class="bcol-auto btn wsprkl"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 200 100" 
preserveAspectRatio="none"><text x="10" y="70" style="font-size:60px;font-family: Trebuchet MS, ArialMT, Arial, sans-serif" textLength="180">AUTO</text></svg></a><a class="bcol-1 btn wsprkl"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100" preserveAspectRatio="none"><path d="M15,25 85,25zM15,40 85,40zM15,55 85,55zM15,70 85,70z"></path></svg></a><a class="bcol-2 btn wsprkl"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100" preserveAspectRatio="none"><path d="M5,25 45,25z M55,25 95,25zM5,40 45,40z M55,40 95,40zM5,55 45,55z M55,55 95,55zM5,70 45,70z M55,70 95,70z"></path></svg></a></div></div><div class="lol"><a class="" href="/books/NBK21082/?report=classic">Switch to classic view</a><a href="/books/NBK21082/pdf/Bookshelf_NBK21082.pdf">PDF (317K)</a><a href="/books/n/handbook/pdf/">PDF (7.2M)</a><a href="/books/NBK21082/?report=printable">Print View</a></div></div></aside><aside id="jr-bkhelp-p" class="hidden flexv"><div class="tb sk-htbar flexh"><div><a class="jr-p-close btn wsprkl">Done</a></div><div class="title-text f1">Help</div></div><div class="cnt f1 lol"><a id="jr-helpobj-sw" data-path="/corehtml/pmc/jatsreader/ptpmc_3.22/" data-href="/corehtml/pmc/jatsreader/ptpmc_3.22/img/bookshelf/help.xml" href="">Help</a><a href="mailto:info@ncbi.nlm.nih.gov?subject=PubReader%20feedback%20%2F%20NBK21082%20%2F%20sid%3ACE8B5AF87C7FFCB1_0191SID%20%2F%20phid%3ACE8E60E27D5C01C10000000000CF00B6.4">Send us feedback</a><a id="jr-about-sw" data-path="/corehtml/pmc/jatsreader/ptpmc_3.22/" data-href="/corehtml/pmc/jatsreader/ptpmc_3.22/img/bookshelf/about.xml" href="">About PubReader</a></div></aside><aside id="jr-objectbox" class="thidden hidden"><div class="jr-objectbox-close wsprkl">✘</div><div class="jr-objectbox-inner cnt"><div class="jr-objectbox-drawer"></div></div></aside><nav id="jr-pm-left" class="hidden"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 40 800" preserveAspectRatio="none"><text font-stretch="ultra-condensed" x="800" y="-15" 
text-anchor="end" transform="rotate(90)" font-size="18" letter-spacing=".1em">Previous Page</text></svg></nav><nav id="jr-pm-right" class="hidden"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 40 800" preserveAspectRatio="none"><text font-stretch="ultra-condensed" x="800" y="-15" text-anchor="end" transform="rotate(90)" font-size="18" letter-spacing=".1em">Next Page</text></svg></nav><nav id="jr-fip" class="hidden"><nav id="jr-fip-term-p"><input type="search" placeholder="search this page" id="jr-fip-term" autocorrect="off" autocomplete="off" /><a id="jr-fip-mg" class="wsprkl btn" title="Find"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 550 600" preserveAspectRatio="none"><path fill="none" stroke="#000" stroke-width="36" stroke-linecap="round" style="fill:#FFF" d="m320,350a153,153 0 1,0-2,2l170,170m-91-117 110,110-26,26-110-110"></path></svg></a><a id="jr-fip-done" class="wsprkl btn" title="Dismiss find">✘</a></nav><nav id="jr-fip-info-p"><a id="jr-fip-prev" class="wsprkl btn" title="Jump to previous match">◀</a><button id="jr-fip-matches">no matches yet</button><a id="jr-fip-next" class="wsprkl btn" title="Jump to next match">▶</a></nav></nav></div><div id="jr-epub-interstitial" class="hidden"></div><div id="jr-content"><article data-type="main"><p class="vip-notice"><strong><a href="/books/n/handbook2e/?report=reader">See "The NCBI Handbook, 2nd Edition"</a></strong></p><p class="vip-notice retraction"><strong>This publication is provided for historical reference only and the information may be out of date.</strong></p><div class="main-content lit-style" itemscope="itemscope" itemtype="http://schema.org/CreativeWork"><div class="meta-content fm-sec"><div class="fm-sec"><h1 id="_NBK21082_"><span class="label">Chapter 13</span><span class="title" itemprop="name">The Processing of Biological Sequence Data at NCBI</span></h1><p class="contribs">Sirotkin K, Tatusova T, Yaschenko E, et al.</p><p class="fm-aai"><a href="#_NBK21082_pubdet_">Publication 
Details</a></p><p><em>Estimated reading time: 13 minutes</em></p></div></div><div class="jig-ncbiinpagenav body-content whole_rhythm" data-jigconfig="allHeadingLevels: ['h2'],smoothScroll: false" itemprop="text"><p>The biological sequence information that builds the foundation of NCBI's databases and
|
|
curated resources comes from many sources. How are these data managed and processed once they
|
|
reach NCBI? This chapter discusses the flow of sequence data, from the management of data
|
|
submission to the generation of publicly available data products. </p><div id="ch13.Overview"><h2 id="_ch13_Overview_">Overview</h2><p>The central dogma of molecular biology asserts that sequences flow from DNA to RNA to
|
|
protein. In Entrez, DNA and RNA sequences are retrieved together as nucleotides and then
|
|
integrated, along with proteins, into the NCBI system. Once in the system, nucleotides and
|
|
proteins are both available for public use in at least three ways: </p><dl class="temp-labeled-list"><dl class="bkr_refwrap"><dt>1.</dt><dd id="A2095"><p class="no_top_margin">The <a href="/Sitemap/index.html#Entrez" ref="pagearea=body&targetsite=external&targetcat=link&targettype=uri">Entrez
|
|
system</a> (<a href="/books/n/handbook/ch15/?report=reader">Chapter 15</a>) retrieves
|
|
nucleotide and protein sequences according to text queries that are entered into the
|
|
search box. Text queries can be followed by search fields, such as author, definition line, and organism (for example, "homo sapiens"[orgn]), and are used to further define
|
|
raw sequence data being used for retrieval.</p></dd></dl><dl class="bkr_refwrap"><dt>2.</dt><dd id="A2096"><p class="no_top_margin">The sequences themselves can be searched directly by using <a href="/BLAST/" ref="pagearea=body&targetsite=external&targetcat=link&targettype=uri">BLAST</a> (<a href="/books/n/handbook/ch16/?report=reader">Chapter 16</a>), which uses a sequence as a query
|
|
to find similar sequences. </p></dd></dl><dl class="bkr_refwrap"><dt>3.</dt><dd id="A2097"><p class="no_top_margin">Large subsets of sequences can be downloaded by <a href="/Sitemap/index.html#FTPSite" ref="pagearea=body&targetsite=external&targetcat=link&targettype=uri">FTP</a>.
|
|
</p></dd></dl></dl><p>There are many sources for both nucleotide and protein sequences. Sequences submitted
|
|
directly to GenBank (<a href="/books/n/handbook/ch1/?report=reader">Chapter 1</a>) or replicated
|
|
from one of our two collaborating databases, the European Molecular Biology Laboratory
|
|
(EMBL) Data Library and the DNA Data Bank of Japan (DDBJ), are the major sources. The
|
|
Reference Sequence collection (<a href="/books/n/handbook/ch18/?report=reader">Chapter 18</a>) and
|
|
the UniProt database, which incorporates data from SWISS-PROT, are yet additional sources. </p><p>An information management system that consists of two major components, the ID database and
|
|
the IQ database, underlies the submission, storage, and access of GenBank, BLAST, and other
|
|
curated data resources (such as the Reference Sequences (<a href="/books/n/handbook/ch18/?report=reader">Chapter 18</a>), the Map Viewer (<a href="/books/n/handbook/ch20/?report=reader">Chapter 20</a>), or Entrez Gene (<a href="/books/n/handbook/ch19/?report=reader">Chapter
|
|
19</a>)). Whereas ID handles incoming sequences and feeds other databases with subsets
|
|
to suit different needs, IQ holds links between sequences stored in ID and between these
|
|
sequences and other resources. </p><div id="ch13.Abstract_Syntax_Nota"><h3>Abstract Syntax Notation 1 (ASN.1) Is the Data Format Used by the ID System</h3><p>ASN.1 is the data description language in which all sequence data at NCBI are structured.
|
|
ASN.1 allows a detailed description of both the sequences and the information associated
|
|
with them, such as author names, source organism, and biological features (known as
|
|
“features”). The image below shows <span class="bk_pgobj">FEATURES</span> as displayed in GenBank format. </p><p>
|
|
<span class="graphic"><img src="/books/NBK21082/bin/ch13.chapt13a.jpg" alt="FEATURES as displayed in GenBank format" /></span>
|
|
</p><p>In the ASN.1 format, the organism information is presented as shown below. You can also
|
|
<a href="/entrez/viewer.fcgi?db=nucleotide&qty=1&c_start=1&list_uids=71106260&dopt=asn&dispmax=5&sendto=&from=begin&to=end&extrafeatpresent=1&ef_MGC=16" ref="pagearea=body&targetsite=external&targetcat=link&targettype=uri">see a complete ASN.1 record</a>.</p><pre>orgname { name binomial { genus "Macaca" , species "mulatta" } , </pre><p>Maintaining all data in the same structured format simplifies data parsing, manipulation,
|
|
and quality assurance, and eases the task of data integration and software development for
|
|
sequence analysis. All of the various divisions of GenBank can be downloaded in ASN.1 from
|
|
the <a href="ftp://ftp.ncbi.nih.gov" ref="pagearea=body&targetsite=external&targetcat=link&targettype=ftp">NCBI FTP site</a>. In the ID data management
|
|
system, data are stored as ASN.1 blobs, minimizing the amount of biological information
|
|
that is captured and updated in the relational database schema.</p><p>Similar to an XML DTD, ASN.1 has an associated file that contains the description of the
|
|
legal data structure. This file is called asn.all and is available as part of the
|
|
“C” toolkit in an archive named
|
|
“ncbi.tar.gz” located in the <a href="ftp://ftp.ncbi.nih.gov/toolbox/ncbi_tools" ref="pagearea=body&targetsite=external&targetcat=link&targettype=ftp">FTP directory</a>. When
|
|
unpacked, the directory “/demo”, found in the
|
|
“ncbi.tar.gz” archive, contains the asn.all file. In the same
|
|
“/demo” directory is testval.c, a tool that validates the data
|
|
against asn.all. Additionally, a set of utilities for producing ASN.1 while programming in
|
|
“C” is found in the subutil.c file of the
|
|
“/api” directory, which is unpacked from the same
|
|
“ncbi.tar.gz” archive.</p></div><div id="ch13.Sources_of_Sequence_"><h3>Sources of Sequence Data</h3><p>The sequence data available at NCBI come from many different sources (<a class="figpopup" href="/books/NBK21082/figure/ch13.F1/?report=objectonly" target="object" rid-figpopup="figch13F1" rid-ob="figobch13F1">Figure 1</a>). In
|
|
summary, the data consist of:</p><div class="iconblock whole_rhythm clearfix ten_col fig" id="figch13F1" co-legend-rid="figlgndch13F1"><a href="/books/NBK21082/figure/ch13.F1/?report=objectonly" target="object" title="Figure" class="img_link icnblk_img figpopup" rid-figpopup="figch13F1" rid-ob="figobch13F1"><img class="small-thumb" src="/books/NBK21082/bin/ch13.ch13f1Karl.gif" src-large="/books/NBK21082/bin/ch13.ch13f1Karl.jpg" alt="Figure 1" /></a><div class="icnblk_cntnt" id="figlgndch13F1"><h4 id="ch13.F1"><a href="/books/NBK21082/figure/ch13.F1/?report=objectonly" target="object" rid-ob="figobch13F1">Figure</a></h4><p class="float-caption no_bottom_margin">Figure 1. Sources of sequence data available at NCBI. </p></div></div><ul><li id="A2098" class="half_rhythm"><div>GenBank sequences (<a href="/books/n/handbook/ch1/?report=reader">Chapter 1</a>)</div></li><li id="A2099" class="half_rhythm"><div>Reference sequences (<a href="/books/n/handbook/ch18/?report=reader">Chapter 18</a>)</div></li><li id="A2100" class="half_rhythm"><div>sequences from other databases, such as SWISS-PROT, PIR, PRF, and PDB</div></li><li id="A2101" class="half_rhythm"><div>sequences from the United States patents</div></li></ul><p>The submission pathway depends on the data source (see <a class="figpopup" href="/books/NBK21082/figure/ch13.F1/?report=objectonly" target="object" rid-figpopup="figch13F1" rid-ob="figobch13F1">Figure 1</a>) and volume. HTGS and other large-volume submitters use FTP, usually
|
|
after converting their data to ASN.1 with tools such as tbl2asn. Small-volume submitters
|
|
typically use either BankIt (<a href="/books/n/handbook/ch1/?report=reader">Chapter 1</a>) or
|
|
Sequin (<a href="/books/n/handbook/ch12/?report=reader">Chapter 12</a>) to prepare the ASN.1 for
|
|
submission.</p><p>The data received are then subjected to some quality control by the submission tools
|
|
BankIt, Sequin, and fa2htgs. These tools have built-in validation mechanisms to check if
|
|
the data submitted have the correct structure and contain the essential information. The
|
|
work of the GenBank indexing staff, who uses Sequin, adds one more layer of quality
|
|
control and provides assistance to submitters. The staff also helps with the use of Sequin
|
|
for complex submissions.</p></div></div><div id="ch13.Data_Flow_Components"><h2 id="_ch13_Data_Flow_Components_">Data Flow Components</h2><div id="ch13.The_ID_Database"><h3>The ID Database</h3><p>The ID database is a group of standard relational databases that holds both ASN.1 objects
|
|
and sequence identifier-related information. ASN.1 objects follow the specifications in
|
|
the asn.all file for NCBI sequence data objects. ID holds data for GenBank and the many
|
|
databases in the Entrez system. Details of the architecture of relational ID databases and
|
|
the software associated with them are described <a href="#ch13.Data_Flow_Architectu">later in this chapter</a>. All of the sequences from
|
|
the International Nucleotide Sequence Database Collaboration (INSDC) are in GenBank, and
|
|
they all have Accession numbers assigned to them. Accession numbers point to sequences and
|
|
their associated biological information and annotation. </p><p>In the ID database, blobs are added into a single column of a relational database.
|
|
Although the columns behave as in a relational database, the information that makes each
|
|
blob, such as biological features, raw sequence data, and author information, is neither
|
|
parsed nor split out. In this sense, the ID database can be considered as a hybrid
|
|
database that stores complex objects. </p><p>Note: Blob stands for Binary Large Object (or binary data object) and refers to a large
|
|
piece of data, a large structured data object that can be stored as a unit and processed
|
|
by software that knows the structure. For more information, check the <a href="/books/n/handbook/A1237/?report=reader">Glossary</a>.</p></div><div id="ch13.Versions__GIs__Annot"><h3>Versions, GIs, Annotation Changes, and Takeovers</h3><p>Every time a change is made to a sequence, a new version of the sequence is produced.
|
|
This new version has a new GI number (GI or GenInfo Identifier is a sequence
|
|
identification number for a nucleotide sequence) assigned to it (<b>A</b> and
|
|
<b>B</b> in the image below). When a change is made to the annotation associated
|
|
with a sequence, a new blob is produced, but no new version or GI is assigned. This series
|
|
of events marks the history of the sequence since its first days in GenBank.</p><p>You can track annotation and sequence changes, as well as the
|
|
“takeover” of one record by another by using the Sequence Revision
|
|
History tool. The tool can be accessed from the side blue bar in Entrez Nucleotide and
|
|
Entrez Protein and is used to highlight differences in sequence versions and annotations.
|
|
To understand how the History tool works, let’s examine the <a href="/entrez/sutils/girevhist.cgi?val=AF123456" ref="pagearea=body&targetsite=external&targetcat=link&targettype=uri">history of the Gallus gallus doublesex and mab-3 related transcription factor 1
|
|
mRNA</a> (Accession <a href="/entrez/viewer.fcgi?db=nucleotide&val=6633795" ref="pagearea=body&targetsite=external&targetcat=link&targettype=uri">AF123456</a>), which was first added to GenBank March 20, 1999. </p><p>Click on <span class="bk_pgobj">Check sequence revision
|
|
history</span> in the blue side bar of Entrez Nucleotide or Entrez Protein to
|
|
be directed to the <span class="bk_pgobj">Sequence Revision
|
|
History</span> page. Enter the Accession or GI numbers or the FASTA-style
|
|
Sequence IDs (<span class="bk_pgobj">SeqIds</span>) into the
|
|
<span class="bk_pgobj">Find</span> box. The <span class="bk_pgobj">Revision history</span> for <a href="/nuccore/6633795" class="bk_tag" ref="pagearea=body&targetsite=entrez&targetcat=link&targettype=nuccore">AF123456</a> is
|
|
displayed.</p><p>
|
|
<span class="graphic"><img src="/books/NBK21082/bin/ch13.chapt13f3.jpg" alt="Sequence Revision History page showing the revision history for AF123456" /></span>
|
|
</p><p>The Update Date column (<b>C</b> in the image above) contains the date of every
|
|
update to <a href="/nuccore/6633795" class="bk_tag" ref="pagearea=body&targetsite=entrez&targetcat=link&targettype=nuccore">AF123456</a>. Some involve sequence changes, others involve only annotation changes.
|
|
Click on a date in the column to retrieve <a href="/nuccore/6633795" class="bk_tag" ref="pagearea=body&targetsite=entrez&targetcat=link&targettype=nuccore">AF123456</a> as it existed at that point in time.
|
|
The status column (<b>D</b>) reports which version is live and which ones are dead.
|
|
Columns I and II (<b>E</b>) are used to compare two different sequences. </p><p>Notice that on <span class="bk_pgobj">Mar 23 1999</span>, at
|
|
1:24 PM, a new ASN.1 blob was produced for Accession AF123456. However, no new GI number
|
|
(<b>A</b>) or version (<b>B</b>) was assigned because the changes were
|
|
limited to the annotation and biological features of the sequence, with no changes made to
|
|
the sequence data. On December 23, 1999, Accession <a href="/nuccore/6633795" class="bk_tag" ref="pagearea=body&targetsite=entrez&targetcat=link&targettype=nuccore">AF123456</a> gained a new GI
|
|
(<span class="bk_pgobj">6633795</span>) and version
|
|
(<span class="bk_pgobj">Version 2</span>) because in this
|
|
case a change was made to the sequence data. </p><p>Compare the two blobs produced on March 23, 1999 and December 23, 1999 to see the
|
|
difference between them. </p><ul><li id="A2102" class="half_rhythm"><div>Start by <a href="/entrez/sutils/girevhist.cgi?val=AF123456" ref="pagearea=body&targetsite=external&targetcat=link&targettype=uri">accessing the Revision history for AF123456</a>. </div></li><li id="A2103" class="half_rhythm"><div>Select one sequence in each column (I or II) as shown in the image above
|
|
(<b>E</b>). </div></li><li id="A2104" class="half_rhythm"><div>Push the <span class="bk_pgobj">Show</span> button at the
|
|
upper left of the page to display the two blobs (<b>G</b>).</div></li></ul><p>The differences between blobs are highlighted, with each blob displaying a different
|
|
color. Compare ASN.1 blobs produced on March 20, 1999 and March 23, 1999 and you will see
|
|
that the differences between the two are limited to the annotation and biological features
|
|
described in the blobs, whereas the sequence data remain the same. </p><p>The understanding of the biological features related to a sequence can change with or
|
|
without a change in the underlying genetic sequence. For example, <a href="/entrez/sutils/girevhist.cgi?val=J00179" ref="pagearea=body&targetsite=external&targetcat=link&targettype=uri">the sequence revision history of J00179</a> reveals that although the annotation
|
|
changed four times, there has been only one sequence version (<span class="bk_pgobj"><a href="/nuccore/183807" class="bk_tag" ref="pagearea=body&targetsite=entrez&targetcat=link&targettype=nuccore">J00179</a></span>) with one GI (<span class="bk_pgobj">183807</span>). <a href="/nuccore/183807" class="bk_tag" ref="pagearea=body&targetsite=entrez&targetcat=link&targettype=nuccore">J00179</a> can still be retrieved in
|
|
Entrez by searching its Accession or GI number, but this record has been replaced by
|
|
<a href="/entrez/sutils/girevhist.cgi?val=U01317" ref="pagearea=body&targetsite=external&targetcat=link&targettype=uri">Accession U01317</a> and therefore is no longer indexed. The version number
|
|
assigned to the “takeover” record <a href="/nuccore/455025" class="bk_tag" ref="pagearea=body&targetsite=entrez&targetcat=link&targettype=nuccore">U01317</a> is 1, whereas the
|
|
replaced version of this record (<a href="/nuccore/183807" class="bk_tag" ref="pagearea=body&targetsite=entrez&targetcat=link&targettype=nuccore">J00179</a>) remains as <span class="bk_pgobj">Version 0</span>. All sequences deposited before
|
|
February 1999 received no sequence version, which is why <a href="/nuccore/183807" class="bk_tag" ref="pagearea=body&targetsite=entrez&targetcat=link&targettype=nuccore">J00179</a> is version zero.
|
|
In February 1999, the use of a sequence version was implemented, and all sequences
|
|
deposited in GenBank at that time received a version number 1. Since then, ordinals
|
|
assigned to sequence versions have increased every time a change is made to the sequence
|
|
data. </p><p>The use of both systems, Version and GI, leads to two parallel ways of tracking sequence
|
|
versions for an object. In the GenBank flatfile, the Accession Version provides the
|
|
ordinal instance (version) of the sequence. Within ID, each unique sequence is assigned a
|
|
GI number; and therefore the instances of an Accession can be tracked by checking its
|
|
chain of GI numbers. Note that Accession and Accession Version are different things, with
|
|
the former being used to designate a DNA sequence of some molecule or piece of some
|
|
molecule deposited in GenBank and the latter to indicate the version of that sequence. A
|
|
single Accession can have many GIs that are assigned every time the sequence changes,
|
|
whereas an Accession Version has only one GI.</p><p>Within the ID relational databases, there is a chain identifier that can be used to link
|
|
these GI numbers. Not all sequences within ID are in GenBank and not all have sequence
|
|
versions, but all sequences have a chain of GI numbers. For this reason, internally, the
|
|
GI number is the universal pointer to a particular sequence, as opposed to the Accession
|
|
Version, which would work only for versioned sequences. The ID database is also the
|
|
controller for allowed “takeovers” of one Accession by another. In
|
|
the example above, GI 4454562 is taken over by GI 6633795. A takeover can also occur when
|
|
the sequences of two clones are merged into a single clone. One or several of the
|
|
Accessions of older clones can be taken over by a new Accession. </p></div><div id="ch13.Output_of_Data_from_"><h3>Output of Data from the ID System</h3><p>Once all incoming data have been converted to ASN.1 format and entered into ID, the data
|
|
are then replicated into several different servers and transformed into several different
|
|
formats (<a class="figpopup" href="/books/NBK21082/figure/ch13.F2/?report=objectonly" target="object" rid-figpopup="figch13F2" rid-ob="figobch13F2">Figure
|
|
2</a>). The replication is necessary for a number of reasons: (i) it separates the
|
|
“incoming” data system (ID) from the
|
|
“outgoing” data, which are the data used in response to scientific
|
|
queries by users; (ii) it helps balance the load of queries, thus providing quicker
|
|
response times and allowing different servers to specialize in different functions; and
|
|
(iii) it protects against data loss should one server fail. The details of the internal
|
|
structure of the ID system and how the structure is replicated are discussed in the <a href="#ch13.Data_Flow_Architectu">Data Flow Architecture</a> section. </p><div class="iconblock whole_rhythm clearfix ten_col fig" id="figch13F2" co-legend-rid="figlgndch13F2"><a href="/books/NBK21082/figure/ch13.F2/?report=objectonly" target="object" title="Figure" class="img_link icnblk_img figpopup" rid-figpopup="figch13F2" rid-ob="figobch13F2"><img class="small-thumb" src="/books/NBK21082/bin/ch13.ch13f2Karl.gif" src-large="/books/NBK21082/bin/ch13.ch13f2Karl.jpg" alt="Figure 2" /></a><div class="icnblk_cntnt" id="figlgndch13F2"><h4 id="ch13.F2"><a href="/books/NBK21082/figure/ch13.F2/?report=objectonly" target="object" rid-ob="figobch13F2">Figure</a></h4><p class="float-caption no_bottom_margin"> Figure 2. Products of the ID system. </p></div></div></div><div id="ch13.The_IQ_Database"><h3>The IQ Database</h3><p>The IQ database is a Sybase data-warehousing product that preserves its SQL language
|
|
interface but which inverts its data by storing it by column, not by row. Its strength is
|
|
in its ability to speed up results from queries based on the anticipated indexing. This
|
|
non-relational database holds links between many different objects.</p><p>For example, as part of the processing of incoming sequences, each protein and nucleotide
|
|
sequence is searched for similar sequences (<a href="/books/n/handbook/ch16/?report=reader">Chapter
|
|
16</a>) against the rest of the database. Users can then select the <span class="bk_pgobj">Related Sequences</span> link that is displayed next
|
|
to each record in Entrez Nucleotide and Entrez Protein (<a href="/books/n/handbook/ch15/?report=reader">Chapter 15</a>) to see a set of similar sequences, sometimes known as
|
|
“neighbors”. The IQ database keeps track of the neighbors for any
|
|
given sequence. These relationships are all pre-computed to save users’ time. </p><p>IQ stores the relationships between similar nucleotide sequences and between similar
|
|
protein sequences and which proteins are coded for by which nucleotides and also holds
|
|
information on the links between entries in different Entrez databases. This might
|
|
include, for example, information on the publications cited within sequence records, which
|
|
links to PubMed or to an organism in the Taxonomy database. Some of this information comes
|
|
from the analysis of the ASN.1 in ID by e2index, a tool that extracts terms from NCBI
|
|
sequence ASN.1 during “indexing” for Entrez. </p></div><div id="ch13.The_BLAST_Control_Da"><h3>The BLAST Control Database</h3><p>The BLAST Control database receives information from ID that is used to generate BLAST
|
|
databases (<a href="/books/n/handbook/ch16/?report=reader">Chapter 16</a>) for the BLAST query
|
|
service and for stand-alone BLAST users. The information is used internally to generate
|
|
the sequence neighbors stored in IQ.</p></div><div id="ch13.The_GenBank_Flatfile"><h3>The GenBank Flatfile and Error Capture Databases</h3><p>Many NCBI users think of the GenBank flatfile as the archetypal sequence data format (see
|
|
an <a href="/entrez/viewer.fcgi?db=nucleotide&val=21536375" ref="pagearea=body&targetsite=external&targetcat=link&targettype=uri">example of a GenBank flatfile</a>). However, within NCBI and especially within
|
|
the ID internal data flow system, ASN.1 is considered the original format from which
|
|
reports such as the GenBank flatfile can be generated (see an <a href="/entrez/viewer.fcgi?db=nucleotide&qty=1&c_start=1&list_uids=21536375&dopt=asn&dispmax=5&sendto=&from=begin&to=end&extrafeatpresent=1&ef_MGC=16" ref="pagearea=body&targetsite=external&targetcat=link&targettype=uri">example of an ASN.1 file</a>). </p><p>Although the GenBank flatfile is usually generated on demand from the ASN.1, for certain
|
|
products such as complete GenBank releases, a GenBank flatfile image is made for each
|
|
active sequence. This flatfile is stored in a database called FF4Release, which consists
|
|
of the latest transformation of ASN.1 to the GenBank flatfile format.</p><p>The FF4Release database is also a place where internal error reports are captured. The
|
|
reports can be analyzed and displayed for different time points in the data processing
|
|
pathway: </p><ul><li id="A2105" class="half_rhythm"><div>ASN.1 itself can be validated using the testval (or its replacement, asnval)
|
|
tool—syntax checking is not necessary, because the underlying ASN.1
|
|
libraries enforce proper syntax according to the definition file. </div></li><li id="A2106" class="half_rhythm"><div>Errors can be discovered during conversion to the GenBank flatfile format. </div></li><li id="A2107" class="half_rhythm"><div>Errors can also be discovered through a reparse from the GenBank flatfile format to ASN.1. This is done as a
|
|
further check of the legality of the ASN.1 and of our current software for producing
|
|
GenBank format reports from it.</div></li></ul></div><div id="ch13.Entrez_Postings_File"><h3>Entrez Postings Files</h3><p>When sequences are submitted to GenBank or one of our collaborating databases, additional
|
|
information about the sequence is often included. This might be a brief description of a
|
|
gene in the definition line, along with annotated sequence features such as the source
|
|
organism name. To make this information searchable via Entrez, these words have to be
|
|
indexed. They are extracted from the ASN.1 using e2index and then stored in the Entrez
|
|
posting files, which are optimized for Boolean queries by the Entrez system (see <a href="/books/n/handbook/ch15/?report=reader">Chapter 15</a>).</p><p>All of these products from the ID system are listed in <a class="figpopup" href="/books/NBK21082/table/ch13.T1/?report=objectonly" target="object" rid-figpopup="figch13T1" rid-ob="figobch13T1">Table 1</a>. NCBI also generates weekly
|
|
“LiveLists” for public, collaborator, and in-house use. LiveLists
|
|
show all Accession numbers currently in use. Accession numbers that have been replaced or
|
|
otherwise removed from circulation because of error or submitter request are not in the
|
|
LiveList.</p><div class="iconblock whole_rhythm clearfix ten_col table-wrap" id="figch13T1"><a href="/books/NBK21082/table/ch13.T1/?report=objectonly" target="object" title="Table" class="img_link icnblk_img figpopup" rid-figpopup="figch13T1" rid-ob="figobch13T1"><img class="small-thumb" src="/books/NBK21082/table/ch13.T1/?report=thumb" src-large="/books/NBK21082/table/ch13.T1/?report=previmg" alt="Table 1. Products of the ID system." /></a><div class="icnblk_cntnt"><h4 id="ch13.T1"><a href="/books/NBK21082/table/ch13.T1/?report=objectonly" target="object" rid-ob="figobch13T1">Table</a></h4><p class="float-caption no_bottom_margin">Table 1. Products of the ID system. </p></div></div></div></div><div id="ch13.Data_Flow_Architectu"><h2 id="_ch13_Data_Flow_Architectu_">Data Flow Architecture</h2><p>Sequences enter ID when a client (internal to NCBI) loads data into the system. The ASN.1
|
|
data can be loaded either through a stand-alone program or a client API. In both cases, the
|
|
data are submitted to ID through IDProdOS, an open server (commonly called
|
|
“middleware”) that sits between the clients and the database system.
|
|
An overview of the flow of sequence data through the ID architecture with its multiple
|
|
components is shown in <a class="figpopup" href="/books/NBK21082/figure/ch13.F3/?report=objectonly" target="object" rid-figpopup="figch13F3" rid-ob="figobch13F3">Figure 3</a> and discussed below.</p><div class="iconblock whole_rhythm clearfix ten_col fig" id="figch13F3" co-legend-rid="figlgndch13F3"><a href="/books/NBK21082/figure/ch13.F3/?report=objectonly" target="object" title="Figure" class="img_link icnblk_img figpopup" rid-figpopup="figch13F3" rid-ob="figobch13F3"><img class="small-thumb" src="/books/NBK21082/bin/ch13.ch13f3Karl.gif" src-large="/books/NBK21082/bin/ch13.ch13f3Karl.jpg" alt="Figure 3" /></a><div class="icnblk_cntnt" id="figlgndch13F3"><h4 id="ch13.F3"><a href="/books/NBK21082/figure/ch13.F3/?report=objectonly" target="object" rid-ob="figobch13F3">Figure</a></h4><p class="float-caption no_bottom_margin">Figure 3. The ID system architecture. </p></div></div><p>IDProdOS hides details of the underlying complexity from the client API, which was shown to
|
|
be useful when the previous version of the ID system (a single database and an open server)
|
|
was converted to the current system without requiring any changes to the clients. </p><p>IDProdOS does an initial check of the actions required by the load. For example, in a
|
|
record that has DNA and protein sequences, including annotation and sequence identifiers,
|
|
the identifier on the protein has to be unique. The same identifier should not be given to
|
|
an outdated DNA sequence and a current sequence, unless the current sequence has replaced
|
|
the old one. That’s because proteins, generally, are not allowed to move between
|
|
GenBank records, although proteins moving between segments of a complete genome submission
|
|
are sometimes allowed. </p><p>Additional checking is performed by stored procedures in the IdMain database. The details
|
|
of what is allowed vary according to the source of the ASN.1, which includes direct
|
|
submissions from collaborators and the NCBI RefSeq project. These procedures check (i) which
|
|
sequence identifiers may be used, (ii) which sequences may be replaced by which other
|
|
sequences, and (iii) which sequence version may be used in a record. </p><p>If the sequences pass all these checks, three things happen: (i) IDProdOS changes the SeqId
|
|
pointers in the blob to GI numbers, which are now used as sequence-specific pointers, (ii)
|
|
IdMain retains the sequence identifier information that was also used for the checking, and
|
|
(iii) IDProdOS loads the ASN.1 blobs to the blob satellites. </p><p>The IdMain database contains the sequence identifiers for each of the sequence records,
|
|
including all those for ASN.1 blobs that contain multiple sequences. It enforces sequence
|
|
version rules, among other rules.</p><p>Relational satellite databases are fully normalized databases that hold records for which
|
|
there is only one sequence per intended ASN.1 blob. Few, if any, features are allowed on
|
|
records intended for relational satellite databases (the PubSeqOS produces the ASN.1 by
|
|
converting the data extracted from relational tables). This contrasts with the Blob
|
|
satellite databases, from which ASN.1 is retrieved as-is. Blob satellite databases,
|
|
different from relational databases, contain ASN.1 objects as unnormalized data objects.</p><p>Recently, annotation-only satellite databases have been added to the ID system. These
|
|
satellites contain annotation to be added to Bioseqs, linked by GI number. Because there are
|
|
multiple such annotation satellite databases, more than one set of additional annotation may
|
|
be added to a Bioseq.</p><p>The SnpAnnot database contains feature information that is limited to simple mutation
|
|
information from dbSNP (<a href="/books/n/handbook/ch5/?report=reader">Chapter 5</a>). The CDD
|
|
Annotation database contains feature information that is limited to protein domains for the
|
|
protein sequences known to ID. In both cases, these features might be added to NCBI-curated
|
|
records by the PubSeqOS when the records are requested.</p><p>To visualize the role of replication, the rectangle in the middle of <a class="figpopup" href="/books/NBK21082/figure/ch13.F3/?report=objectonly" target="object" rid-figpopup="figch13F3" rid-ob="figobch13F3">Figure 3</a> represents the use of the Sybase Replication Server to copy
|
|
information from the loading side of the system to the query side.</p><p>Similar to IDProdOS, PubSeqOS is an open server (also called
|
|
“middleware”) that sits between the clients and the database system.
|
|
It hides details of the underlying complexity from the client API. It actually has an almost
|
|
identical code base as IDProdOS because they both serve similar functions. When a record is
|
|
requested in a format other than ASN.1, psansconvert is called to do the conversion. This
|
|
distinct <i>child</i> process both provides insulation from any possible instability
|
|
and allows for use of multiple central processing units (CPUs) in a natural way.</p><p>Note: The <i>child</i> process is a technical term used to describe a process
|
|
that is owned by and completely dependent on a parent process that initiated it.</p><p>At the query side are all records in Entrez, plus graveyards and EntrezControl, a special
|
|
database that is not queried by the public. EntrezControl is used to control the indexing of
|
|
blobs for Entrez. Its rows are initiated by a trigger that fires when rows are added by
|
|
replication to the IdMain database. A trigger is a special, database-stored procedure that
|
|
responds to changes in a database table. </p><p>The graveyards are databases that contain blobs that were replaced or taken over and
|
|
are therefore no longer indexed in Entrez. Once replaced or taken over, blobs do not
|
|
change—which is the reason why they are limited to the query
|
|
side—but they are still retrievable by GI or other sequence identifier. </p></div><div id="bk_toc_contnr"></div></div></div><div class="fm-sec"><h2 id="_NBK21082_pubdet_">Publication Details</h2><h3>Author Information and Affiliations</h3><p class="contrib-group"><h4>Authors</h4><span itemprop="author">Karl Sirotkin</span>, <span itemprop="author">Tatiana Tatusova</span>, <span itemprop="author">Eugene Yaschenko</span>, and <span itemprop="author">Mark Cavanaugh</span>.</p><h3>Publication History</h3><p class="small">Created: <span itemprop="datePublished">October 9, 2002</span>; Last Update: <span itemprop="dateModified">March 14, 2006</span>.</p><h3>Copyright</h3><div><div class="half_rhythm"><a href="/books/about/copyright/">Copyright Notice</a></div></div><h3>Publisher</h3><p><a href="https://www.ncbi.nlm.nih.gov/" ref="pagearea=page-banner&targetsite=external&targetcat=link&targettype=publisher">National Center for Biotechnology Information (US)</a>, Bethesda (MD)</p><h3>NLM Citation</h3><p>Sirotkin K, Tatusova T, Yaschenko E, et al. The Processing of Biological Sequence Data at NCBI. 2002 Oct 9 [Updated 2006 Mar 14]. In: McEntyre J, Ostell J, editors. The NCBI Handbook [Internet]. Bethesda (MD): National Center for Biotechnology Information (US); 2002-. 
Chapter 13.<span class="bk_cite_avail"></span></p></div><div class="small-screen-prev"><a href="/books/n/handbook/ch12/?report=reader"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100" preserveAspectRatio="none"><path d="M75,30 c-80,60 -80,0 0,60 c-30,-60 -30,0 0,-60"></path><text x="20" y="28" textLength="60" style="font-size:25px">Prev</text></svg></a></div><div class="small-screen-next"><a href="/books/n/handbook/ch14/?report=reader"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100" preserveAspectRatio="none"><path d="M25,30c80,60 80,0 0,60 c30,-60 30,0 0,-60"></path><text x="20" y="28" textLength="60" style="font-size:25px">Next</text></svg></a></div></article><article data-type="fig" id="figobch13F1"><div id="ch13.F1" class="figure bk_fig"><div class="graphic"><img data-src="/books/NBK21082/bin/ch13.ch13f1Karl.jpg" alt="Figure 1" /></div><div class="caption"><p>Figure 1. Sources of sequence data available at NCBI.</p></div></div></article><article data-type="fig" id="figobch13F2"><div id="ch13.F2" class="figure bk_fig"><div class="graphic"><img data-src="/books/NBK21082/bin/ch13.ch13f2Karl.jpg" alt="Figure 2" /></div><div class="caption"><p> Figure 2. Products of the ID system.</p></div></div></article><article data-type="fig" id="figobch13F3"><div id="ch13.F3" class="figure bk_fig"><div class="graphic"><img data-src="/books/NBK21082/bin/ch13.ch13f3Karl.jpg" alt="Figure 3" /></div><div class="caption"><p>Figure 3. The ID system architecture.</p></div></div></article><article data-type="table-wrap" id="figobch13T1"><div id="ch13.T1" class="table"><h3><span class="title">Table 1. 
Products of the ID system</span></h3><p class="large-table-link" style="display:none"><span class="right"><a href="/books/NBK21082/table/ch13.T1/?report=objectonly" target="object">View in own window</a></span></p><div class="large_tbl" id="__ch13.T1_lrgtbl__"><table class="no_margin"><thead><tr><th id="hd_h_ch13.T1_1_1_1_1" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;">Type</th><th id="hd_h_ch13.T1_1_1_1_2" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;">Source</th><th id="hd_h_ch13.T1_1_1_1_3" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;">ASN.1</th><th id="hd_h_ch13.T1_1_1_1_4" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;">GBFF<sup> a </sup></th><th id="hd_h_ch13.T1_1_1_1_5" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;">Qscore</th><th id="hd_h_ch13.T1_1_1_1_6" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;">GenPept</th><th id="hd_h_ch13.T1_1_1_1_7" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;">Protein FASTA</th></tr></thead><tbody><tr><td headers="hd_h_ch13.T1_1_1_1_1" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;">Cumulative</td><td headers="hd_h_ch13.T1_1_1_1_2" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;">GenBank</td><td headers="hd_h_ch13.T1_1_1_1_3" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;"> X</td><td headers="hd_h_ch13.T1_1_1_1_4" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;"></td><td headers="hd_h_ch13.T1_1_1_1_5" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;"> X</td><td headers="hd_h_ch13.T1_1_1_1_6" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;"> X</td><td headers="hd_h_ch13.T1_1_1_1_7" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;"> X</td></tr><tr><td headers="hd_h_ch13.T1_1_1_1_1" rowspan="1" colspan="1" 
style="text-align:left;vertical-align:middle;">Incremental</td><td headers="hd_h_ch13.T1_1_1_1_2" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;">GenBank</td><td headers="hd_h_ch13.T1_1_1_1_3" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;"> X</td><td headers="hd_h_ch13.T1_1_1_1_4" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;"></td><td headers="hd_h_ch13.T1_1_1_1_5" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;"> X</td><td headers="hd_h_ch13.T1_1_1_1_6" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;"> X</td><td headers="hd_h_ch13.T1_1_1_1_7" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;"> X</td></tr><tr><td headers="hd_h_ch13.T1_1_1_1_1" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;">Incremental</td><td headers="hd_h_ch13.T1_1_1_1_2" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;">GenBank<sup>b</sup></td><td headers="hd_h_ch13.T1_1_1_1_3" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;"></td><td headers="hd_h_ch13.T1_1_1_1_4" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;"> X</td><td headers="hd_h_ch13.T1_1_1_1_5" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;"> X</td><td headers="hd_h_ch13.T1_1_1_1_6" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;"></td><td headers="hd_h_ch13.T1_1_1_1_7" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;"></td></tr><tr><td headers="hd_h_ch13.T1_1_1_1_1" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;">Cumulative</td><td headers="hd_h_ch13.T1_1_1_1_2" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;">RefSeq</td><td headers="hd_h_ch13.T1_1_1_1_3" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;"> X</td><td headers="hd_h_ch13.T1_1_1_1_4" rowspan="1" colspan="1" 
style="text-align:left;vertical-align:middle;"> X</td><td headers="hd_h_ch13.T1_1_1_1_5" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;"></td><td headers="hd_h_ch13.T1_1_1_1_6" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;"> X</td><td headers="hd_h_ch13.T1_1_1_1_7" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;"> X</td></tr><tr><td headers="hd_h_ch13.T1_1_1_1_1" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;">Incremental</td><td headers="hd_h_ch13.T1_1_1_1_2" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;">RefSeq</td><td headers="hd_h_ch13.T1_1_1_1_3" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;"> X</td><td headers="hd_h_ch13.T1_1_1_1_4" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;"> X</td><td headers="hd_h_ch13.T1_1_1_1_5" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;"></td><td headers="hd_h_ch13.T1_1_1_1_6" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;"> X</td><td headers="hd_h_ch13.T1_1_1_1_7" rowspan="1" colspan="1" style="text-align:left;vertical-align:middle;"> X</td></tr></tbody></table></div><div class="tblwrap-foot"><div><dl class="temp-labeled-list small"><dl class="bkr_refwrap"><dt></dt><dd><div id="N0x1c45510N0x3a2d6b0"><p class="no_margin"><sup>a</sup> GBFF, GenBank flatfile; Qscore, sequencing quality score; GenPept,
|
|
GenBank Gene Products.</p></div></dd></dl><dl class="bkr_refwrap"><dt></dt><dd><div id="N0x1c45510N0x3a2d788"><p class="no_margin"><sup>b</sup> NCBI records only.</p></div></dd></dl></dl></div></div></div></article></div><div id="jr-scripts"><script src="/corehtml/pmc/jatsreader/ptpmc_3.22/js/libs.min.js"> </script><script src="/corehtml/pmc/jatsreader/ptpmc_3.22/js/jr.min.js"> </script></div></div>
|
|
|
|
|
|
|
|
|
|
<!-- Book content -->
|
|
|
|
<script type="text/javascript" src="/portal/portal3rc.fcgi/rlib/js/InstrumentNCBIBaseJS/InstrumentPageStarterJS.js"> </script>
|
|
|
|
|
|
<!-- CE8B5AF87C7FFCB1_0191SID /projects/books/PBooks@9.11 portal107 v4.1.r689238 Tue, Oct 22 2024 16:10:51 -->
|
|
<span id="portal-csrf-token" style="display:none" data-token="CE8B5AF87C7FFCB1_0191SID"></span>
|
|
|
|
<script type="text/javascript" src="//static.pubmed.gov/portal/portal3rc.fcgi/4216699/js/3968615.js" snapshot="books"></script></body>
|
|
</html>
|