nih-gov/www.ncbi.nlm.nih.gov/gap/docs/submissionguide/index.html

<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">

    <head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
        <!-- AppResources meta begin -->
        <meta name="paf-app-resources" content="" />
        <!-- AppResources meta end -->

        <!-- TemplateResources meta begin -->
        <meta name="paf_template" content="StdNCol" />

        <!-- TemplateResources meta end -->

        <!-- Page meta begin -->

        <!-- Page meta end -->

        <!-- Logger begin -->
        <meta xmlns:ncbi-portal="http://ncbi.gov/portal/XSLT/namespace" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" name="ncbi_app" content="dbgapdocs" /><meta xmlns:ncbi-portal="http://ncbi.gov/portal/XSLT/namespace" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" name="ncbi_pdid" content="static" />
        <!-- Logger end -->

        <title>dbGaP Study Submission Guide</title>

        <!-- PageFixtures headcontent begin -->


        <!-- PageFixtures headcontent end -->

        <!-- AppResources external_resources begin -->
        <script type="text/javascript" src="/core/jig/1.15.1/js/jig.min.js"></script>

        <!-- AppResources external_resources end -->

        <!-- Page headcontent begin -->
        <meta name="subsite" content="dbgap" />
<meta name="path" content="dbgap/docs/submissionguide" />
<meta name="modified" content="2025-01-08T15:08:23Z" />
        <!-- Page headcontent end -->
        <!-- PageFixtures resources begin -->
        <link xmlns="http://www.w3.org/1999/xhtml" type="text/css" rel="stylesheet" href="//static.pubmed.gov/portal/portal3rc.fcgi/4218191/css/4207974/4206132.css" xml:base="http://127.0.0.1/sites/static/header_footer" />

        <!-- PageFixtures resources end -->
    <link rel="shortcut icon" href="//www.ncbi.nlm.nih.gov/favicon.ico" /><meta name="ncbi_phid" content="CE8D5FFC7C8153E10000000000E800B6.m_5" />
<meta name='referrer' content='origin-when-cross-origin'/><link type="text/css" rel="stylesheet" href="//static.pubmed.gov/portal/portal3rc.fcgi/4176647/css/4121862/3974050/3917732/251717/4175140/14534/45193/3534283/4128070/4062871/4005757.css" /><link type="text/css" rel="stylesheet" href="//static.pubmed.gov/portal/portal3rc.fcgi/4176647/css/3529741/3529739.css" media="print" /></head>
    <body class=" static">
        <div class="grid">
            <div class="col twelve_col nomargin shadow">
                <!-- System messages like service outage or JS required; this is handled by the TemplateResources portlet -->
                <div class="sysmessages">
                    <noscript>
	<p class="nojs">
	<strong>Warning:</strong>
	The NCBI web site requires JavaScript to function.
	<a href="/guide/browsers/#enablejs" title="Learn how to enable JavaScript" target="_blank">more...</a>
	</p>
	</noscript>
                </div>
                <!--/.sysmessage-->
                <div class="wrap">
                    <div class="page">
                        <div xmlns:xi="http://www.w3.org/2001/XInclude">
    <div xmlns="http://www.w3.org/1999/xhtml" id="universal_header" xml:base="http://127.0.0.1/sites/static/header_footer">
	<section class="usa-banner">
		<div class="usa-accordion">
			<header class="usa-banner-header">
				<div class="usa-grid usa-banner-inner">
					<img src="https://www.ncbi.nlm.nih.gov/coreutils/uswds/img/favicons/favicon-57.png" alt="U.S. flag" />
					<p>An official website of the United States government</p>
					<button class="non-usa-accordion-button usa-banner-button" aria-expanded="false" aria-controls="gov-banner-top" type="button">
						<span class="usa-banner-button-text">Here's how you know</span>
					</button>
				</div>
			</header>
			<div class="usa-banner-content usa-grid usa-accordion-content" id="gov-banner-top" aria-hidden="true">
				<div class="usa-banner-guidance-gov usa-width-one-half">
					<img class="usa-banner-icon usa-media_block-img" src="https://www.ncbi.nlm.nih.gov/coreutils/uswds/img/icon-dot-gov.svg" alt="Dot gov" />
					<div class="usa-media_block-body">
						<p>
							<strong>The .gov means it's official.</strong>
							<br />
							Federal government websites often end in .gov or .mil. Before
							sharing sensitive information, make sure you're on a federal
							government site.
						</p>
					</div>
				</div>
				<div class="usa-banner-guidance-ssl usa-width-one-half">
					<img class="usa-banner-icon usa-media_block-img" src="https://www.ncbi.nlm.nih.gov/coreutils/uswds/img/icon-https.svg" alt="Https" />
					<div class="usa-media_block-body">
						<p>
							<strong>The site is secure.</strong>
							<br />
							The <strong>https://</strong> ensures that you are connecting to the
							official website and that any information you provide is encrypted
							and transmitted securely.
						</p>
					</div>
				</div>
			</div>
		</div>
	</section>
	<div class="usa-overlay"></div>
	<header class="ncbi-header" role="banner" data-section="Header">

		<div class="usa-grid">
			<div class="usa-width-one-whole">

				<div class="ncbi-header__logo">
					<a href="/" class="logo" aria-label="NCBI Logo" data-ga-action="click_image" data-ga-label="NIH NLM Logo">
						<img src="https://www.ncbi.nlm.nih.gov/coreutils/nwds/img/logos/AgencyLogo.svg" alt="NIH NLM Logo" />
					</a>
				</div>

				<div class="ncbi-header__account">
					<a id="account_login" href="https://account.ncbi.nlm.nih.gov" class="usa-button header-button" style="display:none" data-ga-action="open_menu" data-ga-label="account_menu">Log in</a>
					<button id="account_info" class="header-button" style="display:none" aria-controls="account_popup" type="button">
						<span class="fa fa-user" aria-hidden="true">
							<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="20px" height="20px">
								<g style="fill: #fff">
									<ellipse cx="12" cy="8" rx="5" ry="6"></ellipse>
									<path d="M21.8,19.1c-0.9-1.8-2.6-3.3-4.8-4.2c-0.6-0.2-1.3-0.2-1.8,0.1c-1,0.6-2,0.9-3.2,0.9s-2.2-0.3-3.2-0.9    C8.3,14.8,7.6,14.7,7,15c-2.2,0.9-3.9,2.4-4.8,4.2C1.5,20.5,2.6,22,4.1,22h15.8C21.4,22,22.5,20.5,21.8,19.1z"></path>
								</g>
							</svg>
						</span>
						<span class="username desktop-only" aria-hidden="true" id="uname_short"></span>
						<span class="sr-only">Show account info</span>
					</button>
				</div>

				<div class="ncbi-popup-anchor">
					<div class="ncbi-popup account-popup" id="account_popup" aria-hidden="true">
						<div class="ncbi-popup-head">
							<button class="ncbi-close-button" data-ga-action="close_menu" data-ga-label="account_menu" type="button">
								<span class="fa fa-times">
									<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 48 48" width="24px" height="24px">
										<path d="M38 12.83l-2.83-2.83-11.17 11.17-11.17-11.17-2.83 2.83 11.17 11.17-11.17 11.17 2.83 2.83 11.17-11.17 11.17 11.17 2.83-2.83-11.17-11.17z"></path>
									</svg>
								</span>
								<span class="usa-sr-only">Close</span></button>
							<h4>Account</h4>
						</div>
						<div class="account-user-info">
							Logged in as:<br />
							<b><span class="username" id="uname_long">username</span></b>
						</div>
						<div class="account-links">
							<ul class="usa-unstyled-list">
								<li><a id="account_myncbi" href="/myncbi/" class="set-base-url" data-ga-action="click_menu_item" data-ga-label="account_myncbi">Dashboard</a></li>
								<li><a id="account_pubs" href="/myncbi/collections/bibliography/" class="set-base-url" data-ga-action="click_menu_item" data-ga-label="account_pubs">Publications</a></li>
								<li><a id="account_settings" href="/account/settings/" class="set-base-url" data-ga-action="click_menu_item" data-ga-label="account_settings">Account settings</a></li>
								<li><a id="account_logout" href="/account/signout/" class="set-base-url" data-ga-action="click_menu_item" data-ga-label="account_logout">Log out</a></li>
							</ul>
						</div>
					</div>
				</div>

			</div>
		</div>
	</header>
	<div role="navigation" aria-label="access keys">
		<a id="nws_header_accesskey_0" href="https://www.ncbi.nlm.nih.gov/guide/browsers/#ncbi_accesskeys" class="usa-sr-only" accesskey="0" tabindex="-1">Access keys</a>
		<a id="nws_header_accesskey_1" href="https://www.ncbi.nlm.nih.gov" class="usa-sr-only" accesskey="1" tabindex="-1">NCBI Homepage</a>
		<a id="nws_header_accesskey_2" href="/myncbi/" class="set-base-url usa-sr-only" accesskey="2" tabindex="-1">MyNCBI Homepage</a>
		<a id="nws_header_accesskey_3" href="#maincontent" class="usa-sr-only" accesskey="3" tabindex="-1">Main Content</a>
		<a id="nws_header_accesskey_4" href="#" class="usa-sr-only" accesskey="4" tabindex="-1">Main Navigation</a>
	</div>
	<section data-section="Alerts">
		<div class="ncbi-alerts-placeholder"></div>
	</section>
</div>
</div>
                        <!--/.header-->
                        <div class="header">
                            <div class="res_logo"><h1 class="res_name"><a href="/gap/" title="dbGaP home">dbGaP</a></h1><h2 class="res_tagline">dbgap</h2></div>
                            <div class="search"><form method="get" action="/gap/"><div class="search_form"><label for="database" class="offscreen_noflow">Search database</label><select id="database"><optgroup label="Recent"><option value="gap" selected="selected">dbGaP</option><option value="clinvar">ClinVar</option><option value="medgen">MedGen</option><option value="books" class="last">Books</option></optgroup><optgroup label="All"><option value="gquery">All Databases</option><option value="assembly">Assembly</option><option value="biocollections">Biocollections</option><option value="bioproject">BioProject</option><option value="biosample">BioSample</option><option value="books">Books</option><option value="clinvar">ClinVar</option><option value="cdd">Conserved Domains</option><option value="gap">dbGaP</option><option value="dbvar">dbVar</option><option value="gene">Gene</option><option value="genome">Genome</option><option value="gds">GEO DataSets</option><option value="geoprofiles">GEO Profiles</option><option value="gtr">GTR</option><option value="ipg">Identical Protein Groups</option><option value="medgen">MedGen</option><option value="mesh">MeSH</option><option value="nlmcatalog">NLM Catalog</option><option value="nuccore">Nucleotide</option><option value="omim">OMIM</option><option value="pmc">PMC</option><option value="protein">Protein</option><option value="proteinclusters">Protein Clusters</option><option value="protfam">Protein Family Models</option><option value="pcassay">PubChem BioAssay</option><option value="pccompound">PubChem Compound</option><option value="pcsubstance">PubChem Substance</option><option value="pubmed">PubMed</option><option value="snp">SNP</option><option value="sra">SRA</option><option value="structure">Structure</option><option value="taxonomy">Taxonomy</option><option value="toolkit">ToolKit</option><option value="toolkitall">ToolKitAll</option><option value="toolkitbookgh">ToolKitBookgh</option></optgroup></select><div 
class="nowrap"><label for="term" class="offscreen_noflow" accesskey="/">Search term</label><div class="nowrap"><input type="text" name="term" id="term" title="Search dbGaP" value="" class="jig-ncbiclearbutton jig-ncbiautocomplete" data-jigconfig="isEnabled:false,disableUrl:'NcbiSearchBarAutoComplCtrl'" autocomplete="off" data-sbconfig="ds:'no',pjs:'no',afs:'yes'" /></div><button id="search" type="submit" class="button_search nowrap" cmd="go">Search</button></div></div></form><ul class=" inline_list searchlinks"><li>
                        <a href="/gap/advanced/">Advanced</a>
                    </li><li>
                        <a href="/gap/limits/">Limits</a>
                    </li></ul></div>

                        </div>
                        <div class="nav_and_browser">

</div>

                        <!-- was itemctrl -->
                        <div class="container">
                            <div id="maincontent" class="content col twelve_col last">
                                <div class="col1">
                                    <h1 id="dbgap-study-submission-guide">dbGaP Study Submission Guide</h1>


<h2 data-heading="h2" data-no-toc="true">You must register your study before submitting data.</h2>


<h2 data-heading="h2" data-no-toc="true"><a target="_blank" href="https://sharing.nih.gov/genomic-data-sharing-policy/submitting-genomic-data/how-to-register-and-submit-a-study-in-dbgap">Register study</a> --&gt; <a href="#astart">Prepare files for submission</a> --&gt; <a href="#qcpass">Check files before submission</a> --&gt; <a target="_blank" href="https://submit.ncbi.nlm.nih.gov/dbgap/">Submit</a> --&gt; <a href="#aprocessing">dbGaP curators process</a> --&gt; <a href="#aSRA">Receive signal and submit high throughput sequences: BAM, CRAM, FASTQ</a> --&gt; <a href="#apreview">Preview and Approve</a> --&gt; <a href="#arelease">Release</a></h2>


<h2 data-heading="h2" data-no-toc="true">Submission Onboarding</h2>


<p><a href="https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/GetPdf.cgi?document_name=HowToSubmit.pdf">dbGaP Submission Overview</a></p>


<p><a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/DataModels/HighLevelDataModel.pdf"><img src="/core/assets/dbgap/images/HighLevelDataModel_originalx128text.jpg" alt="dbGaP High Level Data Model (PDF)" /></a>

<a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/DataModels/dbGaPStudyComponents.pdf"><img src="/core/assets/dbgap/images/dbGaPStudyComponents_originalx128text.jpg" alt="dbGaP Study Components (PDF)" /></a>

<a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/DataModels/dbGaPRelationalIDs.pdf"><img src="/core/assets/dbgap/images/dbGaPRelationalIDs_originalx128text.jpg" alt="dbGaP Relational IDs (PDF)" /></a></p>


<p><a name="avideos" id="avideos"></a>
<strong>Videos: An Overview of the dbGaP Submission Process</strong><br />
<a href="https://youtu.be/P79c3gAWgP4">Part 1 - Register Your Study</a><br />
<a href="https://youtu.be/L1jOi0w9fwg">Part 2 - Submit Your Data</a><br />
<a href="https://youtu.be/2vJ0snPiTvI">Part 3 - Review and Release Your Study</a></p>


<h2 data-heading="h2" data-no-toc="true">What's new?</h2>


<ul>
<li>There are three new videos for an <a href="#avideos">overview of the dbGaP submission process</a>. (June 2024)</li>
<li>There is a new <a href="/gap/docs/login">Login Guide for dbGaP PIs and Submitters for dbGaP Submission System and Submission Portal</a>. (June 2024)</li>
<li>An enhanced <a href="https://www.ncbi.nlm.nih.gov/gap/advanced_search/">dbGaP Advanced Search</a> is now available for users to filter for third-party annotations of Common Data Elements, dbGaP Collections, sensitivity designations of Genomic Summary Results (GSR), and studies with External Data Sources (EDS). (August 2023)</li>
<li>We now have 12 dbGaP Collections with more in progress. A dbGaP Collection includes studies or portions of studies that share the same consent group, disease, or funding project. One Data Access Request (DAR) will provide you with the ability to request access to all the studies within a dbGaP Collection at once. To search for dbGaP Collections, visit <a href="https://www.ncbi.nlm.nih.gov/gap/advanced_search/?OBJ=study&amp;COND=%7B%22is_host_of_collection%22:%5B%22yes%22%5D%7D">Advanced Search</a>. For more information, please see the glossary entry "<a href="#collectiongloss">Collections</a>". (July 2023)</li>
<li>A new <a href="#sstrgloss">Subject Sample Telemetry Report (SSTR) webpage and API</a> are available to search and filter on study level Subject and Sample IDs, consents, summary counts, processing status, and molecular and sequence sample uses. See our <a target="_blank" href="https://ncbiinsights.ncbi.nlm.nih.gov/2023/04/27/dbgap-subject-sample-telemetry-report/">blog post</a> for more details. (April 2023)</li>
<li>Jump to "<a href="#newshistory">Previous Updates</a>"</li>
</ul>


<h2 data-heading="h2" data-no-toc="true">Use the questions below to jump to relevant sections or use your browser's find function to search for keywords.</h2>


<div class="toc">
<ul>
<li><a href="#1-what-files-do-i-need-to-submit">1. What files do I need to submit to dbGaP?</a></li>
<li><a href="#2-where-can-i-download-dbgap-sub">2. Where can I download dbGaP Submission Guide Templates to generate the files I need to submit?</a></li>
<li><a href="#3-what-is-the-study-config">3. What is the Study Config?</a></li>
<li><a href="#4-what-is-a-dbgap-subject">4. What is a dbGaP Subject?</a></li>
<li><a href="#5-what-is-a-dbgap-sample">5. What is a dbGaP Sample?</a></li>
<li><a href="#6-what-do-i-need-to-know-about-p">6. What do I need to know about protecting study participants' privacy, HIPAA, and subject de-identification for dbGaP data submissions?</a></li>
<li><a href="#7-what-is-a-phenotype-dataset-ds">7. What is a Phenotype Dataset (DS) File?</a></li>
<li><a href="#8-what-is-a-phenotype-data-dicti">8. What is a Phenotype Data Dictionary (DD) File?</a></li>
<li><a href="#9-how-do-i-create-subject-consen">9. How do I create Subject Consent (SC) DS and DD files?</a></li>
<li><a href="#10-how-do-i-create-subject-sampl">10. How do I create Subject Sample Mapping (SSM) DS and DD files?</a></li>
<li><a href="#11-how-do-i-create-pedigree-ds-a">11. How do I create Pedigree DS and DD files?</a></li>
<li><a href="#12-what-data-must-be-included-in">12. What data must be included in the Subject Phenotypes and Sample Attributes?</a></li>
<li><a href="#13-how-do-i-create-subject-pheno">13. How do I create Subject Phenotypes DS and DD files?</a></li>
<li><a href="#14-how-do-i-create-sample-attrib">14. How do I create Sample Attributes DS and DD files?</a></li>
<li><a href="#15-how-do-i-submit-medical-image">15. How do I submit Medical Images and in what format?</a></li>
<li><a href="#16-how-do-i-verify-that-my-ds-an">16. How do I verify that my DS and DD Files will pass dbGaP's phenotype quality control (QC) tests?</a></li>
<li><a href="#17-what-type-of-study-documents-">17. What type of Study Documents may I submit and in what format?</a></li>
<li><a href="#18-what-should-i-know-about-edit">18. What should I know about editing, proofreading, and copyright?</a></li>
<li><a href="#19-how-do-i-submit-molecular-dat">19. How do I submit Molecular Data to dbGaP?</a></li>
<li><a href="#20-how-do-i-submit-high-throughp">20. How do I submit High Throughput Sequence data and alignment information?</a></li>
<li><a href="#21-how-do-i-submit-copy-number-v">21. How do I submit Copy Number Variation (CNV) data?</a></li>
<li><a href="#22-how-do-i-link-individual-stud">22. How do I link individual study subjects/samples to samples that have been submitted to NCBI databases: GEO, GenBank, SRA (public)?</a></li>
<li><a href="#23-what-are-association-analysis">23. What are Association Analysis Data Files and how should they be formatted?</a></li>
<li><a href="#24-who-can-submit-files-to-dbgap">24. Who can submit files to dbGaP?</a></li>
<li><a href="#25-where-do-i-submit-my-dbgap-fi">25. Where do I submit my dbGaP files?</a></li>
<li><a href="#26-what-if-there-are-errors-or-u">26. What if there are errors or updates in the data and I need to resubmit?</a></li>
<li><a href="#27-what-happens-once-i-submit-my">27. What happens once I submit my core data files and phenotype files to the dbGaP database?</a></li>
<li><a href="#28-when-and-what-will-be-release">28. When and what will be released?</a></li>
<li><a href="#29-whom-may-i-contact-with-quest">29. Whom may I contact with questions about my dbGaP data submission?</a></li>
<li><a href="#30-how-can-i-submit-additional-d">30. How can I submit additional data after my study is released?</a></li>
<li><a href="#glossary-of-terms">GLOSSARY OF TERMS</a></li>
<li><a href="#appendix-for-data-dictionary-dd-">APPENDIX for Data Dictionary (DD) File Descriptions and Specifications</a></li>
</ul>
</div>


<h2 data-heading="h2" data-no-toc="true">Prepare Files for Submission</h2>


<p><a name="astart" id="astart"></a></p>


<h3 id="1-what-files-do-i-need-to-submit">1. What files do I need to submit to dbGaP?</h3>


<p>When a study is registered by a <a href="#gpagloss">Genomic Program Administrator (GPA)</a> in the dbGaP <a href="#ssgloss">Submission System (SS)</a>, the GPA indicates what data is expected to be submitted. This may be verified by the Program Officer (PO) who oversees the study funding. The submitter will separately complete a <a href="#sdogloss">Study Data Outline (SDO)</a> through the <a href="#spgloss">Submission Portal (SP)</a>. This outline summarizes the data that will be uploaded and <a href="#arelease">released</a> in the current version. All data claimed in the SDO must be submitted. The data submission should adhere to the expectations of the <a href="https://sharing.nih.gov/genomic-data-sharing-policy/submitting-genomic-data/data-submission-and-release-expectations">Genomic Data Sharing (GDS) policy</a>.</p>


<p><a name="achecklist" id="achecklist"></a></p>


<h2 data-heading="h2" data-no-toc="true">File Submission Checklist</h2>


<p>All new study versions must complete the <strong><a href="#sdogloss">Study Data Outline</a></strong> in the <a href="https://submit.ncbi.nlm.nih.gov/dbgap/">Submission Portal</a> in order to assert what data types will be submitted and released for the current study version. Upon completion, a dbGaP study accession (<a href="#phsgloss">phs######.v#.p#</a>) will be provided.</p>


<p>Complete the <strong>Study Config</strong> web form. This will populate the public study report page.</p>


<ul>
<li><strong><a href="#aconfig">Study Config</a></strong> </li>
</ul>


<p>For the remaining data, please submit only the files that have been asserted in the <a href="#sdogloss">Study Data Outline</a>. To determine which files are applicable, go through the File Applicability section immediately following this list.</p>


<ul>
<li><strong>Phenotype Dataset (DS) and Data Dictionary (DD) files</strong><ul>
<li>(1) <a href="#asc">Subject Consent DS and DD</a></li>
<li>(1) <a href="#assm">Subject Sample Mapping (SSM) DS and DD</a></li>
<li>(1) <a href="#aped">Pedigree DS and DD</a></li>
<li>(1 or more) <a href="#crucialdata">Subject Phenotypes DS and DD</a></li>
<li>(1 or more) <a href="#asampattr">Sample Attributes DS and DD</a></li>
<li>(1 or more) <strong><a href="#ncbidb">Linking Subject/Sample IDs to samples in other NCBI databases DS and DD</a></strong> </li>
</ul>
</li>
<li><strong><a href="#ageno">Molecular Data</a></strong></li>
<li><strong><a href="#aSRA">Sequence Data</a></strong></li>
<li><strong><a href="#apha">Association Analysis Data</a></strong></li>
<li><strong><a href="#aphd">Study Documents</a></strong></li>
<li><strong><a href="#medimage">Medical Images</a></strong></li>
</ul>


<p>For faster processing time, submit to the dbGaP <a href="#spgloss">Submission Portal</a> by uploading all files in one submission. DO NOT submit BAM, CRAM, FASTQ files until notified.</p>


<h2 data-heading="h2" data-no-toc="true">File Applicability</h2>


<p><strong>Phenotype Dataset (DS) and Data Dictionaries (DD)</strong></p>


<ol>
<li>Studies that have consented subjects must submit a Subject Consent DS and DD.</li>
<li>Studies that have individual level phenotype data (demographic, clinical, exposure, etc.) should submit 1 or more Subject Phenotypes DS and DD.</li>
<li>Studies that have molecular data (array, methylation, called variants, etc.) and/or high throughput sequence data (BAM, CRAM, FASTQ) must submit a Subject Sample Mapping DS and DD and 1 or more Sample Attributes DS and DD.</li>
<li>Studies that have self-reported or known genetic relationships and monozygotic twins must submit a Pedigree DS and DD.</li>
<li>Studies that have individual subject/sample IDs submitted to NCBI databases (GEO, GenBank, or public SRA) should provide a Linking DS and DD that maps the individual study subject/sample IDs to the other NCBI database sample accessions. If only experiment or project accessions are available, then provide them via the Study Config web form and mark "no" for "Subject/Sample ID links to public NCBI databases".</li>
</ol>


<p><strong>Molecular Data</strong></p>


<p>Any GWAS, SNP array, imputations, transcriptomic, epigenomic, gene expression, variant calls from WGS, WXS, and targeted sequencing data. This does not include raw sequencing data and alignment information, which is submitted separately.</p>


<p><strong>Sequence Data</strong></p>


<p>Any high throughput sequence data (WGS, WXS, RNA-Seq, etc.) in BAM, CRAM, FASTQ formats. Sequence data should be submitted only after: 1) You have received an email with an attached sequence metadata file containing the registered subject and sample IDs, and consents. This process ensures that submitted sequences are tied to sample IDs that belong to consented subjects. 2) The sequence metadata has been processed and you have received an email to upload sequences.</p>


<p><strong>Association Analyses</strong></p>


<p>Any aggregated genomic level data.</p>


<p><strong>Study Documents</strong></p>


<p>Any consent forms, protocols, questionnaires, etc. that correspond to the data.</p>


<p><strong>Medical Images</strong></p>


<p>Any CT scans, eye images, etc.</p>


<p><a name="asgtemplates" id="asgtemplates"></a></p>


<h3 id="2-where-can-i-download-dbgap-sub">2. Where can I download dbGaP Submission Guide Templates to generate the files I need to submit?</h3>


<p>Download all Submission Templates: <a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/dbGaP_Submission_Package_20250108.zip">dbGaP_Submission_Package_20250108.zip</a></p>


<p>Download individual Submission Templates: <a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/">https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/</a></p>


<h2 data-heading="h2" data-no-toc="true">Study Config</h2>


<p><a name="aconfig" id="aconfig"></a></p>


<h3 id="3-what-is-the-study-config">3. What is the Study Config?</h3>


<p>The <strong>Study Config</strong> is a web form that collects a description of the study data, methods, and findings, inclusion/exclusion, study history, references, attributions, and terms that will be indexed to enable users to search for your study in <a href="https://www.ncbi.nlm.nih.gov/gap/advanced_search/">dbGaP Advanced Search</a>. The study config must be submitted in order to have a <a href="#phsgloss">dbGaP study accession</a> (phs######.v#.p#) that can be published in dbGaP and used in journal publications. Here is an example of the study report page populated by the information in the study config: (<a href="https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs000001.v3.p1"><strong>https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs000001.v3.p1</strong></a>). </p>


<p>To fill out the study config, go to your study's dbGaP Submission Portal (<a href="https://submit.ncbi.nlm.nih.gov/dbgap/">https://submit.ncbi.nlm.nih.gov/dbgap/</a>). </p>


<ul>
<li>Click on "Create" if newly filling out the study config or click on "Edit" to modify an existing study config. </li>
<li>Once done, press "Submit" and you will be taken back to the study's Submission Portal page.</li>
<li>To preview the study config, click on "Preview Study Report Page". </li>
</ul>


<p>You may edit the study config until the study is released. To edit, go to your study's Submission Portal, click on "Edit" under "Study Config". Once the study is released, please contact your phenotype curator to make edits. </p>


<p>If you would like to see in advance what items will be collected in the web form, open <strong><a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Phenotype_Data/1_StudyConfig.docx">1_StudyConfig.docx</a></strong>.</p>


<h2 data-heading="h2" data-no-toc="true">Study Participant De-identification</h2>


<h3 id="4-what-is-a-dbgap-subject">4. What is a dbGaP Subject?</h3>


<p>A dbGaP Subject is defined as a single human person/individual/patient that arises from a single germline. Each subject should be submitted with a single, unique, de-identified subject ID. Subjects submitted to dbGaP must be <a href="#consentgloss">consented</a> to submit to a public database. Subject IDs should be an integer or string value. Integers should not have zero padding. IDs should not have spaces. Specifically, only the following characters can be included in the ID: English letters, Arabic numerals, period (.), hyphen (-), underscore (_), at symbol (@), and the pound sign (#). Once a variable name for the subject ID has been chosen, please use the same variable name throughout all the phenotype files for consistency. For example, please do not use SUBJECT_ID in one file and INDIVIDUAL_ID in another file. Please also do not use "dbGaP" in your submitted ID name, since dbGaP will assign a dbGaP subject ID that will be included in the final dump files along with the submitted subject ID. Subjects that are known to be the same person across dbGaP studies will be assigned the same dbGaP subject ID.</p>


<h3 id="5-what-is-a-dbgap-sample">5. What is a dbGaP Sample?</h3>


<p>A dbGaP Sample is defined as the ID of the final preps submitted to dbGaP by a genotyping center, a sequencing group, or to an NCBI resource, such as GEO or GenBank. A single subject may be mapped to multiple samples, but a single sample should not be mapped to multiple subjects unless the samples are pooled.* For example, if one subject (SUBJECT_ID) provided one sample, and that sample was processed to generate 2 sequencing runs or 1 sequencing and 1 genotyping array run, the data file would show two rows, both using the same subject ID, but having 2 unique sample IDs.</p>


<p>*Please inquire about pooled samples if applicable. This would only apply to pooled samples that belong to consented subjects. If the samples are pooled from controls that are publicly available, there is no need for marking the pooled samples, and a single sample ID may be assigned.</p>


<p>Each sample should be submitted with a single, unique, de-identified sample ID. Sample IDs should be an integer or string value. Integers should not have zero padding. IDs should not have spaces. Specifically, only the following characters can be included in the ID: English letters, Arabic numerals, period (.), hyphen (-), underscore (_), at symbol (@), and the pound sign (#). Once a variable name for the sample ID has been chosen, please use the same variable name throughout all the phenotype files for consistency. For example, please do not use SAMPLE_ID in one file and SAMPLE_NAME in another file. Please also do not use "dbGaP" in your submitted ID name, since dbGaP will assign a dbGaP sample ID that will be included in the final dump files along with the submitted sample ID.</p>


<h3 id="6-what-do-i-need-to-know-about-p">6. What do I need to know about protecting study participants' privacy, HIPAA, and subject de-identification for dbGaP data submissions?</h3>


<p>To comply with HIPAA, personally identifying information must be removed from all data, e.g. names, cities, dates, telephone numbers, social security numbers, and any other potentially identifying information, characteristic, or code. </p>


<p><a name="twostepdeid" id="twostepdeid"></a>
A 2-Step de-identification is required for all IDs submitted in dbGaP data files.</p>


<p><span style="color:red"><strong>Example: Two step removal of identifiers</strong></span></p>


<p><span style="color:red"><strong>Step one</strong>: Personal Information → Remove identifiers → Create Study person ID</span></p>


<p><span style="color:red"><strong>Step two</strong>: Study person ID → Create Subject ID submitted to dbGaP.</span></p>


<p>Subject IDs submitted to dbGaP may be randomly assigned or may be consecutive numbers without any identifying information (i.e., the submitted Subject ID should not be based on the study person ID or any personal identifiers such as subject's birth date, health record number, or name). The same applies to sample IDs.</p>


<p>Dates directly tied to an individual that are more specific than a year cannot be submitted. In other words, the month and day should be removed and only the year should be kept. Alternatively, the date can be normalized to days relative to a set point. For the algorithm dbGaP uses to find HIPAA sensitive dates, see glossary entry: <a href="#hipaagloss">HIPAA</a>.</p>


<p>There may be HIPAA sensitive data inherent to a study, such as cities (ex. Framingham) and small populations (ex. Hutterites) that are shown on dbGaP pages. For ages over 89, the individual level data will only be accessible to Authorized Access users, while the public variable summary will winsorize all ages over 89. For other extreme values, since HIPAA does not specify particular cut-off values, value distribution curves are checked and extreme values are hidden case by case.</p>


<p>The NIH Data Management and Sharing Policy published a Supplemental Information to the NIH Policy for Data Management and Sharing: Protecting Privacy When Sharing Human Research Participant Data (<a href="https://grants.nih.gov/grants/guide/notice-files/NOT-OD-22-213.html">NOT-OD-22-213</a>).</p>


<p><a name="apheno" id="apheno"></a></p>


<h2 data-heading="h2" data-no-toc="true">Phenotype Dataset (DS) and Data Dictionary (DD) Files</h2>


<p>These files are referred to as phenotype datasets and data dictionaries because they are curated by the phenotype curator.</p>


<h3 id="7-what-is-a-phenotype-dataset-ds">7. What is a Phenotype Dataset (DS) File?</h3>


<p>A Dataset (DS) file is a rectangular table of data values, subject/sample IDs, and variables, to be submitted either in .txt or .xlsx format, with .txt being the preferred format. There are 5 types of datasets required for submission:</p>


<ol>
<li><a href="#asc">Subject Consent (SC)</a> DS – 1 file only per study. This is a list of subjects (person), their <a href="#aconsent">consents</a>, and biological sex.</li>
<li><a href="#assm">Subject Sample Mapping (SSM)</a> DS – 1 file only per study. This is a list of subjects (person) mapped to their samples submitted as molecular data and high throughput sequence data.</li>
<li><a href="#aped">Pedigree</a> DS – 1 file only per study if there are self-reported or known genetic relationships.</li>
<li><a href="#crucialdata">Subject Phenotypes</a> DS – 1 or more files per study. This is person-level phenotypes.</li>
<li><a href="#crucialdata">Sample Attributes</a> DS – 1 or more files per study. This is sample-level attributes.</li>
</ol>


<p>Required if applicable: Sample Mapping to other NCBI databases (e.g. Trace, GEO, GenBank, public SRA) – 1 or more files per study.</p>


<p>Each <u>column</u> represents a single phenotypic variable. Row # 1 (column headers) of a data file will contain only the variable names.</p>


<p>Each <u>row</u> contains phenotypes of one Subject or attributes of one Sample. Following the first row (column headers), each subsequent row will reflect data of one subject or sample, depending on the type of file. </p>


<h3 id="8-what-is-a-phenotype-data-dicti">8. What is a Phenotype Data Dictionary (DD) File?</h3>


<p>A Data Dictionary (DD) file is a table that defines and describes the variables in the corresponding dataset file (DS). It should be submitted in either .txt or .xlsx format, with .xlsx being the preferred format. Each dataset (DS) file must be submitted with a corresponding DD file. You may review a complete list of <strong>data dictionary descriptions and specifications,</strong> including those required in your DD file in the <a href="#appdx">APPENDIX</a>.</p>


<p>The required columns and specifications for a DD File are:</p>


<p>Column 1: <strong>VARNAME</strong> – variable name. Best if the varname reflects the measurements taken (e.g. HDL_am, ALCOHOL_day, TREATMENT_tamoxi). Do not use "dbGaP" in the variable name.</p>


<p>Column 2: <strong>VARDESC</strong> – variable description. Be specific so that it is clear what you have measured. For example, "blood pressure" is useful, but "brachial blood pressure while sitting" is more informative. Alternatively, submit study documents with details of data collection — dbGaP will link appropriate document sections to variables. For the AFFECTION_STATUS, please fill the disease name in the VARDESC.</p>


<p>Column 3: <strong>UNITS</strong> – units of measurement. If there are no units, leave the entry blank. If none of the variables have units, the UNITS column may be omitted.</p>


<p>Last set of columns: <strong>VALUES</strong> – encoded values with definitions to describe the codes used in the DS. Enter a single value per cell; do not combine multiple values in one cell. See <a href="#avalues">VALUES</a> in APPENDIX for full requirement details.</p>


<p>Example:</p>


<table>
<thead>
<tr>
<th><em>Last column with header</em></th>
<th><em>Leave header blank</em></th>
<th><em>Leave header blank</em></th>
<th><em>Leave header blank</em></th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>VALUES</strong></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>10=Elementary</td>
<td>20=High School</td>
<td>40=College</td>
<td>4=Graduate School</td>
</tr>
<tr>
<td>1=2-4 drinks per day</td>
<td>2=5-7 drinks per day</td>
<td>3=&gt;7 drinks per day</td>
<td></td>
</tr>
</tbody>
</table>


<h2 data-heading="h2" data-no-toc="true">Study Meta DS and DD Files: Subject Consent, Subject Sample Mapping (SSM), and Pedigree</h2>


<p><a name="asc" id="asc"></a></p>


<h3 id="9-how-do-i-create-subject-consen">9. How do I create Subject Consent (SC) DS and DD files?</h3>


<p>The Subject Consent (SC) DS contains a comprehensive list of all unique de-identified subject IDs, their assigned consent group, and biological sex value. Open the templates under <strong><a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Phenotype_Data">Phenotype_Data</a></strong>:<br />
<strong><a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Phenotype_Data/2a_SubjectConsent_DS.txt">2a_SubjectConsent_DS.txt</a></strong><br />
<strong><a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Phenotype_Data/2b_SubjectConsent_DD.xlsx">2b_SubjectConsent_DD.xlsx</a></strong></p>


<p>The 2 variables required for the DS File are <strong>SUBJECT_ID</strong> and <strong>CONSENT</strong>.</p>


<p><strong>Column 1: SUBJECT_ID</strong></p>


<p>The first column must be the <a href="#twostepdeid">de-identified IDs</a> of the subjects. Enter a single de-identified subject ID for each person, and preferably use "SUBJECT_ID" as the subject ID header. A person should not have multiple SUBJECT_IDs. If necessary, you may use another variable name (but be consistent in all study files). Please do not use "dbGaP" in the variable name or the ID itself. See <a href="#subjidgloss">SUBJECT_ID</a> in Glossary for full requirement details. dbGaP will assign a study repository aka namespace to every study. The repository/namespace + submitted SUBJECT_ID  will be assigned a dbGaP generated subject ID.</p>


<p>IDs listed in the SUBJECT_ID column must include:</p>


<ol>
<li>All consented de-identified subject IDs with submitted phenotype</li>
<li>All consented de-identified subject IDs with molecular data (e.g. genotypes, high throughput sequences, GEO)</li>
<li>Unconsented pedigree members used for linking purposes only (without submitted data)</li>
<li>Unconsented HapMap subjects used as controls or other publicly available controls with unrestricted use in genotype data  </li>
</ol>


<p><a name="aconsent" id="aconsent"></a>
<strong>Column 2: CONSENT</strong></p>


<p>The second column must be the <a href="#consentgloss">consents</a> of the subjects. Enter a single consent value for each person using an integer (1,2,3…) encoded in the DD. <strong>The DD consents must exactly match the consents registered in the <a href="#ssgloss">Submission System</a> (SS), including modifiers. If they do not match, we cannot process your study.</strong> If you are a submitter and do not have access to the SS, you can see the consent groups in the <a href="https://submit.ncbi.nlm.nih.gov/dbgap/">dbGaP Submission Portal</a> for your study by clicking "View consent group" in the box on the upper right. For questions regarding the registered consent groups and Data Use Limitations (DUL), please contact your GPA. For unconsented pedigree linking members or publicly available controls with unrestricted use (including HapMaps), set CONSENT=0. Aside from the aforementioned controls with unrestricted use, no other samples may belong to CONSENT=0 individuals. See <a href="#consentgloss">CONSENT</a> in Glossary for full requirement details.</p>


<p>Do not include CONSENT code 0 in the corresponding DD. dbGaP will automatically add the consent 0 code in the DD. It will be listed as "0=Subjects used as genotyping controls and/or pedigree linking members" (no quotes). For all other consent groups &gt; 0, use the format: <strong>code=Consent Group's Title (Consent Group's Abbreviation)</strong>. For example, here is what a study with 2 consent groups might look like in the DD.</p>


<table>
<thead>
<tr>
<th><em>Last column with header</em></th>
<th><em>Leave header blank</em></th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>VALUES</strong></td>
<td></td>
</tr>
<tr>
<td>1=General Research Use (NPU) (GRU-NPU)</td>
<td>2=Health/Medical/Biomedical (GSO) (HMB-GSO)</td>
</tr>
</tbody>
</table>


<p><strong>Column 3: SEX</strong></p>


<p>Provide the biological sex value of the person listed in the SUBJECT_ID column. To speed up study processing through the dbGaP auto-pipeline, sex values have been restricted to M/Male/1 or F/Female/2 or UNK/Unknown or left empty, and should match the sex values entered into the <a href="#pdds">Pedigree DS</a> if a pedigree DS is applicable. All other values will require a resubmission.</p>


<p><strong>Aliases or Overlapping Subjects between Studies</strong></p>


<p>Include the variables SUBJECT_SOURCE and SOURCE_SUBJECT_ID <strong>ONLY IF</strong>: </p>


<ol>
<li>Your study has subjects that are included in another dbGaP study <em><strong>OR</strong></em> </li>
<li>Your subjects are available in a public repository with an established namespace (Coriell, NRGR, NINDS, NIMH, etc.) </li>
</ol>


<p>This will enable dbGaP to assign the same dbGaP generated subject ID for a person and prevent users from double counting the same person downloaded from multiple dbGaP studies. If you are planning to make SUBJECT_ID = SOURCE_SUBJECT_ID for all subjects, in other words, your list of subjects is a complete overlap to the source, please let your phenotype curator know. Rather than submitting 2 additional variables (SUBJECT_SOURCE and SOURCE_SUBJECT_ID) in your Subject Consent files, your curator can assign the same dbGaP labeled study repository as the other dbGaP study or repository.</p>


<p><strong>Column 4 and 5: SUBJECT_SOURCE and SOURCE_SUBJECT_ID (Submit both variables. We are unable to process SOURCE_SUBJECT_ID without a SUBJECT_SOURCE).</strong></p>


<p>SUBJECT_SOURCE: Provide the namespace, such as the name of the public repository or existing dbGaP subject repository.<br />
SOURCE_SUBJECT_ID: Provide the <a href="#twostepdeid">de-identified subject ID</a> used in the source. Follow <a href="#subjidgloss">guidelines for SUBJECT_ID</a>.</p>


<ol>
<li>For subjects who have participated in another dbGaP study, the value to use for SUBJECT_SOURCE is indicated under "dbgap_subject_repository," the 6th column of the  <a href="#sstrgloss">Subject Sample Telemetry Report (SSTR)</a>. The SSTR may be found by first performing a search for the dbGaP accession number of the overlapping study on the dbGaP home page to locate that study's public page. From there, select the link Subject Sample Telemetry Report (SSTR). If you do not know the accession number of the overlapping study or are unsure of what to use as the SUBJECT_SOURCE, please work with your phenotype curator to obtain this information.</li>
<li>For referencing HapMap subjects from Coriell, the SUBJECT_SOURCE value should be written as "Coriell". The SOURCE_SUBJECT_ID should be written as the de-identified subject ID assigned by Coriell. Please make sure the SEX value of the subject matches the value listed on the Coriell website.</li>
<li>The SUBJECT_ID and SOURCE_SUBJECT_ID can have identical or different IDs.</li>
<li>For SUBJECT_IDs that map to more than one existing or public repository, use SUBJECT_SOURCE and SOURCE_SUBJECT_ID for the first set of aliases and create additional columns for SUBJECT_SOURCE2 and SOURCE_SUBJECT_ID2, SUBJECT_SOURCE3 and SOURCE_SUBJECT_ID3, etc. Note, do not include the number "1" in the first set of aliases. For example, if you have <code>SUBJECT_ID 101</code> who is known as <code>NA1111(Coriell)</code> and <code>45678(NHGRI)</code>, for the first alias, use SUBJECT_SOURCE=Coriell and SOURCE_SUBJECT_ID=NA1111; for the second alias, use SUBJECT_SOURCE2=NHGRI and SOURCE_SUBJECT_ID2=45678.</li>
<li>Avoid a SUBJECT_SOURCE that is very general coupled with a SOURCE_SUBJECT_ID that is a simple integer. For example, SUBJECT_SOURCE=University of California and SOURCE_SUBJECT_ID=1. There is a potential for unintended subject collision; that is, two different people are assigned the same source and ID across studies. There are many University of Californias and there are many studies that use 1 as an ID.</li>
</ol>


<p><a name="scds" id="scds"></a>
<strong>Example of Subject Consent DS File</strong></p>


<table>
<thead>
<tr>
<th>SUBJECT_ID</th>
<th>CONSENT</th>
<th>SEX</th>
<th>SUBJECT_SOURCE</th>
<th>SOURCE_SUBJECT_ID</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>1</td>
<td>1</td>
<td></td>
<td></td>
</tr>
<tr>
<td>2</td>
<td>1</td>
<td>1</td>
<td>NRGR</td>
<td>1012</td>
</tr>
<tr>
<td>3</td>
<td>1</td>
<td>1</td>
<td>NINDS</td>
<td>NDS00008</td>
</tr>
<tr>
<td>4</td>
<td>1</td>
<td>2</td>
<td></td>
<td></td>
</tr>
<tr>
<td>5</td>
<td>1</td>
<td>2</td>
<td>Example Consortium</td>
<td>1284yA8-B</td>
</tr>
<tr>
<td>6</td>
<td>1</td>
<td>UNK</td>
<td></td>
<td></td>
</tr>
<tr>
<td>7</td>
<td>1</td>
<td>UNK</td>
<td></td>
<td></td>
</tr>
<tr>
<td>8</td>
<td>1</td>
<td>1</td>
<td>Coriell</td>
<td>NA1234</td>
</tr>
<tr>
<td>9</td>
<td>1</td>
<td>1</td>
<td>NLM</td>
<td>13</td>
</tr>
<tr>
<td>10</td>
<td>1</td>
<td>2</td>
<td></td>
<td></td>
</tr>
<tr>
<td>1001</td>
<td>0</td>
<td>2</td>
<td></td>
<td></td>
</tr>
<tr>
<td>1002</td>
<td>0</td>
<td>1</td>
<td></td>
<td></td>
</tr>
</tbody>
</table>


<p><a name="scdd" id="scdd"></a>
<strong>Example of Subject Consent DD File</strong></p>


<table>
<thead>
<tr>
<th>VARNAME</th>
<th>VARDESC</th>
<th>TYPE</th>
<th>VALUES</th>
<th></th>
<th></th>
</tr>
</thead>
<tbody>
<tr>
<td>SUBJECT_ID</td>
<td>Subject ID</td>
<td>string</td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>CONSENT</td>
<td>Registered consent groups (Data Use Limitations (DUL)) as determined by submitters' Institutional Review Boards (IRB) or equivalent body.</td>
<td>encoded value</td>
<td>1=General Research Use (GRU)</td>
<td></td>
<td></td>
</tr>
<tr>
<td>SEX</td>
<td>Biological sex</td>
<td>encoded value</td>
<td>1=Male</td>
<td>2=Female</td>
<td>UNK=Unknown</td>
</tr>
<tr>
<td>SUBJECT_SOURCE</td>
<td>Source repository where subjects originate</td>
<td>string</td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>SOURCE_SUBJECT_ID</td>
<td>Subject ID used in the Source Repository</td>
<td>string</td>
<td></td>
<td></td>
<td></td>
</tr>
</tbody>
</table>


<p><a name="assm" id="assm"></a></p>


<h3 id="10-how-do-i-create-subject-sampl">10. How do I create Subject Sample Mapping (SSM) DS and DD files?</h3>


<p>The SSM is a mapping of SUBJECT_IDs (consented subjects and their phenotype data) to SAMPLE_IDs. This list of SAMPLE_IDs is an assertion of the samples that will be submitted in the <a href="#ageno">molecular data</a>, <a href="#aSRA">high throughput sequence data</a>, or <a href="#ncbidb">linked to an NCBI database: GEO, GenBank, non-human public SRA</a>. Open the templates under <strong><a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Phenotype_Data">Phenotype_Data</a></strong>:<br />
<strong><a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Phenotype_Data/3a_SSM_DS.txt">3a_SSM_DS.txt</a></strong><br />
<strong><a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Phenotype_Data/3b_SSM_DD.xlsx">3b_SSM_DD.xlsx</a></strong></p>


<p>The required variables are <strong>SUBJECT_ID</strong> and <strong>SAMPLE_ID</strong>.</p>


<p><strong>Column 1: SUBJECT_ID</strong></p>


<p>The first column must be the <a href="#twostepdeid">de-identified IDs</a> of the subjects. Only enter SUBJECT_IDs that are linked to SAMPLE_IDs with submitted <a href="#ageno">molecular data</a>, <a href="#aSRA">high throughput sequence data</a>, or <a href="#ncbidb">linked to an NCBI database: GEO, GenBank, non-human public SRA</a>. If a subject does not have these data types, do not include the subject ID. Subjects listed in the SUBJECT_ID column must be consented with CONSENT &gt; 0 or are publicly available controls with unrestricted use (CONSENT=0) in the <a href="#scds">Subject Consent DS</a>. For SUBJECT_IDs with multiple types of molecular data (e.g. SNP array data, RNA expression data, sequencing data), use multiple rows with identical subject ID, but distinct sample IDs. See <a href="#subjidgloss">SUBJECT_ID</a> in Glossary for full requirement details.</p>


<p><strong>Column 2: SAMPLE_ID</strong></p>


<p>The second column must be the <a href="#twostepdeid">de-identified IDs</a> of the samples. The SAMPLE_IDs in this column must be identical to those used in the <a href="#ageno">molecular data (PLINK, VCFs, etc)</a> and <a href="#aSRA">sequence metadata</a>. Different sample runs or aliquots of the same sample should be identified by different SAMPLE_IDs, but the same SUBJECT_IDs. Likewise, intended duplicates should also be identified by different SAMPLE_IDs, but the same SUBJECT_IDs. <a href="#ncbidb">Sample IDs linking to a public NCBI resource</a> (GEO, GenBank, public SRA) should also be included. The SAMPLE_ID column should not have any repeating IDs. See <a href="#sampidgloss">SAMPLE_ID</a> in Glossary for full requirement details.</p>


<p><strong>Can the SAMPLE_ID be the same as the SUBJECT_ID?</strong></p>


<p>Yes, the SAMPLE_ID can be the same as the SUBJECT_ID. Here are some common scenarios:</p>


<ol>
<li>If the study has 1:1 subject and sample IDs, please still submit an SSM listing the SUBJECT_ID and SAMPLE_ID identically.</li>
<li>If the molecular data uses subject IDs, then treat the subject IDs as sample IDs, listing SUBJECT_ID and SAMPLE_ID identically. Please verify that each person is only assigned a single SUBJECT_ID.</li>
<li>A person has multiple samples and one of the sample IDs is identical to the subject ID. This is acceptable.</li>
</ol>


<p><a name="ssmds" id="ssmds"></a>
<strong>Example of SSM DS File</strong></p>


<table>
<thead>
<tr>
<th>SUBJECT_ID</th>
<th>SAMPLE_ID</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>S1</td>
</tr>
<tr>
<td>2</td>
<td>S2</td>
</tr>
<tr>
<td>3</td>
<td>S3</td>
</tr>
<tr>
<td>4</td>
<td>S4</td>
</tr>
<tr>
<td>5</td>
<td>S5</td>
</tr>
<tr>
<td>6</td>
<td>S6</td>
</tr>
<tr>
<td>6</td>
<td>S7</td>
</tr>
<tr>
<td>7</td>
<td>S8</td>
</tr>
<tr>
<td>7</td>
<td>S9</td>
</tr>
<tr>
<td>7</td>
<td>S10</td>
</tr>
<tr>
<td>8</td>
<td>S11</td>
</tr>
<tr>
<td>8</td>
<td>S12</td>
</tr>
</tbody>
</table>


<p><a name="ssmdd" id="ssmdd"></a>
<strong>Example of SSM DD File</strong></p>


<table>
<thead>
<tr>
<th>VARNAME</th>
<th>VARDESC</th>
<th>TYPE</th>
<th>VALUES</th>
</tr>
</thead>
<tbody>
<tr>
<td>SUBJECT_ID</td>
<td>Subject ID</td>
<td>string</td>
<td></td>
</tr>
<tr>
<td>SAMPLE_ID</td>
<td>Sample ID</td>
<td>string</td>
<td></td>
</tr>
</tbody>
</table>


<p><a name="aped" id="aped"></a></p>


<h3 id="11-how-do-i-create-pedigree-ds-a">11. How do I create Pedigree DS and DD files?</h3>


<p>The Pedigree DS lists the genealogical relationships of subjects within a study. If there are no known relationships, this file does not need to be submitted. However, if dbGaP finds that there are possible relationships between subjects after reviewing the genetic data (with the <a href="https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/Software.cgi">GRAF</a> [<strong>G</strong>enetic <strong>R</strong>elationship <strong>a</strong>nd <strong>F</strong>ingerprinting] software), dbGaP will request a pedigree DS or include a README file with the results of IBD and/or dbGaP GRAF. If the IBD or pedigree information should not be released because of data sharing limitations, please let dbGaP know in writing. See <a href="#grafgloss">GRAF</a> in the Glossary for more information. Open the templates under <strong><a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Phenotype_Data">Phenotype_Data</a></strong>:<br />
<strong><a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Phenotype_Data/4a_Pedigree_DS.txt">4a_Pedigree_DS.txt</a></strong><br />
<strong><a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Phenotype_Data/4b_Pedigree_DD.xlsx">4b_Pedigree_DD.xlsx</a></strong></p>


<p>The required variables are <strong>FAMILY_ID</strong>, <strong>SUBJECT_ID</strong>, <strong>FATHER</strong>, <strong>MOTHER</strong>, and <strong>SEX</strong>. </p>


<p><strong>MZ_TWIN_ID</strong> is required if applicable.</p>


<p><strong>Column 1: FAMILY_ID</strong></p>


<p>FAMILY_IDs are de-identified and should be the same for members of the same family.</p>


<p><strong>Column 2: SUBJECT_ID</strong></p>


<p>SUBJECT_IDs should include any person with familial relationships relevant to the study. The SUBJECT_ID column should also include FATHER and MOTHER IDs. All SUBJECT_IDs of the pedigree file should be included in the Subject Consent (SC) DS, where the study subjects have CONSENT &gt;=1 and linking pedigree SUBJECT_IDs have CONSENT=0. See <a href="#subjidgloss">SUBJECT_ID</a> in Glossary for full requirement details.</p>


<p><strong>Columns 3 and 4: FATHER and MOTHER</strong></p>


<p>List FATHER IDs in Column 3 and MOTHER IDs in Column 4. FATHER and MOTHER IDs should be unique and de-identified. Each FATHER ID and MOTHER ID should be included in the SUBJECT_ID column of both the Pedigree DS and the Subject Consent (SC) DS. For SUBJECT_IDs that do not have parents, the FATHER and MOTHER IDs should be filled with 0 or left blank. <a href="#dummyidgloss">Dummy IDs</a> should be created for the FATHER and MOTHER IDs if no ID is known and it is necessary to indicate sibling or avuncular relationships.</p>


<p><strong>Column 5: SEX</strong></p>


<p>Provide the biological sex value of the person listed in the SUBJECT_ID column. To speed up study processing through the dbGaP auto-pipeline, sex values have been restricted to M/Male/1 or F/Female/2 or UNK/Unknown or left empty, and should match the sex values entered into the <a href="#asc">Subject Consent DS</a>. All other values will require a resubmission.</p>


<p><strong>Column 6: MZ_TWIN_ID</strong></p>


<p>De-identified monozygotic twin IDs should indicate monozygotic twins and multiples of the same family. The MZ_TWIN_ID column should distinguish sample duplicates from samples of monozygotic twins. Monozygotic twins and multiples should be assigned the same MZ_TWIN_ID, FATHER, and MOTHER IDs, but different SUBJECT_IDs. For dizygotic twins and all other individuals, the MZ_TWIN_ID column should be left blank. If you wish to identify dizygotic twins, an additional variable may be included in the subject phenotypes DS.</p>


<p><strong>How should I list families with half siblings?</strong></p>


<p>You may list families with half siblings using either example, with Example 1 being preferred. Please remember to include the SEX column and, if applicable, the MZ_TWIN_ID column.</p>


<ul>
<li>
<p>Example 1:</p>
<blockquote>
<table>
<thead>
<tr>
<th align="center">FAMILY_ID</th>
<th>SUBJECT_ID</th>
<th>FATHER</th>
<th>MOTHER</th>
</tr>
</thead>
<tbody>
<tr>
<td align="center">1</td>
<td>A</td>
<td>C</td>
<td>D</td>
</tr>
<tr>
<td align="center">1</td>
<td>B</td>
<td>C</td>
<td>E</td>
</tr>
<tr>
<td align="center">1</td>
<td>C</td>
<td>0</td>
<td>0</td>
</tr>
<tr>
<td align="center">1</td>
<td>D</td>
<td>0</td>
<td>0</td>
</tr>
<tr>
<td align="center">1</td>
<td>E</td>
<td>0</td>
<td>0</td>
</tr>
</tbody>
</table>
</blockquote>
</li>
<li>
<p>Example 2:</p>
<blockquote>
<table>
<thead>
<tr>
<th align="center">FAMILY_ID</th>
<th>SUBJECT_ID</th>
<th>FATHER</th>
<th>MOTHER</th>
</tr>
</thead>
<tbody>
<tr>
<td align="center">1</td>
<td>A</td>
<td>C</td>
<td>D</td>
</tr>
<tr>
<td align="center">1</td>
<td>B</td>
<td>C</td>
<td>E</td>
</tr>
<tr>
<td align="center">1</td>
<td>C</td>
<td>0</td>
<td>0</td>
</tr>
<tr>
<td align="center">1</td>
<td>D</td>
<td>0</td>
<td>0</td>
</tr>
<tr>
<td align="center">2</td>
<td>E</td>
<td>0</td>
<td>0</td>
</tr>
</tbody>
</table>
</blockquote>
</li>
</ul>


<p><a name="pdds" id="pdds"></a>
<strong>Example of a Pedigree DS File</strong></p>


<table>
<thead>
<tr>
<th>FAMILY_ID</th>
<th>SUBJECT_ID</th>
<th>FATHER</th>
<th>MOTHER</th>
<th>SEX</th>
<th>MZ_TWIN_ID</th>
</tr>
</thead>
<tbody>
<tr>
<td>100</td>
<td>1001</td>
<td>0</td>
<td>0</td>
<td>2</td>
<td></td>
</tr>
<tr>
<td>100</td>
<td>1002</td>
<td>0</td>
<td>0</td>
<td>1</td>
<td></td>
</tr>
<tr>
<td>100</td>
<td>1</td>
<td>1002</td>
<td>1001</td>
<td>1</td>
<td>1</td>
</tr>
<tr>
<td>100</td>
<td>2</td>
<td>1002</td>
<td>1001</td>
<td>1</td>
<td>1</td>
</tr>
<tr>
<td>101</td>
<td>1011</td>
<td>0</td>
<td>0</td>
<td>2</td>
<td></td>
</tr>
<tr>
<td>101</td>
<td>1012</td>
<td>0</td>
<td>0</td>
<td>1</td>
<td></td>
</tr>
<tr>
<td>101</td>
<td>3</td>
<td>1012</td>
<td>1011</td>
<td>1</td>
<td></td>
</tr>
<tr>
<td>102</td>
<td>1022</td>
<td>0</td>
<td>0</td>
<td>2</td>
<td></td>
</tr>
<tr>
<td>102</td>
<td>1023</td>
<td>0</td>
<td>0</td>
<td>1</td>
<td></td>
</tr>
<tr>
<td>102</td>
<td>4</td>
<td>1023</td>
<td>1022</td>
<td>2</td>
<td></td>
</tr>
</tbody>
</table>


<p><a name="pddd" id="pddd"></a>
<strong>Example of a Pedigree DD File</strong></p>


<table>
<thead>
<tr>
<th>VARNAME</th>
<th>VARDESC</th>
<th>TYPE</th>
<th>VALUES</th>
<th></th>
<th></th>
</tr>
</thead>
<tbody>
<tr>
<td>FAMILY_ID</td>
<td>Family ID</td>
<td>string</td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>SUBJECT_ID</td>
<td>Subject ID</td>
<td>string</td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>FATHER</td>
<td>Father's Subject ID</td>
<td>string</td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>MOTHER</td>
<td>Mother's Subject ID</td>
<td>string</td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>SEX</td>
<td>Biological sex</td>
<td>encoded value</td>
<td>1=Male</td>
<td>2=Female</td>
<td>UNK=Unknown</td>
</tr>
<tr>
<td>MZ_TWIN_ID</td>
<td>Twin ID for monozygotic twins and multiples. An MZ_TWIN_ID is not provided for dizygotic twins or multiples.</td>
<td>string</td>
<td></td>
<td></td>
<td></td>
</tr>
</tbody>
</table>


<h2 data-heading="h2" data-no-toc="true">Subject Phenotypes and Sample Attributes DS and DD Files</h2>


<p><a name="crucialdata" id="crucialdata"></a></p>


<h3 id="12-what-data-must-be-included-in">12. What data must be included in the Subject Phenotypes and Sample Attributes?</h3>


<p>Metadata around the experiment or study and annotations that are necessary to reproduce any published table or analysis must be included with genomic data submissions. In particular, data pertinent to the interpretation of genomic data -- such as associated phenotype data (e.g. clinical information), exposure data, relevant metadata, and <a href="#aphd">descriptive information</a> (e.g. protocols or methodologies used) -- are expected to be shared. <strong>To avoid user questions, make sure to include self-reported RACE and relevant dates (e.g., birth, diagnosis, sample collection) written as years or normalized to a set point in time, along with any phenotypes, measured or collected data that are described in your Study Description.</strong> For the Subject Phenotypes, it would be data relevant to the individual person. For the Sample Attributes, it would be data relevant to the sample derived from the person. For instance, do not list the RACE variable in the Sample Attributes, since RACE is stable for a person across samples. However, for variables like TREATMENT, if the person was only treated once, and data was collected, then TREATMENT could belong in the Subject Phenotypes table. However, if TREATMENT was completed multiple times, and each time a sample was extracted, then it would be better for TREATMENT to be tracked in the Sample Attributes table.</p>


<p><a name="asubjpheno" id="asubjpheno"></a></p>


<h3 id="13-how-do-i-create-subject-pheno">13. How do I create Subject Phenotypes DS and DD files?</h3>


<p>The Subject Phenotypes DS file includes measured and/or descriptive traits per individual person. The primary ID in this file is the SUBJECT_ID. Open the templates under <strong><a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Phenotype_Data">Phenotype_Data</a></strong>:<br />
<strong><a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Phenotype_Data/5a_SubjectPhenotypes_DS.txt">5a_SubjectPhenotypes_DS.txt</a></strong><br />
<strong><a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Phenotype_Data/5b_SubjectPhenotypes_DD.xlsx">5b_SubjectPhenotypes_DD.xlsx</a></strong></p>


<p><strong>Column 1: SUBJECT_ID</strong></p>


<p>Each SUBJECT_ID needs to be unique and should be linked to only 1 row of data in the DS. All SUBJECT_IDs included in this file must be found in the subject consent (SC) DS with CONSENT &gt; 0. No CONSENT=0 SUBJECT_IDs should appear in the Subject Phenotypes DS. CONSENT=0 subjects are not permitted to have individual level data. See <a href="#subjidgloss">SUBJECT_ID</a> in Glossary for full requirement details.</p>


<p><strong>All other Column Headers: VARNAMES (variable names)</strong></p>


<p>Submit the following types of variables:</p>


<ol>
<li>Review section: "<a href="#crucialdata">What data must be submitted</a>"</li>
<li>Affection status: Provide the disease or phenotype of cases in the VARDESC for this variable. Do not use this variable if your study does not involve the comparison of cases and controls for singular diseases or phenotypes sharing a common pathological origin.</li>
<li>Race/ethnicity/ancestry/heritage</li>
<li>Relevant dates (e.g., birth, diagnosis) written as years or normalized to a set point in time. Do not include month and days directly tied to the person, which are considered HIPAA sensitive. Click here to see the algorithm dbGaP uses to find HIPAA sensitive dates: <a href="#hipaagloss">HIPAA</a></li>
<li>Since the SEX variable is already required in the <a href="#asc">Subject Consent DS</a>, there is no need to resubmit it in the Subject Phenotypes DS. However, if it is already part of your data, you do not need to go through the extra work of removing it from the Subject Phenotypes DS.</li>
</ol>


<p><a name="lotspheno" id="lotspheno"></a>
<strong>Can I submit multiple subject phenotypes DS files?</strong></p>


<p>You may submit multiple subject phenotypes DS/DD. Subject phenotypes files can be split by race/ethnicity, cohort, collection period, etc. The file name should indicate how the multiple subject phenotypes are split. The primary ID in each subject phenotypes file should be the SUBJECT_ID.</p>


<p><strong>How do I submit data that has been measured serially or longitudinally?</strong></p>


<p>If each SUBJECT_ID has a series of measurements or the data are longitudinal, below are the formatting options for this data:</p>


<ol>
<li>The first subject phenotypes DS may include all the variables that are stable through events, e.g. biological sex, race, prior history. A second subject phenotypes DS may include all the variables that change per event or time for a person. For example, when a dataset has a single SUBJECT_ID listed multiple times due to measures collected at different events, this would be considered a longitudinal dataset. To make a row unique, unique (composite) keys should have scientific significance and aid in searching for covariate data. Unique keys should not be marked for every single variable in the dataset. Going back to the example, in the corresponding DD, mark an "X" under the <a href="#uniqkey">UNIQUEKEY</a> column for the variables SUBJECT_ID + EVENT. This means that for each subject at a particular event, there is a set of relevant data collected.</li>
<li>Alternatively, you could create a single subject phenotypes DS, but have your table stretch in columns, where each event and number is a variable, such as mi_event1, mi_event2, stk_event1, stk_event2, etc., and the value would be binary. In this model, each SUBJECT_ID would only be listed once. You'd also need mi_event1_dayssinceoccurrence, weight_@_mi_event_1, etc. We have received both types of submissions. We prefer option 1.</li>
</ol>


<p><a name="spds" id="spds"></a>
<strong>Example of a Subject Phenotypes DS File</strong></p>


<table>
<thead>
<tr>
<th>SUBJECT_ID</th>
<th>AFFECTION_STATUS</th>
<th>RACE</th>
<th>EDUCATION</th>
<th>AGE</th>
<th>AGE_ONSET</th>
<th>HEIGHT</th>
<th>WEIGHT</th>
<th>KRAS</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>1</td>
<td>African American</td>
<td>4</td>
<td>35</td>
<td>25</td>
<td>67</td>
<td>180.2</td>
<td>yes</td>
</tr>
<tr>
<td>2</td>
<td>2</td>
<td>Asian</td>
<td>20</td>
<td>56</td>
<td>54</td>
<td>67</td>
<td>201.5</td>
<td>no</td>
</tr>
<tr>
<td>3</td>
<td>2</td>
<td>European</td>
<td>40</td>
<td>1000</td>
<td>45</td>
<td>60</td>
<td>160.5</td>
<td>yes</td>
</tr>
<tr>
<td>4</td>
<td>1</td>
<td>Latin American</td>
<td>20</td>
<td>37</td>
<td>35</td>
<td>75</td>
<td>99.5</td>
<td>no</td>
</tr>
<tr>
<td>5</td>
<td>2</td>
<td>Asian</td>
<td>10</td>
<td>46</td>
<td>40</td>
<td>61</td>
<td>315.2</td>
<td>no</td>
</tr>
</tbody>
</table>


<p><a name="spdd" id="spdd"></a>
<strong>Example of a Subject Phenotypes DD File</strong> - this is one table, but has been split into two for viewing purposes. For details about each column header in the DD, see the <a href="#appdx">APPENDIX</a>.</p>


<table>
<thead>
<tr>
<th>VARNAME</th>
<th>VARDESC</th>
<th>DOCFILE</th>
<th>TYPE</th>
<th>UNITS</th>
<th>MIN</th>
<th>MAX</th>
<th>RESOLUTION</th>
<th>COMMENT1</th>
<th>COMMENT2</th>
</tr>
</thead>
<tbody>
<tr>
<td>SUBJECT_ID</td>
<td>Subject ID</td>
<td></td>
<td>string</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>AFFECTION_STATUS</td>
<td>Case control status of the subject for [please fill in phenotypic term]</td>
<td>Diagnosis.pdf</td>
<td>encoded value</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>RACE</td>
<td>Self-reported race</td>
<td>Main_exam.pdf</td>
<td>string</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>EDUCATION</td>
<td>Level of education</td>
<td>Main_exam.pdf</td>
<td>encoded value</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>AGE</td>
<td>Subject age at enrollment</td>
<td>Diagnosis.pdf</td>
<td>integer, encoded value</td>
<td>years</td>
<td>0</td>
<td>&gt;89</td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>AGE_ONSET</td>
<td>Disease onset age</td>
<td>Diagnosis.pdf</td>
<td>integer</td>
<td>years</td>
<td>0</td>
<td>&gt;89</td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>HEIGHT</td>
<td>Height measured at enrollment</td>
<td>Diagnosis.pdf</td>
<td>decimal</td>
<td>inches</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>WEIGHT</td>
<td>Subject's weight</td>
<td>Diagnosis.pdf</td>
<td>decimal, encoded value</td>
<td>pounds</td>
<td></td>
<td></td>
<td>1</td>
<td></td>
<td></td>
</tr>
<tr>
<td>KRAS</td>
<td>Somatic mutation in KRAS (Entrez GeneID: 3845)</td>
<td>Cancer.docx</td>
<td>string</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
</tbody>
</table>


<table>
<thead>
<tr>
<th>VARIABLE_SOURCE</th>
<th>SOURCE_VARIABLE_ID</th>
<th>VARIABLE_MAPPING</th>
<th>UNIQUEKEY</th>
<th>COLLINTERVAL</th>
<th>ORDER</th>
<th>VALUES</th>
<th></th>
<th></th>
<th></th>
<th></th>
</tr>
</thead>
<tbody>
<tr>
<td>NCI</td>
<td>Subject ID</td>
<td></td>
<td>X</td>
<td>Collected in Exam 1</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td></td>
<td></td>
<td></td>
<td></td>
<td>Collected in Exam 1</td>
<td></td>
<td>1=Control</td>
<td>2=Case</td>
<td>3=Other</td>
<td></td>
<td></td>
</tr>
<tr>
<td>MSH</td>
<td>Race</td>
<td></td>
<td></td>
<td>Collected in Exam 1</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>MSH</td>
<td>Educational Status</td>
<td></td>
<td></td>
<td>Collected in Exam 1</td>
<td></td>
<td>99=NA</td>
<td>10=Elementary</td>
<td>20=High School</td>
<td>40=College</td>
<td>4=Graduate School</td>
</tr>
<tr>
<td>PhenX</td>
<td>PX010101020000</td>
<td>Identical</td>
<td></td>
<td>Collected in Exam 1</td>
<td>List</td>
<td>9999=Missing</td>
<td>1000=Not assessed</td>
<td>INTEGERS</td>
<td></td>
<td></td>
</tr>
<tr>
<td>MSH</td>
<td>Age of Onset</td>
<td></td>
<td></td>
<td>Collected in Exam 1</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>MSH</td>
<td>Body Height</td>
<td></td>
<td></td>
<td>Collected in Exam 1</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>MSH</td>
<td>Body Weight</td>
<td></td>
<td></td>
<td>Collected in Exam 1, 2, 3</td>
<td>List</td>
<td>1000=Not assessed</td>
<td>DECIMALS</td>
<td>9999=Unknown</td>
<td></td>
<td></td>
</tr>
<tr>
<td>LNC</td>
<td>KRAS gene mutations tested for in Blood or Tissue by Molecular genetics method Nominal</td>
<td></td>
<td></td>
<td>Collected in Exam 3</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
</tbody>
</table>


<p><a name="asampattr" id="asampattr"></a></p>


<h3 id="14-how-do-i-create-sample-attrib">14. How do I create Sample Attributes DS and DD files?</h3>


<p>The <strong>Sample Attributes DS</strong> includes measured and/or descriptive traits per individual sample (not person). A person may be represented by multiple samples. Therefore, the primary id in this file is the SAMPLE_ID. Open the templates under <strong><a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Phenotype_Data">Phenotype_Data</a></strong>:<br />
<strong><a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Phenotype_Data/6a_SampleAttributes_DS.txt">6a_SampleAttributes_DS.txt</a></strong><br />
<strong><a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Phenotype_Data/6b_SampleAttributes_DD.xlsx">6b_SampleAttributes_DD.xlsx</a></strong></p>


<p><strong>Column 1: SAMPLE_ID</strong></p>


<p>Only include SAMPLE_IDs that are listed in the subject sample mapping (SSM) DS and belong to SUBJECT_IDs that have CONSENT&gt;0 in the subject consent (SC) DS. SAMPLE_IDs belonging to CONSENT=0 SUBJECT_IDs should not appear in the Sample Attributes DS file. The SAMPLE_ID should use the exact same syntax used for the SAMPLE_ID listed in the SSM. For example, '0AB12' is not the same as 'AB12', nor is '123-1' the same as '123_1'. Each SAMPLE_ID should be represented by 1 row of data in the DS. See <a href="#sampidgloss">SAMPLE_ID</a> in Glossary for full requirement details.</p>


<p><strong>Columns 2-5: NCBI BioSample variables included in the Sample Attributes DS</strong></p>


<p>The NCBI BioSample database (<a href="https://www.ncbi.nlm.nih.gov/biosample/">https://www.ncbi.nlm.nih.gov/biosample/</a>) contains descriptions of biological source materials used in experimental assays. Each of your samples will be assigned a BioSample accession number and will thus be searchable through BioSample. The first three variables below must be included to provide meaningful data for each sample's BioSample entry. HISTOLOGICAL_TYPE should only be included if applicable.</p>


<ol>
<li>BODY_SITE – the collection site of the sample (ex. skin, breast, peripheral blood, inner oral cavity). If the sample is from a xenograft, you may rename the variable.</li>
<li>ANALYTE_TYPE – the analyte type of the sample (ex. DNA, RNA). If the same sample ID was used for both DNA and RNA aliquots, the value should be "DNA/RNA" instead of listing the sample twice. The BioSample database does not allow multiple values for the same sample ID.</li>
<li>IS_TUMOR – the tumor status of the sample. The values can be binary such as yes/no or encoded 1=yes and 2=no. For non-cancer studies, the values in IS_TUMOR should be "no" or "unknown."</li>
<li>HISTOLOGICAL_TYPE – the sample's cell or tissue type/subtype (ex. melanocytes, buccal cells, embryonic stem cells, carcinoma, lymphoma, and mixed types). If the histological type is not known or is identical to the BODY_SITE, do not include this variable.</li>
</ol>


<p><strong>All other Column Headers: VARNAMES (variable names)</strong></p>


<p>Most institutes request all data pertinent to the interpretation of genomic data, such as clinical information, exposure data, and relevant metadata pertaining to the sample. Please note that the template (<a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Phenotype_Data/6a_SampleAttributes_DS.txt">6a_SampleAttributes_DS.txt</a>) provided is based on a cancer study and the variables listed may be useful for cancer studies. However, if your study is not a cancer study, please do not include the cancer variables. Instead, submit additional sample attribute variables that will provide a greater understanding of the study. For example: sample collection date, sample extraction method and date; batch and center effects, sample plate or well number; sample run date, sample QA results; and sample affection status (ex. psoriatic skin sample vs. non-psoriatic skin sample from a case subject who has psoriasis). Relevant dates (e.g., sample collection date) that are directly tied to a person should be written as years or normalized to a set point in time. Do not include month and days directly tied to the person, which are considered HIPAA sensitive. Click here to see the algorithm dbGaP uses to find HIPAA sensitive dates: <a href="#hipaagloss">HIPAA</a>.</p>


<p><a name="lotsattr" id="lotsattr"></a>
<strong>Can I submit multiple sample attributes DS files?</strong></p>


<p>You may submit multiple sample attributes DS/DD. You may split out sample attributes files to separate them by race/ethnicity, cohort, collection period, etc. Each of the sample attributes files should have SAMPLE_ID as the primary id. The BioSample required variables should appear only once per SAMPLE_ID, and the values for the BioSample required variables should not conflict. For example, a SAMPLE_ID cannot be marked as both TUMOR and non-TUMOR. In this case, we would request that an additional SAMPLE_ID be created. If this is not possible, please contact the dbGaP phenotype curator.</p>


<p><strong>How do I submit data that has been measured serially or longitudinally?</strong></p>


<p>If each SAMPLE_ID has a series of measurements or the data are longitudinal, this table may have a SAMPLE_ID listed multiple times. We would treat this as a longitudinal dataset, where SAMPLE_ID + [variable] are the variables that make the row unique. Unique (composite) keys should have scientific significance and aid in searching for covariate data. Unique keys should not be marked for every single variable in the dataset. Mark an "X" under the <a href="#uniqkey">UNIQUEKEY</a> column for the variables in the corresponding DD. In this case, we recommend submitting the BioSample required variables in a separate sample attributes DS/DD.</p>


<p><a name="sads" id="sads"></a>
<strong>Example of a Sample Attributes DS File</strong> - this is one table, but has been split into two for viewing purposes.</p>


<table>
<thead>
<tr>
<th>SAMPLE_ID</th>
<th>BODY_SITE</th>
<th>ANALYTE_TYPE</th>
<th>IS_TUMOR</th>
<th>HISTOLOGICAL_TYPE</th>
<th>COLLECTION_AGE</th>
</tr>
</thead>
<tbody>
<tr>
<td>S1</td>
<td>Skin</td>
<td>DNA</td>
<td>Y</td>
<td>Melanoma</td>
<td>25</td>
</tr>
<tr>
<td>S2</td>
<td>Lung</td>
<td>RNA</td>
<td>Y</td>
<td>Liposarcoma</td>
<td>54</td>
</tr>
<tr>
<td>S3</td>
<td>Buccal</td>
<td>DNA</td>
<td>N</td>
<td>Buccal cells</td>
<td>45</td>
</tr>
<tr>
<td>S4</td>
<td>Skin</td>
<td>RNA</td>
<td>N</td>
<td>Skin</td>
<td>35</td>
</tr>
<tr>
<td>S5</td>
<td>Skin</td>
<td>RNA</td>
<td>N</td>
<td>Keratinocytes</td>
<td>40</td>
</tr>
</tbody>
</table>


<table>
<thead>
<tr>
<th>PRIMARY_METASTATIC_TUMOR</th>
<th>PRIMARY_TUMOR_LOCATION</th>
<th>TUMOR_STAGE</th>
<th>TUMOR_GRADE</th>
<th>TUMOR_TREATMENT</th>
</tr>
</thead>
<tbody>
<tr>
<td>Primary</td>
<td>Skin</td>
<td>II</td>
<td>G3</td>
<td>Chemotherapy and biological therapy</td>
</tr>
<tr>
<td>Primary</td>
<td>Peritoneal cavity</td>
<td>III</td>
<td>G2</td>
<td>Radiation</td>
</tr>
<tr>
<td>NA</td>
<td>NA</td>
<td>NA</td>
<td>NA</td>
<td>NA</td>
</tr>
<tr>
<td>NA</td>
<td>NA</td>
<td>NA</td>
<td>NA</td>
<td>NA</td>
</tr>
<tr>
<td>NA</td>
<td>NA</td>
<td>NA</td>
<td>NA</td>
<td>NA</td>
</tr>
</tbody>
</table>


<p><a name="sadd" id="sadd"></a>
<strong>Example of a Sample Attributes DD File</strong> - For additional options for the DD, see the <a href="#appdx">APPENDIX</a>.</p>


<table>
<thead>
<tr>
<th>VARNAME</th>
<th>VARDESC</th>
<th>TYPE</th>
<th>UNITS</th>
<th>MIN</th>
<th>MAX</th>
<th>UNIQUEKEY</th>
<th>VALUES</th>
<th></th>
</tr>
</thead>
<tbody>
<tr>
<td>SAMPLE_ID</td>
<td>Sample ID</td>
<td>string</td>
<td></td>
<td></td>
<td></td>
<td>X</td>
<td></td>
<td></td>
</tr>
<tr>
<td>BODY_SITE</td>
<td>Body site where sample was collected</td>
<td>string</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>ANALYTE_TYPE</td>
<td>Analyte type</td>
<td>string</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>IS_TUMOR</td>
<td>Tumor status</td>
<td>encoded value</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td>Y=Is tumor</td>
<td>N=Is not a tumor</td>
</tr>
<tr>
<td>HISTOLOGICAL_TYPE</td>
<td>Cell or tissue type or subtype of sample</td>
<td>string</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>COLLECTION_AGE</td>
<td>Subject's age at sample collection</td>
<td>integer</td>
<td>years</td>
<td>0</td>
<td>&gt;89</td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>PRIMARY_METASTATIC_TUMOR</td>
<td>Primary tumor, metastasis, or transformed cell line</td>
<td>string</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>PRIMARY_TUMOR_LOCATION</td>
<td>Primary tumor location</td>
<td>string</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>TUMOR_STAGE</td>
<td>Tumor stage of sample</td>
<td>string</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>TUMOR_GRADE</td>
<td>Tumor grade of sample</td>
<td>string</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>TUMOR_TREATMENT</td>
<td>Type of tumor treatment for sample</td>
<td>string</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
</tbody>
</table>


<p><a name="medimage" id="medimage"></a></p>


<h2 data-heading="h2" data-no-toc="true">Medical Images</h2>


<h3 id="15-how-do-i-submit-medical-image">15. How do I submit Medical Images and in what format?</h3>


<p>De-identified medical image files may be submitted <strong>only</strong> if they meet the following criteria: 1) the images correspond to subjects or samples with phenotype and genomic data that are submitted to dbGaP and 2) the images could not be submitted to an image-specific database or be made publicly available. Since no validation or QC is run on images submitted to dbGaP, we require an attestation, submitted as a README, confirming that no PII or HIPAA-sensitive information is included with the image submission. When submitting multiple files, or a file that is &gt; 1TB, you need to submit them as a zip or tar archive. </p>


<p>Note: H&amp;E Visium slides should be submitted to NCBI <a href="https://www.ncbi.nlm.nih.gov/geo/info/submission.html">GEO</a>.</p>


<p>Also, create a mapping of SUBJECT_IDs to the image files. Open the templates under <strong><a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Medical_Images/">Medical_Images</a></strong>:<br />
<strong><a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Medical_Images/SubjectImageMappingDS.txt">SubjectImageMappingDS.txt</a></strong><br />
<strong><a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Medical_Images/SubjectImageMappingDD.xlsx">SubjectImageMappingDD.xlsx</a></strong></p>


<p><strong>Column 1: SUBJECT_ID</strong></p>


<p>All SUBJECT_IDs included in this file must be found in the subject consent (SC) DS with CONSENT&gt;0. No CONSENT=0 SUBJECT_IDs should appear in the Subject Image Mapping DS. See <a href="#subjidgloss">SUBJECT_ID</a> in Glossary for full requirement details.</p>


<p><strong>Columns 2-5: IMAGE_TYPE, BODY_SITE, FILENAME, FILE_TYPE</strong></p>


<p>Include the following four variables for image data.</p>


<ol>
<li>IMAGE_TYPE – the type of image (ex. CT scan, photograph, MRI).</li>
<li>BODY_SITE – the body site of the image (ex. brain, chest, eye).</li>
<li>FILENAME – the filename including the file extension.</li>
<li>FILE_TYPE – the file type (ex. jpg, dng, tif).</li>
</ol>


<p><strong>All other Column Headers: VARNAMES (variable names)</strong></p>


<p>Any other relevant information related to the image can be included as additional columns.</p>


<p><a name="imageds" id="imageds"></a>
<strong>Example of a Subject Image Mapping DS File</strong></p>


<table>
<thead>
<tr>
<th>SUBJECT_ID</th>
<th>IMAGE_TYPE</th>
<th>BODY_SITE</th>
<th>FILENAME</th>
<th>FILE_TYPE</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>photograph</td>
<td>fundus</td>
<td>fundus01a.jpg</td>
<td>jpg</td>
</tr>
<tr>
<td>1</td>
<td>photograph</td>
<td>fundus</td>
<td>fundus01b.jpg</td>
<td>jpg</td>
</tr>
<tr>
<td>4</td>
<td>photograph</td>
<td>fundus</td>
<td>fundus04a.jpg</td>
<td>jpg</td>
</tr>
<tr>
<td>4</td>
<td>photograph</td>
<td>fundus</td>
<td>fundus04b.jpg</td>
<td>jpg</td>
</tr>
<tr>
<td>6</td>
<td>CT scan</td>
<td>chest</td>
<td>chest06.tif</td>
<td>tif</td>
</tr>
<tr>
<td>7</td>
<td>CT scan</td>
<td>chest</td>
<td>chest07.tif</td>
<td>tif</td>
</tr>
</tbody>
</table>


<p><a name="imagedd" id="imagedd"></a>
<strong>Example of a Subject Image Mapping DD File</strong></p>


<table>
<thead>
<tr>
<th>VARNAME</th>
<th>VARDESC</th>
<th>TYPE</th>
<th>VALUES</th>
</tr>
</thead>
<tbody>
<tr>
<td>SUBJECT_ID</td>
<td>Subject ID</td>
<td>string</td>
<td></td>
</tr>
<tr>
<td>IMAGE_TYPE</td>
<td>Image type</td>
<td>string</td>
<td></td>
</tr>
<tr>
<td>BODY_SITE</td>
<td>Body site of image</td>
<td>string</td>
<td></td>
</tr>
<tr>
<td>FILENAME</td>
<td>Filename including the file extension</td>
<td>string</td>
<td></td>
</tr>
<tr>
<td>FILE_TYPE</td>
<td>File type</td>
<td>string</td>
<td></td>
</tr>
</tbody>
</table>


<p><a name="qcpass" id="qcpass"></a></p>


<h3 id="16-how-do-i-verify-that-my-ds-an">16. How do I verify that my DS and DD Files will pass dbGaP's phenotype quality control (QC) tests?</h3>


<p>Go through this list prior to submission. This list will help you eliminate the most common errors detected in formatting and data consistency. You can also check your Subject Consent DS, Subject Sample Mapping (SSM) DS, Pedigree DS against your Genotype data (PLINK and VCF) on your system using <a href="https://github.com/ncbi/gaptools/blob/master/GaPTools.md">GaPTools</a>. </p>


<ul>
<li>All IDs are two-step <a href="#twostepdeid">de-identified</a>.</li>
<li>Each DS and DD must be submitted as a separate file. Please do not submit multiple worksheets per file.</li>
<li>Submit tab-delimited .txt and .xlsx files only. Tab-delimited txt files are preferable for the DS. Excel (.xlsx) format is preferable for the DD. The final files provided to Authorized Users of the study will be in the tab-delimited txt format.</li>
<li>The DS should be a rectangular table. The number of column headers should not exceed the number of columns of values. Column headers should not be missing. Primary IDs should not be missing for any row. Remove empty rows or columns between data values or above the headers.</li>
<li>File names should not contain special characters, spaces, hyphens, brackets, periods, or forward (/) or backward slashes (\).</li>
<li>Check formatting and spelling of the DS and DD. Remove non-ASCII characters, newline (line feed) or carriage return characters (they sometimes appear as a square or a question mark in a box), and unintended quotes (""").</li>
<li>Check that "dbGaP" is not used in any of the variable names or the IDs. "dbGaP" is reserved for dbGaP generated items that are included in the study release.</li>
<li>Variable names between a DS and its corresponding DD must be identical in syntax. For example, "day_ enrollment" is not the same as "day_enrollment" or "Day_Enrollment". In this example of inconsistent variable names, notice the letter case difference and the extra space.</li>
<li>Variable names and variable descriptions need to be distinct within a dataset.</li>
<li>The same variable name must be used for the ID columns. For example, do not use SUBJECT_ID in a dataset, but Patient_ID in another dataset for the same identifier. If you use SUBJECT_ID as the primary subject ID variable name, then use SUBJECT_ID as the variable name in every dataset that lists out the subjects. Likewise, keep the primary sample ID variable name identical throughout all the datasets.</li>
<li>All SAMPLE_IDs listed in the Subject Sample Mapping (SSM) dataset must match the SAMPLE_IDs in the <a href="#ageno">molecular data</a> and <a href="#aSRA">high throughput sequences</a>. The syntax must be identical. For example, SAMPLE_ID "1034_abc.20" is not the same as SAMPLE_ID "1034-abc.20" or "1034_abc.2".</li>
<li>Remove HIPAA sensitive data, such as patient's name, doctor's name, months and days from dates directly tied to the subject, etc. Year is acceptable. Click here to see the algorithm dbGaP uses to find HIPAA sensitive dates: <a href="#hipaagloss">HIPAA</a></li>
<li>Some HIPAA sensitive data are permissible, such as age &gt; 89 for studies that focus on older populations, or geographic locations, etc. Please work with the dbGaP curator to make sure that the public summaries are correctly hidden.</li>
<li>Define codes for variable <a href="#avalues">values</a> in the respective DD, entering one code definition per cell. For example, "1=Control" in one cell and "2=Case" in a separate cell. Do NOT enter codes delimited with semicolon or commas in a single cell, like "1=Case; 2=Control."</li>
<li>Each row of each dataset must be unique (marked by a <a href="#uniqkey">UNIQUEKEY</a> in the DD). Thus, if a SUBJECT_ID is to appear more than once in a Subject Phenotypes DS, there must be at least one other variable that forms a unique key for each row. The same condition must be met if the same SAMPLE_ID is to be repeated in the Sample Attribute DS.</li>
<li>Remove duplicate SUBJECT_IDs from Subject Consent and Pedigree DS files and remove rows with duplicate SAMPLE_IDs from the SSM DS. </li>
<li>Remove completely identical rows and empty rows.</li>
<li>Check that all subject IDs found in the subject phenotypes DS have CONSENT&gt;0 and all sample IDs in the sample attributes DS belong to subjects that have CONSENT&gt;0. In other words, CONSENT=0 (pedigree linking members and HapMap controls) and unconsented IDs should not be in any individual-level subject phenotypes or sample attributes DS.</li>
<li>If there are multiple sex variables captured for the same person, verify that all sex values reported among the phenotype component datasets (Subject Consent, Pedigree, and Subject Phenotypes DS) are consistent with the sex determined by the genotypes, unless the conflicting variable indicates self-reported sex (Subject Phenotypes DS only).</li>
<li>If there are multiple case control variables captured for the same person, verify that all case control values are consistent for the same individuals.</li>
<li>Double check for data consistency!</li>
</ul>


<p>Review the descriptions of variables in the <a href="#appdx"><strong>APPENDIX</strong></a> for specific instructions on labeling header columns and file-naming conventions. Also read the <a href="#glssry"><strong>Glossary</strong></a> for definitions of variables. To see the QC checks that dbGaP completes for each study, see section "<a href="#aqcchecks">What happens once I submit my core data files and phenotype files?</a>".</p>


<p><a name="aphd" id="aphd"></a></p>


<h2 data-heading="h2" data-no-toc="true">Study Documents</h2>


<h3 id="17-what-type-of-study-documents-">17. What type of Study Documents may I submit and in what format?</h3>


<p>Any document that describes study methods and data collection (e.g., protocols, questionnaires, manuals of procedures and operations, consents) should be submitted and can be published on the public dbGaP page. The preferred file format is pdf, though Word and Excel documents will be accepted. Please submit tabular images in Excel. </p>


<p>The study documents may be annotated by the phenotype curator or submitted with annotations using variable or dataset names. These annotations can be added directly to the document or to a <a href="#appdx">DD under the "DOCFILE" column</a>. The annotations link text segments to corresponding variables and/or datasets. The final annotations will be visible on the public dbGaP pages. Click on the 2 links below to see how to go from the Variable Summary page to the Study Document page and vice versa.</p>


<p><a href="https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/variable.cgi?study_id=phs000001.v3.p1&amp;phv=53743">Variable Summary page</a><br />
<img src="/core/assets/dbgap/images/22_var_sum.png" alt="Screenshot of the Variable Summary page" /></p>


<p><a href="https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/document.cgi?study_id=phs000001.v3.p1&amp;phd=1138">Study Document page</a><br />
<img src="/core/assets/dbgap/images/23_doc_sum.png" alt="Screenshot of the Study Document page" /></p>


<h3 id="18-what-should-i-know-about-edit">18. What should I know about editing, proofreading, and copyright?</h3>


<p><em>Proofreading and Editing</em> – Please proofread and edit your documents thoroughly before submission — they will be posted to the public dbGaP web pages.</p>


<p>dbGaP will not perform any copyediting or proofreading. Any content changes require submission of a new version of the document. Documents that contain potential HIPAA rule violations will not be processed and need to be resubmitted following redactions.</p>


<p><em>Copyright – Previously Published Work</em> – If you submit a published work (article, review, book chapter, questionnaires, etc.) for dbGaP posting, please include documentation that authorizes the public posting on the dbGaP website. If you are unsure about the copyright status of a document, contact the publisher or owner of the work.</p>


<p>NIH does not claim copyright of any submitted documents. However, NIH must be given nonexclusive rights to freely distribute all documents on the dbGaP site. </p>


<p><a name="ageno" id="ageno"></a></p>


<h2 data-heading="h2" data-no-toc="true">Molecular Data</h2>


<h3 id="19-how-do-i-submit-molecular-dat">19. How do I submit Molecular Data to dbGaP?</h3>


<p>BAM, CRAM, and FASTQ files should not be submitted as "Molecular Data" type to the dbGaP Submission Portal. High throughput human sequence data and alignment information should be submitted through a separate process: <a href="#aSRA">High throughput sequencing submission instructions</a>. </p>


<p>Molecular data that is not high throughput sequence data should be submitted to the dbGaP <a href="#spgloss">Submission Portal</a> under the section "Other files" with type "Molecular Data". It should be submitted along with the phenotype data or as early as possible so that it enters a dbGaP genotype curator's queue. Do not submit each file separately, but bundle the files. To compress and bundle files, zip first then tar. Do not tar first then zip as this will significantly delay the processing time.</p>


<p>For VCFs, the files should be compressed using bgzip instead of zip as bgzip's block compression method can be directly used with VCFtools and BCFtools. This enables dbGaP to run qc checks quickly and report back to you any errors. For VCF files larger than 300GB, please split by chromosome, then tar the set of VCFs and submit as a single tarball.</p>


<p><span style="color:red">Essential requirement: Sample IDs must be de-identified. Every sample ID found in an individual level Molecular Data file must be mapped to a consented subject in the Subject Sample Mapping (<a href="#ssmds">SSM</a>) dataset.</span> See <a href="#sampidgloss">SAMPLE_ID</a> in Glossary for full requirement details. Sample IDs that do not follow the requirements will not be processed. If sample IDs are modified, please also modify the corresponding <a href="#asampattr">Sample Attributes</a> dataset. </p>


<p>Please include a README with a brief description of the data that you are submitting. It should minimally include genotyping steps, genome build, and technology if applicable.</p>


<p>Common questions and errors:</p>


<ul>
<li>The sample ID is ideally the final aliquot used for a sequencing run or well on an array plate. A person with a given subject ID can have many samples.</li>
<li>If a sample ID is a technical control such as a Coriell HapMap sample or a publicly available control, it must be mapped to a subject ID in the Subject Sample Mapping (<a href="#ssmds">SSM</a>) dataset and that subject ID must be explicitly marked as CONSENT=0 in the Subject Consent (<a href="#scds">SC</a>) dataset.</li>
<li>Single cells or multiplexed single cells should each be given a unique sample ID.</li>
<li>Sample IDs in sequence derived genotypes (VCFs) must be identical to the sample IDs used in the corresponding sequence data (BAMs).</li>
<li>Include a File Sample Mapping (FSM) file to map sample IDs to single sample data files.</li>
<li>Include a README to describe the content of data files and any QC anomalies, especially if the content is not in one of the formats listed below and fits into the "Other" category.</li>
<li>Check that files are not truncated.</li>
</ul>


<p>See the <a href="/gap/docs/moleculardatasection">Molecular Data</a> section for guidelines, common errors, dbGaP qc checks, and where to submit molecular data.</p>


<p>Click on the links below to hop to a specific molecular data type:</p>


<ul>
<li><a href="/gap/docs/moleculardatasection#aarray">Genotype</a> (SNP array in PLINK format and if available, raw data (Illumina .idat or Affymetrix .cel), and genotype reports)</li>
<li><a href="/gap/docs/moleculardatasection#avcf">SNP, CNV, and structural variants derived from sequence data</a> (.vcf)</li>
<li><a href="/gap/docs/moleculardatasection#aimpute">Imputation</a> (IMPUTE2, MACH, MINIMAC, SHAPEIT)</li>
<li><a href="/gap/docs/moleculardatasection#expressionepigenetic">Expression/Epigenetic</a> array or counts (.txt, .tsv)</li>
<li><a href="/gap/docs/moleculardatasection#amaf">Somatic and/or germline mutation annotations</a> (.maf)</li>
<li><a href="/gap/docs/moleculardatasection#aother">Other</a> (individual and summary level data (.txt or .csv matrix), -omics, single cell, UCSC BED format, etc.)</li>
</ul>


<p><a name="aSRA" id="aSRA"></a></p>


<h3 id="20-how-do-i-submit-high-throughp">20. How do I submit High Throughput Sequence data and alignment information?</h3>


<p>dbGaP accepts high throughput human sequence data in BAM, CRAM, and FASTQ formats. Choose one data storage option below. Existing studies may have a combination of the options but all new submissions should follow a single option.</p>


<ol>
<li>
<p>NCBI Data Storage (SRA): Both <strong>sequence metadata</strong> and <strong>sequence files</strong> are submitted to NCBI and available for download from NCBI servers OR direct cloud access through Google Cloud and AWS (Amazon).  </p>
</li>
<li>
<p>Cloud Data Storage (External Data Source including Trusted Partners): The <strong>sequence metadata</strong> is submitted to NCBI with details of sequence file cloud storage locations. This option requires sponsoring institutes to configure your study with an NIH data repository. <strong>Sequence files</strong> will be accessed either through the cloud storage provider using dbGaP credentials via <a href="#aagloss">Authorized Access</a> or through an NIH data repository platform if available.  </p>
</li>
</ol>


<p>Note: Sequence Read Archive (SRA) - Please do not submit individual-level human sequence data directly to the SRA. While SRA brokers the sequence data for dbGaP, the sequence data should be uploaded through the dbGaP pipeline described below. This ensures that individual-level human sequence data is properly tied to consented individuals in a dbGaP study and users are able to request data through dbGaP's <a href="#aagloss">Authorized Access</a>. For non-human sequence data, such as microbiome or 16S rRNA cleaned of human contamination, please work directly with SRA by going to their <a href="https://www.ncbi.nlm.nih.gov/sra/docs/submit/">website</a>. This is colloquially referred to as "public SRA" as the data does not require <a href="#aagloss">Authorized Access</a>. If you would like to link <strong>publicly available</strong> metagenomic sequences free of human sequence contaminants to controlled access subjects or samples in a dbGaP study, jump to <a href="#srapublic">public SRA</a>.</p>


<p><strong>Steps to submitting Human Sequence to a dbGaP study</strong></p>


<p><em>Option 1: NCBI Data Storage (SRA) - sequence data will be submitted to dbGaP</em></p>


<ol>
<li>Update or verify that your study is configured for sequence data submission by selecting <strong>yes</strong> to #5 "Sequence" in the <strong><a href="https://submit.ncbi.nlm.nih.gov/dbgap/">Submission Portal</a> Study Data Outline</strong>. </li>
<li><strong>Submit Subject Consent (<a href="#asc">SC</a>) and the Subject Sample Mapping (<a href="#assm">SSM</a>) files</strong>. A dbGaP phenotype curator will validate and load the submitted IDs and consents in the dbGaP database, and each sample ID will be assigned an NCBI <a target="_blank" href="https://www.ncbi.nlm.nih.gov/biosample/">BioSample</a> ID (SAMN#). This process instantiates IDs and verifies that sequences submitted for samples belong to consented subjects. This may take a few days.</li>
<li>dbGaP <a href="#spgloss">Submission Portal</a> sends email with a sequence metadata spreadsheet attached with your registered sample IDs already entered. </li>
<li><strong>Complete and Submit</strong> the sequence metadata spreadsheet to the dbGaP <a target="_blank" href="https://submit.ncbi.nlm.nih.gov/dbgap/">Submission Portal</a> for only sequence data you plan to submit for this version of the study. Do not include sequence data that have previously been submitted to the study (for example in an earlier version) in the spreadsheet. Remove sample IDs that do not have sequence data. Take care to not edit the spreadsheet column headers and only use the controlled vocabulary options in fields with a selection menu to ensure that the sequence metadata will pass automated checks. Detailed instructions to complete each column in the sequence metadata spreadsheet are at: <a target="_blank" href="https://www.ncbi.nlm.nih.gov/sra/docs/submitdbgap/#submission-overview">https://www.ncbi.nlm.nih.gov/sra/docs/submitdbgap/#submission-overview</a>.</li>
<li>You will receive an email in 2-3 business days indicating if your sequence metadata has errors and needs to be resubmitted <u>OR</u> has been validated and loaded.</li>
<li>Once your sequence metadata spreadsheet has been validated and loaded, you will receive email instructions to upload sequences through ASPERA. You will be provided with a private key to use the asp-dbgap account.  </li>
<li>Once your sequence data has been uploaded, the files will be validated. Specifically, the number of samples, number of files, file names, and md5s must match exactly what was indicated in the sequence metadata spreadsheet. You will be notified within 5 business days of your sequence upload status.</li>
<li>All sequence data must be processed before a study can be <a href="#arelease">released</a> through <a href="https://dbgap.ncbi.nlm.nih.gov/aa/wga.cgi?page=login">Authorized Access</a>.</li>
</ol>


<p><em>Option 2: Cloud Data Storage (External Data Source including Trusted Partners) - sequence data will not be submitted to dbGaP, rather EDS will provide a cloud location</em></p>


<ol>
<li>This option is only for studies that have an <a href="#edsgloss">External Data Source (EDS)</a> registered in the dbGaP Submission System and the EDS would like to provide cloud data storage locations that can be linked. Most studies will work independently with their EDS to store data and use dbGaP only for authorization, and will not need this option.</li>
<li>Update or verify that your study is configured for sequence data submission by selecting <strong>yes</strong> to #5 "Sequence" in the <strong><a href="https://submit.ncbi.nlm.nih.gov/dbgap/">Submission Portal</a> Study Data Outline</strong>.  </li>
<li><strong>Submit Subject Consent (<a href="#asc">SC</a>) and the Subject Sample Mapping (<a href="#assm">SSM</a>) files</strong>. A dbGaP phenotype curator will validate and load the submitted IDs and consents in the dbGaP database, and each sample ID will be assigned an NCBI <a href="https://www.ncbi.nlm.nih.gov/biosample/">BioSample</a> ID (SAMN#). This process instantiates IDs and verifies that sequences submitted for samples belong to consented subjects. This may take a few days.</li>
<li>dbGaP <a href="#spgloss">Submission Portal</a> sends an email with a sequence metadata spreadsheet attached with your registered sample IDs already entered, as well as the additional columns necessary for cloud data submissions.</li>
<li><strong>Complete and Submit</strong> the sequence metadata spreadsheet to the dbGaP <a href="https://submit.ncbi.nlm.nih.gov/dbgap/">Submission Portal</a> for only sequence data you plan to submit for this version of the study. Do not include sequence data that have previously been submitted to the study (for example in an earlier version) in the spreadsheet. Remove sample IDs that do not have sequence data. Take care to not edit the spreadsheet column headers and only use the controlled vocabulary options in fields with a selection menu to ensure that the sequence metadata will pass automated checks. Detailed instructions to complete each column in the sequence metadata spreadsheet are at: <a target="_blank" href="https://www.ncbi.nlm.nih.gov/sra/docs/submitdbgap/#submission-overview">https://www.ncbi.nlm.nih.gov/sra/docs/submitdbgap/#submission-overview</a>. The spreadsheet will also have 5 additional columns specific to cloud data storage: active_location_URL, Bases, Reads, coverage, AvgReadLength.</li>
<li>A sequence curator will verify that files referenced in your sequence metadata can be accessed. You will need to grant access to NCBI operated accounts for this process to occur. </li>
<li>All sequence data must be processed before a study can be <a href="#arelease">released</a> through <a href="https://dbgap.ncbi.nlm.nih.gov/aa/wga.cgi?page=login">Authorized Access</a>.</li>
</ol>


<p>Do NOT upload sequences until you receive the email confirmation that your sequence metadata spreadsheet has been loaded. Sequence metadata (.xlsx) that is uploaded to the dbGaP Submission Portal typically takes 2-3 business days to process.<br />
Do NOT submit sequence files to the dbGaP Submission Portal ASPERA account (subasp) as the sequence files are destined for the Sequence Read Archive (SRA) that uses dbGaP's controlled access.<br />
<em>Do submit</em> sequence files to the ASPERA account (asp-dbgap) named in the email instructions. Sequence files (BAM, CRAM, FASTQ) typically take 3-5 business days to process, depending on the number and size.<br />
Please split pairs of FASTQ files into subsets that are 250 GB or less when uncompressed. In those cases, additional columns of filetype, filename, and MD5 checksum can be added using the same column titles.<br />
Instructions for sequence metadata and data upload can be found here: <a target="_blank" href="https://www.ncbi.nlm.nih.gov/sra/docs/submitdbgap/">https://www.ncbi.nlm.nih.gov/sra/docs/submitdbgap/</a>.</p>


<p>Contact for questions or status update: <a href="mailto:dbgap-sp-help@ncbi.nlm.nih.gov">dbgap-sp-help@ncbi.nlm.nih.gov</a>.</p>


<p><strong>Tracking samples</strong></p>


<p>A link to the <a href="#sstrgloss">Subject Sample Telemetry Report (SSTR)</a> will be provided when the IDs and consents have been loaded. The SSTR includes a complete list of subjects, samples, consents, dbGaP assigned IDs and study repository, BioSample variables, and sequence_data_details.</p>


<p><strong>RNA sequences</strong></p>


<p>If RNA sequences will be submitted, please consider also submitting expression or read counts and determine whether they will be submitted to dbGaP or NCBI GEO (public). If the counts are submitted to dbGaP, upload as <a href="#ageno">Molecular Data</a> in the dbGaP <a href="https://submit.ncbi.nlm.nih.gov/dbgap/">Submission Portal</a>. If the counts are submitted to NCBI <a href="https://www.ncbi.nlm.nih.gov/geo/">GEO</a>, submit a linking of subject or sample IDs to GEO accessions (GSM######) following the instructions <a href="#ncbidb">here</a>. This will enable dbGaP to link to GEO and vice versa.</p>


<p><strong>Whole Genome, Exome, or Targeted sequences</strong></p>


<p>If whole genome, exome, or targeted sequences will be submitted, please consider also submitting derived variant calls (VCFs or MAFs), which are more frequently used. VCFs or MAFs should be uploaded as <a href="#ageno">Molecular Data</a> in the dbGaP <a href="https://submit.ncbi.nlm.nih.gov/dbgap/">Submission Portal</a>. Please make sure that the sample IDs used in the VCFs or MAFs are the same sample IDs listed in your sequence data and Subject Sample Mapping (<a href="#assm">SSM</a>). If the sample IDs are not found in the files, please create a separate 2 column table to map sample IDs to file names.</p>


<p><strong>May I submit identical sequences (i.e., same file name and md5)?</strong></p>


<p>SRA's system currently will not process duplicate sequences that have the same file name and md5, whether the sequences are submitted to dbGaP SRA or public SRA. If you have a need to submit duplicate sequences to two different dbGaP studies, please directly contact SRA for guidance: <a href="mailto:sra@ncbi.nlm.nih.gov">sra@ncbi.nlm.nih.gov</a>.</p>


<p><a name="aCNV" id="aCNV"></a></p>


<h3 id="21-how-do-i-submit-copy-number-v">21. How do I submit Copy Number Variation (CNV) data?</h3>


<p>CNV is coordinated with <a href="https://www.ncbi.nlm.nih.gov/dbvar">NCBI dbVar</a>. Individual-level CNV data should be submitted to dbGaP and released via controlled access. Summary-level (probe/primer and other assay and frequency information) copy number variation data should be submitted to dbVar and released by the public dbVar. Please click on <a href="https://www.ncbi.nlm.nih.gov/dbvar/content/submission/">dbVar Submission Guide</a> if your study includes CNV data.</p>


<p><a name="ncbidb" id="ncbidb"></a></p>


<h3 id="22-how-do-i-link-individual-stud">22. How do I link individual study subjects/samples to samples that have been submitted to NCBI databases: GEO, GenBank, SRA (public)?</h3>


<p>Create a linking DS and DD of SUBJECT_IDs or SAMPLE_IDs to the accessions used in the applicable databases. In the <a href="https://submit.ncbi.nlm.nih.gov/dbgap/">Submission Portal</a>, mark "yes" for "Subject/Sample ID links to public NCBI databases" in the <a href="#sdogloss">Study Data Outline (SDO)</a>. Upload these files as "Phenotype data" if keyed off of the subject IDs <em>OR</em> upload these files as "Sample Attributes" if keyed off of the sample IDs. If only experiment or project accessions are available, then in the SDO, mark "no" for "Subject/Sample ID links to public NCBI databases". Experiment or project accessions and their corresponding URL can be listed in the Study Config web form under "Study Web Links".</p>


<p><a href="https://www.ncbi.nlm.nih.gov/geo/info/submission.html">GEO</a>: repository of high-throughput gene expression data and hybridization arrays, chips, microarrays. Open the templates under <strong><a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Sample_NCBI_DB_Linking/">Sample_NCBI_DB_Linking</a></strong>:<br />
<strong><a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Sample_NCBI_DB_Linking/SampleGEOLinkingDS.txt">SampleGEOLinkingDS.txt</a></strong><br />
<strong><a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Sample_NCBI_DB_Linking/SampleGEOLinkingDD.xlsx">SampleGEOLinkingDD.xlsx</a></strong></p>


<p><a href="https://www.ncbi.nlm.nih.gov/genbank/submit/">GenBank</a>: genetic sequence database comprising an annotated collection of all publicly available DNA sequences. Open the templates under <strong><a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Sample_NCBI_DB_Linking/">Sample_NCBI_DB_Linking</a></strong>:<br />
<strong><a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Sample_NCBI_DB_Linking/SampleGenBankLinkingDS.txt">SampleGenBankLinkingDS.txt</a></strong><br />
<strong><a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Sample_NCBI_DB_Linking/SampleGenBankLinkingDD.xlsx">SampleGenBankLinkingDD.xlsx</a></strong></p>


<p><a name="srapublic" id="srapublic"></a>
<a href="https://www.ncbi.nlm.nih.gov/sra/docs/submit/">SRA (public)</a>: archive of raw sequencing data and alignment information from high-throughput sequencing platforms of non-human data. Open the templates under <strong><a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Sample_NCBI_DB_Linking/">Sample_NCBI_DB_Linking</a></strong>:<br />
<strong><a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Sample_NCBI_DB_Linking/SamplePublicSRALinkingDS.txt">SamplePublicSRALinkingDS.txt</a></strong><br />
<strong><a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Sample_NCBI_DB_Linking/SamplePublicSRALinkingDD.xlsx">SamplePublicSRALinkingDD.xlsx</a></strong></p>


<p><strong>Column 1: <a href="#subjidgloss">SUBJECT_ID</a> or <a href="#sampidgloss">SAMPLE_ID</a></strong></p>


<p>Eliminate extra work. If additional sample IDs need to be created and/or added to the SSM to account for the sample to NCBI database accession mapping, use the subject IDs in the <a href="#scds">Subject Consent DS</a> instead. Create a mapping of subject IDs to the NCBI database accession: Column 1 will list SUBJECT_IDs and Column 2 will list the corresponding NCBI database accession. Otherwise, use the SAMPLE_ID found in the <a href="#ssmds">SSM DS</a>: Column 1 will list SAMPLE_IDs and Column 2 will list the corresponding NCBI database accession.</p>


<p>A sample ID can be listed multiple times if it has multiple accessions (such as GEO accessions) derived from the same sample. See <a href="#sampidgloss">SAMPLE_ID</a> in Glossary for full requirement details.</p>


<p><strong>Column 2:</strong> NCBI database accession (i.e. <strong>GEO_ACCESSION, GENBANK_ACCESSION, SRA_ACCESSION</strong>)</p>


<p>The sample accessions of the various NCBI databases should be linked to the submitted subject or sample IDs. This column should have distinct IDs. GEO_ACCESSIONS begin with GSM#######. GENBANK_ACCESSIONS begin with HM#######. Non-human SRA_ACCESSIONS begin with SAMN########.</p>


<p><strong>Example of GEO Linking DS and DD File using GSM####### accessions</strong></p>


<table>
<thead>
<tr>
<th>SAMPLE_ID</th>
<th>GEO_ACCESSION</th>
</tr>
</thead>
<tbody>
<tr>
<td>S2</td>
<td>GSM18467693</td>
</tr>
<tr>
<td>S2</td>
<td>GSM18467694</td>
</tr>
<tr>
<td>S2</td>
<td>GSM18467695</td>
</tr>
<tr>
<td>S10</td>
<td>GSM18467696</td>
</tr>
<tr>
<td>S10</td>
<td>GSM18467697</td>
</tr>
<tr>
<td>S10</td>
<td>GSM18467698</td>
</tr>
</tbody>
</table>


<table>
<thead>
<tr>
<th>VARNAME</th>
<th>VARDESC</th>
</tr>
</thead>
<tbody>
<tr>
<td>SAMPLE_ID</td>
<td>Sample ID</td>
</tr>
<tr>
<td>GEO_ACCESSION</td>
<td>GEO accession ID (GSM#)</td>
</tr>
</tbody>
</table>


<p><strong>Example of GenBank Linking DS and DD File using HM####### accessions</strong></p>


<table>
<thead>
<tr>
<th>SAMPLE_ID</th>
<th>GENBANK_ACCESSION</th>
</tr>
</thead>
<tbody>
<tr>
<td>S2</td>
<td>HM258784</td>
</tr>
<tr>
<td>S2</td>
<td>HM258785</td>
</tr>
<tr>
<td>S2</td>
<td>HM258786</td>
</tr>
<tr>
<td>S10</td>
<td>HM258787</td>
</tr>
<tr>
<td>S10</td>
<td>HM258788</td>
</tr>
<tr>
<td>S10</td>
<td>HM258789</td>
</tr>
</tbody>
</table>


<table>
<thead>
<tr>
<th>VARNAME</th>
<th>VARDESC</th>
</tr>
</thead>
<tbody>
<tr>
<td>SAMPLE_ID</td>
<td>Sample ID</td>
</tr>
<tr>
<td>GENBANK_ACCESSION</td>
<td>GenBank accession ID (HM#)</td>
</tr>
</tbody>
</table>


<p><strong>Example of SRA Linking DS and DD File (non-human sequences that are publicly available) using BioSample SAMN######## accessions</strong></p>


<table>
<thead>
<tr>
<th>SAMPLE_ID</th>
<th>SRA_ACCESSION</th>
</tr>
</thead>
<tbody>
<tr>
<td>S2</td>
<td>SAMN2506412</td>
</tr>
<tr>
<td>S10</td>
<td>SAMN2506420</td>
</tr>
<tr>
<td>S13</td>
<td>SAMN2506432</td>
</tr>
<tr>
<td>S14</td>
<td>SAMN2506433</td>
</tr>
<tr>
<td>S15</td>
<td>SAMN2506434</td>
</tr>
<tr>
<td>S16</td>
<td>SAMN2506435</td>
</tr>
</tbody>
</table>


<table>
<thead>
<tr>
<th>VARNAME</th>
<th>VARDESC</th>
</tr>
</thead>
<tbody>
<tr>
<td>SAMPLE_ID</td>
<td>Sample ID</td>
</tr>
<tr>
<td>SRA_ACCESSION</td>
<td>SRA public sequence accession ID (SAMN#)</td>
</tr>
</tbody>
</table>


<p><a name="apha" id="apha"></a></p>


<h2 data-heading="h2" data-no-toc="true">Association Analyses</h2>


<h3 id="23-what-are-association-analysis">23. What are Association Analysis Data Files and how should they be formatted?</h3>


<p>Association analyses are <a href="#gsrgloss">Genomic Summary Results (GSR)</a> that do not include individual level data. They are from genomic association studies and include linkage and burden testing on genotypic and phenotypic traits. They vary on trait, variant type, frequency, and analytic method. To facilitate data sharing, we have created a unified guideline for Minimum Information Required for Association Data (MIRAD) listed below. We also accept the newer GWAS-SSF (GWAS Summary Statistics Format) according to <a href="https://doi.org/10.1101/2022.07.15.500230">Hayhurst, et al., 2023</a>.</p>


<p><strong>MIRAD includes four essential data elements.</strong></p>


<ol>
<li><strong>Locus Identifier</strong>
The identifier includes locus ID and location, but is not limited to rs#, gene ID and SV# for SNP, gene, and structural variant. They can be mapped to the current genome build and can evolve with future reference genome assemblies and NCBI annotations.</li>
<li><strong>Variation summary</strong>
It contains information about alleles, allele frequencies, sample size, and genotype counts per sample group within each locus. To limit the ability of unauthorized parties to infer individual participants, data like counts and frequency are only accessible to users who have been approved for <a href="https://dbgap.ncbi.nlm.nih.gov/aa/wga.cgi?page=login">Authorized Access</a>.</li>
<li><strong>Statistical significance and Effect size</strong>
p-value and/or FDR either come from univariate testing on variants from a single locus or from burden testing on a set of rare variants from a target-region provided by sequencing projects. The effect size includes odds ratio, regression coefficient, relative risk, etc., on effect allele. These data not only help users to find causal variant and haplotype, but also can be used to estimate locus contribution to the heredity of the trait or disease(s).</li>
<li><strong>Phenotype Definition and Analysis Metadata</strong>
Descriptions of the analysis and method, including phenotypic covariates, parameters, and ancestry of participants, are needed for reproducing the result set once the individual data are fully available. The main trait or disease analyzed should be defined based on controlled vocabulary in <a href="https://www.ncbi.nlm.nih.gov/mesh">MeSH</a> terms; study population information and relevant publications using PMIDs should also be provided.</li>
</ol>


<p><strong>Reasoning</strong>: Sharing of these data elements allows other researchers to evaluate supporting evidence and independently verify discoveries with different samples and data models. If individual level genotype is inaccessible, people can directly use them for meta-analysis to increase statistical power or for the development of hypotheses. The data, like locus info, effect allele and effect size, can provide valuable information for genomic medicine.</p>


<p><strong>Our practice</strong>: Using MIRAD, dbGaP has developed several templates for data submission and genome browser display. You are welcome to join the discussion, make suggestions, and comment on the MIRAD proposal. The dbGaP team is committed to bringing new discoveries to the public and research communities and is happy to work with researchers to promote data sharing within the scientific community. </p>


<p>See the instructions in <strong><a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Association_Analysis_Data/Association_Analysis.xlsx">Association_Analysis.xlsx</a></strong> for Case-Controls (Worksheet 1) or Others (Worksheet 2). Each analysis metadata sheet is given a separate analysis accession (pha#.v#) and will need to have a unique name. If GWAS results are submitted as outputs of the software, please give brief descriptions of the column headers, indicating the linking-columns and/or relationships when several files are involved.</p>


<p>The GSR will be posted on the <a href="https://ftp.ncbi.nlm.nih.gov/dbgap/studies/">public FTP</a> site, unless the study investigator and <a href="#gpagloss">GPA</a> specify that the data is sensitive in the dbGaP <a href="#ssgloss">Submission System</a> and needs to be restricted under dbGaP Authorized Access. Additionally, there is the option to add a study with analyses to CADA. CADA stands for the Compilation of Aggregate Genomic Data and is a collection of analyses across many dbGaP studies that can be accessed with a single <a href="#dargloss">Data Access Request</a>. </p>


<p><strong>Guidance for submitting a large number of analyses</strong></p>


<p>Please make sure that the analyses metadata are consistent, so that processing can be scripted. For curators to quickly review the MeSH and population terms, please submit a separate 3 or 4 column table with:<br />
Column 1: Analyses metadata file name<br />
Column 2: MeSH term <a href="https://www.ncbi.nlm.nih.gov/mesh">https://www.ncbi.nlm.nih.gov/mesh</a><br />
Column 3: Ancestry, Race, Ethnicity using broad categories (i.e., Asian, Black or African American, Middle Eastern or North African, Native Hawaiian or Other Pacific Islander, White, More than one population, Hispanic or Latino, Not Hispanic or Latino, African, East Asian, West Asian, European, American). Please match the syntax listed here.<br />
Column 4 (Optional): Column 3 in greater detail using free text, i.e., expanding what multiple populations mean or more specificity</p>


<h2 data-heading="h2" data-no-toc="true">Submitting Files</h2>


<p><a name="whosub" id="whosub"></a></p>


<h3 id="24-who-can-submit-files-to-dbgap">24. Who can submit files to dbGaP?</h3>


<p>A dbGaP study must be registered in the dbGaP Submission System before data can be submitted. Please click on "<a href="https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/GetPdf.cgi?document_name=HowToSubmit.pdf">How to Submit</a>" for the overall schema. The study investigator and the person designated by the study investigator (PI Submitter) will be able to submit along with any other individuals they add as a submitter.</p>


<p><a name="wheresub" id="wheresub"></a></p>


<h3 id="25-where-do-i-submit-my-dbgap-fi">25. Where do I submit my dbGaP files?</h3>


<p>Submit all files through the dbGaP <a href="#spgloss">Submission Portal</a>. Go to <a href="https://submit.ncbi.nlm.nih.gov/dbgap/">https://submit.ncbi.nlm.nih.gov/dbgap/</a>. To safeguard study participants' privacy, dbGaP will not accept individual-level data via email. Once the study is registered, a Submission Portal account is provided to the study investigator and anyone that the study investigator lists as a submitter. To obtain access to the Submission Portal account, please accept the email invitation you have received immediately. The email invitation will expire in 7 days. Once accepted, you may submit your files any time thereafter. Individuals with "manager" roles in the dbGaP Submission Portal can also add in additional submitters.</p>


<p>Additional guidance for file upload:</p>


<ul>
<li>Subject Phenotypes - upload all Subject Phenotypes <a href="#spds">DS</a> and <a href="#spdd">DD</a> and any <a href="#ncbidb">linking files to other NCBI databases</a> if keyed off of subjects.</li>
<li>Sample Attributes - upload all Sample Attributes <a href="#sads">DS</a> and <a href="#sadd">DD</a> and any <a href="#ncbidb">linking files to other NCBI databases</a> if keyed off of samples.</li>
<li>Sequence Metadata - dbGaP will email you a Sequence Metadata file after the subject IDs, sample IDs, and consents have been loaded. This process ensures that submitted sequences are tied to sample IDs that belong to consented subjects. Upload the Sequence Metadata once you have completed filling out the remaining columns. Once a Sequence Metadata has been validated and loaded into our system, you will no longer be able to replace that file. You will be sent a separate email with instructions to upload your high throughput sequence data (BAM, CRAM, FASTQ). To add additional samples, submit another Sequence Metadata file with only the new samples. If the validation reports errors, you will be able to "Replace" the existing Sequence Metadata file. If you need to remove or modify entries in a validated file, please contact <a href="mailto:sra@ncbi.nlm.nih.gov">sra@ncbi.nlm.nih.gov</a>.</li>
<li>Other files<ul>
<li><a href="#ageno">Molecular Data</a> - select type "Molecular Data". No high throughput sequence data (FASTQ, BAM, and CRAM) should be submitted here.</li>
<li><a href="#aphd">Study Documents</a> - select type "Document: Phenotype" if the document can be made available on the public webpage. Some READMEs, genotype qc results, etc are not appropriate for public distribution, and should be submitted under type "Molecular Data" instead and packaged for Authorized Access only.</li>
</ul>
</li>
</ul>


<h3 id="26-what-if-there-are-errors-or-u">26. What if there are errors or updates in the data and I need to resubmit?</h3>


<p>If you must resubmit your files for a new iteration of the current version, please follow these instructions:</p>


<ul>
<li>Do not submit individual-level data through email. Resubmit data through the dbGaP <a href="#spgloss">Submission Portal</a> (<a href="https://submit.ncbi.nlm.nih.gov/dbgap/">https://submit.ncbi.nlm.nih.gov/dbgap/</a>), so that we have a formal record of your submission. </li>
<li>Update the <a href="#sdogloss">Study Data Outline (SDO)</a> in the Submission Portal to indicate new data types or remove incorrectly marked data types.</li>
<li>Submit only new or updated files. Do not resubmit unchanged files as every submitted file is compared to previously submitted files, which will add significantly to the processing time.</li>
<li>There are 3 options to update the <strong>Phenotype Component</strong> in the Submission Portal. Notify your phenotype curator of the changes.<ul>
<li>Option 1: <strong>Replace</strong> previously submitted datasets (DS) or data dictionaries (DD).</li>
<li>Option 2: <strong>Add</strong> new DS and DD pairs. Do not use the "replace" button in the Submission Portal, but rather add pairs of DS and DD.</li>
<li>Option 3: <strong>Delete</strong> previously submitted DS and DD pairs. There are no replacements.</li>
</ul>
</li>
<li>Keep resubmitted phenotype filenames the same or add the date to the existing filename, i.e. yyyymmdd (ex. 20190101). Do not submit filenames used two versions or more ago. dbGaP will crosscheck the latest file submission against the previous submission and report any unexpected changes.</li>
<li>Double check the submission by going through the phenotype QC checklist of common errors: <a href="#qcpass">Quality Control</a></li>
<li>There are 2 options to update the <strong>Molecular Data</strong> in the Submission Portal. Notify your genotype curator of the changes.<ul>
<li>Option 1: <strong>Add</strong> new data. In the Submission Portal, upload under "Other files" with type "Molecular Data".</li>
<li>Option 2: <strong>Delete</strong> previously submitted molecular data.</li>
<li>To replace, first <strong>Delete</strong> then <strong>Add</strong>.</li>
</ul>
</li>
<li>Replacing a Sequence Metadata file is only possible when the Sequence Metadata has failed validation. Otherwise, only new Sequence Metadata files can be added. New Sequence Metadata files should only include samples with new sequences that will be submitted. To replace or delete previously submitted samples or sequences, contact an SRA curator: <a href="mailto:sra@ncbi.nlm.nih.gov">sra@ncbi.nlm.nih.gov</a>. The Sequence Metadata will first need to be updated and validated before sequences can be uploaded through ASPERA.</li>
</ul>


<p><a name="adownload" id="adownload"></a>
To download a copy of the phenotype component files from the Submission Portal, you must have "Manager" permissions in the <a href="https://submit.ncbi.nlm.nih.gov/dbgap/">Submission Portal</a>. The phenotype component includes Subject Consent, Subject Sample Mapping, Subject Phenotypes, Sample Attributes, Subject/Sample to NCBI linking datasets (DS) and data dictionaries (DD).</p>


<ol>
<li>In the box on the upper right of the Submission Portal, click "Download Phenotype Files".</li>
<li>Select phenotype files.</li>
<li>Click "Download". Once the download request is initiated, the "Download" button will be disabled until the request expires.</li>
<li>You will receive two emails when your files are ready to download. This should occur within a day.<ul>
<li>Email 1 "dbGaP: phs00####.v# Phenotype Files Ready for Download"</li>
<li>Email 2 "dbGaP: phs00####.v# Passphrase"</li>
</ul>
</li>
<li>In Email 1, click on the "File download URL" link, which will take you to the encrypted TAR file.</li>
<li>Click on "Save File" to download the encrypted TAR file, i.e., <code>phs00####_v#.tar.gpg</code>. This link will expire after 72 hours. Thereafter, you will need to make a new request.</li>
<li>
<p>The TAR file will need to be decrypted in order to open. Use Windows or Unix below.</p>
<p>In Windows:</p>
<ol>
<li>Go to <a href="https://gnupg.org/download/">https://gnupg.org/download/</a></li>
<li>Find "gpg4win". Click and download the executable.</li>
<li>Open Kleopatra, a GUI app for GnuPG</li>
<li>Select "Decrypt/Verify", and navigate to the downloaded TAR file and open it.</li>
<li>When prompted, enter the passphrase from Email 2.<ul>
<li>If decryption is successful, the message "Decryption succeeded" will be displayed</li>
</ul>
</li>
<li>Enter desired output location and click "Save All".</li>
<li>Go to the output folder to view the decrypted files.</li>
</ol>
<p>In Unix:</p>
<ol>
<li>It is likely that your system already has gpg installed. If not, download and install GnuPG from <a href="https://gnupg.org/download/">https://gnupg.org/download/</a> (GnuPG Binary Releases).</li>
<li>From command line, run the following command: <code>gpg --batch --passphrase &lt; decryption_key &gt; -d &lt; downloaded_gpg_file &gt; | tar xf -</code></li>
<li>Go to the output directory to view decrypted files.</li>
</ol>
</li>
</ol>


<h2 data-heading="h2" data-no-toc="true">dbGaP Processing and Release</h2>


<p><a name="aprocessing" id="aprocessing"></a></p>


<h3 id="27-what-happens-once-i-submit-my">27. What happens once I submit my core data files and phenotype files to the dbGaP database?</h3>


<p>dbGaP curators work through the study queue in the order the study is submitted to the dbGaP <a href="#spgloss">Submission Portal</a>. Study submissions should be complete, which may include all <a href="#apheno">phenotype component files</a>, <a href="#ageno">molecular (non-sequence¹) data</a>, <a href="#aSRA">high throughput sequence data</a>, <a href="#aphd">study documents</a>, and <a href="#apha">analyses</a>. Completed study submissions can be released as soon as:</p>


<ol>
<li>dbGaP has finished processing the study;</li>
<li>If there are high throughput human sequence data, all sequences appear ready/public in the <a href="#sstrgloss">Subject Sample Telemetry Report (SSTR)</a>;</li>
<li>The registration information is consistent with the submitted data and the study registration in the <a href="#ssgloss">Submission System</a> is marked "Completed by GPA";</li>
<li>The study investigator or PI assistant has given permission to release the study. If you have additional files to submit for the release, then the study submission is incomplete and will not have priority in the processing queue.</li>
</ol>


<p>You can track your study's progress through the <a href="#ssrgloss">Study Status Report (SSR)</a>.</p>


<p><a name="aqcchecks" id="aqcchecks"></a>
<strong>QC Checks</strong></p>


<p>We offer pre-validation tools, <a href="https://github.com/ncbi/gaptools/blob/master/GaPTools.md">GaPTools</a>, that you can run on your own system to check your data before submitting to dbGaP.</p>


<p>dbGaP will run several quality control (qc) checks upon submission. </p>


<ol>
<li><strong>Automated preprocessing checks</strong> will immediately be run after submission for studies with PLINK or VCFs, Subject Consents DS, Subject Sample Mapping (SSM) DS, and Pedigree DS. The automated system will email all submitters with results from the five types of files. If one of the five types is resubmitted, the automated system will be re-run. Here is a web page showing errors and warnings that the automated system may detect: <a href="https://www.ncbi.nlm.nih.gov/gap/public_utils/messages/">https://www.ncbi.nlm.nih.gov/gap/public_utils/messages/</a>. </li>
<li>
<p><strong>Manual and scripted qc checks</strong> will be completed by the dbGaP curators of your study. The phenotype curator and genotype curator will separately report back errors detected, since the processing occurs at different times depending on the queue and the errors can be complex within each component.</p>
<ul>
<li><em>Phenotype Curation</em>: <ul>
<li>The phenotype curator coordinates the entire study release and processes the information in the Submission System registration (SS), Submission Portal (SP), Study Config, DS and DD (Subject Consent, SSM, Pedigree, Subject Phenotypes, Sample Attributes, and Sample to NCBI Database Mapping), Study Documents, and Medical Images.</li>
<li>All individual level data are split by consents.</li>
<li>The manual portion includes vetting the <a href="#sdogloss">Study Data Outline</a>, validating consents, and checking for incongruent phenotypic values and summaries.</li>
<li>Scripted qc checks look for inconsistencies between files and between all dbGaP studies, formatting errors that make loading of the datasets (DS) and data dictionaries (DD) into the dbGaP database impossible, inconsistencies between DS and DD with regard to subject consent, sex, affection status, and potential HIPAA violations.</li>
<li>See <a href="#qcpass">Question 16</a> for common errors we encounter.  </li>
</ul>
</li>
<li><em>Genotype Curation</em>:<ul>
<li>The genotype curator processes all molecular data EXCEPT for high throughput sequence data.</li>
<li>Molecular data may include SNP array, methylation, expression/epigenetic data, CNV, VCF, MAF, imputation, and other formats.</li>
<li>QC checks include sex checks, pedigree checks, and unintended duplications.</li>
<li>For data where these checks are not relevant, the data is packaged and split by consents.</li>
<li>BAM, FASTQ, CRAM files are not processed by the dbGaP genotype curator, but by the sequence pipeline.</li>
</ul>
</li>
<li><em>Combined Curation</em>:<ul>
<li>Inconsistencies between molecular data sample IDs and phenotype sample IDs, unintended data duplications, incorrect pedigree information, and subject relationships will be further checked using the dbGaP software <a href="https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/Software.cgi">GRAF</a> (<strong>G</strong>enetic <strong>R</strong>elationship <strong>a</strong>nd <strong>F</strong>ingerprinting). </li>
<li>The reports and counts in the <a href="#sstrgloss">Subject Sample Telemetry Report (SSTR)</a> will be reviewed.</li>
</ul>
</li>
</ul>
</li>
</ol>


<p><a name="alfaqc" id="alfaqc"></a>
<strong>GSR, GRAF-pop, and ALFA</strong></p>


<p>dbGaP subjects with genomic data and that have been designated "non-sensitive" for release of <a href="#gsrgloss">Genomic Summary Results (GSR)</a> in the dbGaP <a href="#ssgloss">Submission System</a> will also be analyzed using <a href="https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/Software.cgi">GRAF-pop</a> and included for the <a href="#alfagloss">ALFA</a> (<strong>Al</strong>lele <strong>F</strong>requency <strong>A</strong>ggregator) project. Studies may be contacted to correct the submitted data or provide a README if:</p>


<ol>
<li>They contain allele frequencies that deviate from the expected range of known allele frequencies for the <a href="https://www.ncbi.nlm.nih.gov/snp/docs/gsr/data_inclusion/#population">12 diverse populations</a> and/or</li>
<li>The submitted ancestry or population deviates from the computed ancestry for a large number of samples.</li>
</ol>


<p>Careful adherence to this submission guide and the emailed error reports can eliminate the need for resubmission and quicken the schedule for release.</p>


<p><strong>Splitting Files by Consents</strong></p>


<p>dbGaP will assign dbGaP-generated subject IDs and sample IDs and split the final individual level datasets (both phenotypes and genotypes) for release by consent, with the exception of the three meta study DS (Subject Consent, SSM and Pedigree). Subject IDs that have been marked as aliases will be assigned the same dbGaP subject ID. The dbGaP-generated IDs will appear in the <a href="#dumpgloss">final dump files</a>, NCBI BioSample website, and the <a href="#sstrgloss">Subject Sample Telemetry Report (SSTR)</a>. </p>


<p><a name="apreview" id="apreview"></a>
<strong>Preview</strong></p>


<p>Prior to posting your study, dbGaP will provide you with access to a preview site of your study that shows study content as it might appear on the final public dbGaP page: <a href="https://www.ncbi.nlm.nih.gov/gap">https://www.ncbi.nlm.nih.gov/gap</a>. Once all the study components have been processed and you have reviewed the preview site and the <a href="#sstrgloss">SSTR</a>, dbGaP will send an email to request the study investigator's or PI assistant's approval to release the study.</p>


<hr />


<p>¹<a href="#aSRA">Sequence data</a> (e.g. BAM, CRAM, FASTQ) should be submitted only after: 1) you have received an email with an attached sequence metadata file containing the registered subject and sample IDs, and consents. This process ensures that submitted sequences are tied to sample IDs that belong to consented subjects. 2) The sequence metadata has been processed and you have received an email to upload sequences.</p>


<p><a name="arelease" id="arelease"></a></p>


<h3 id="28-when-and-what-will-be-release">28. When and what will be released?</h3>


<p>The release occurs approximately 6-8 weeks following receipt of final datasets that are without error. If there are errors, the processing time will increase. The study registration in the <a href="#ssgloss">Submission System</a> must be marked "Completed by GPA". Once the study investigator or PI assistant and dbGaP approve of the posting of the study, it will be released in 2-3 business days to the following sites.</p>


<blockquote>
<p>Public dbGaP page (<a href="https://www.ncbi.nlm.nih.gov/gap">https://www.ncbi.nlm.nih.gov/gap</a>) – includes a study report page, public summary phenotype variables and datasets, molecular data summary, study documents, analyses browser, and indexing of various study terms for users to search and filter for studies. When your study becomes publicly available, the URL will appear like <a href="https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs000001.v3.p1">https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs00####.v#.p#</a>, where the last part of the URL is the <a href="#phsgloss">study accession number</a>.</p>
<p>Public FTP site (<a href="https://ftp.ncbi.nlm.nih.gov/dbgap/studies/">https://ftp.ncbi.nlm.nih.gov/dbgap/studies/</a>) – features the study manifest (a list of all released files), study configuration (a list of how the study is configured in the Authorized Access system), release notes (summarizes the data that has been released and any changes since the last version), summary statistics of phenotype variables, phenotype data dictionaries, study documents, and analyses aka genomic summary results (truncated, gene-level, and/or summary level).</p>
<p>Authorized Access portal (<a href="https://dbgap.ncbi.nlm.nih.gov/aa/wga.cgi?page=login">https://dbgap.ncbi.nlm.nih.gov/aa/wga.cgi?page=login</a>) – this is the management portal for individual-level data. This site can be used to submit a data access request, manage access requests, and download approved datasets.</p>
</blockquote>


<p><strong>What if I have a paper publication or must meet a specific release date?</strong></p>


<p>If you need to schedule a study release to coincide with a publication (e.g. hold the study until a certain date, try to complete study processing by a certain date), communicate to dbGaP the specific date and/or at least a general time frame as soon as you know it. dbGaP will work with you to accommodate your release schedule whenever possible.</p>


<p><strong>How often can dbGaP release my study?</strong></p>


<p>A dbGaP study can be released quarterly at most. Finalized data, that is data without error, must be submitted 6-8 weeks in advance for qc checks and processing. Please contact us if we need to work out a release schedule.</p>


<p><strong>What should I do if I need my study accession public before the data has been processed?</strong></p>


<p>If a publication requires that the study is public on the dbGaP page, please let us know and we can release the study report page in advance. The phenotype and molecular data can then be released at a later time.</p>


<p><strong>Can an embargo date be applied?</strong></p>


<p>There are no longer publication embargo dates. See <a href="https://osp.od.nih.gov/scientific-sharing/genomic-data-sharing-faqs/">https://osp.od.nih.gov/scientific-sharing/genomic-data-sharing-faqs/</a>. However, if you need dbGaP to postpone a study from release until a certain date, please confer with the PO and <a href="#gpagloss">GPA</a> assigned to your dbGaP study to agree on a date of release (only weekdays). Once a date has been decided, please email the dbGaP phenotype curator along with the PO and GPA to let us know the agreed upon date.</p>


<h3 id="29-whom-may-i-contact-with-quest">29. Whom may I contact with questions about my dbGaP data submission?</h3>


<p>General dbGaP questions and Authorized Access questions: <a href="mailto:dbgap-help@ncbi.nlm.nih.gov">dbgap-help@ncbi.nlm.nih.gov</a></p>


<p>dbGaP <a href="#spgloss">Submission Portal</a> questions: <a href="mailto:dbgap-sp-help@ncbi.nlm.nih.gov">dbgap-sp-help@ncbi.nlm.nih.gov</a></p>


<p>Phenotype and molecular data questions, please contact the assigned study curator(s).</p>


<ul>
<li>Phenotype curators: study config, IDs, consents, phenotype data, study documents, medical images, study release schedules, etc.</li>
<li>Genotype curators: all molecular data EXCEPT for high throughput sequence related questions</li>
<li>Sequence curators: high throughput sequence data. Contact <a href="mailto:sra@ncbi.nlm.nih.gov">sra@ncbi.nlm.nih.gov</a></li>
</ul>


<p>dbGaP Team Lead: Michael Feolo <a href="mailto:feolo@ncbi.nlm.nih.gov">feolo@ncbi.nlm.nih.gov</a> </p>


<p><a name="aversions" id="aversions"></a></p>


<h2 data-heading="h2" data-no-toc="true">dbGaP Versions</h2>


<h3 id="30-how-can-i-submit-additional-d">30. How can I submit additional data after my study is released?</h3>


<p>Once your study is released, it is a historical record in the dbGaP database. If you would like to submit new data or update existing data (correct, remove, or add rows or columns of data), you will need to create a new version of your study. This means that the study accession of your study will be updated, e.g., phs000024.v1.p1 to phs000024.v2.p?, where the version number (v#) will increment by one and the participant set number (p#) will increment by one if subjects have been retired or have moved from one consent group to another. If only new subjects have been added, the p# will not be incremented. Once a new version of a study is released, the prior version will no longer be available for download. The new version will encompass all files from the previous version and any newly submitted data.</p>


<p>For new versions of a study, we ask users to continue to follow the guidelines in this Submission Guide. Repeated formatting errors will increase processing time. More importantly, if the data is inconsistent (for example, IDs do not match the prior version, counts between files do not match, or reported sex values do not match genotyped sex values), it will take substantially longer to process each iteration of the new version. Double check the submission by going through the checklist of common errors: <a href="#qcpass">Quality Control</a></p>


<p>If you need a copy of the phenotype component files that were last uploaded to the Submission Portal, please see the download instructions: <a href="#adownload">here</a>. Please note that these files may have been modified for release, so if you use these files for updates, please make sure to incorporate the prior changes.
If you are an investigator who would like to access your own data after your study is released, you will need to have your GPA register you as an "Investigator with <a href="#streamlinedgloss">Streamlined Access</a>".</p>


<p>For Submission Portal issues, email <a href="mailto:dbgap-sp-help@ncbi.nlm.nih.gov">dbgap-sp-help@ncbi.nlm.nih.gov</a>.<br />
For data specific questions, email the assigned phenotype or genotype curator.<br />
Submission Portal: <a href="https://submit.ncbi.nlm.nih.gov/dbgap/">https://submit.ncbi.nlm.nih.gov/dbgap/</a></p>


<ol>
<li>To create a new version, go to the dbGaP <a href="#spgloss">Submission Portal</a> and complete the <a href="#sdogloss">Study Data Outline (SDO)</a>. If you only want to edit the Study Config of the released study, DO NOT complete the SDO, rather contact your phenotype curator.</li>
<li>Once a new version is created, your <a href="#gpagloss">Genomic Program Administrator (GPA)</a> will be notified. The GPA will need to complete the registration in the dbGaP <a href="#ssgloss">Submission System</a> for the new version. Any consent changes should be provided to the GPA and those consents should be reflected in the Submission System as early as possible. Since all processed files are packed by consents, any changes in consents after processing will require your study to be reprocessed and significantly delay your study release. Please also contact your GPA for Acknowledgment Statement changes.</li>
<li><strong>Phenotype Component</strong> <ol>
<li>Do not submit files that have been submitted previously and are unchanged. This will add significant time to your processing.</li>
<li>Update the <a href="#aconfig">Study Config</a> so that it is cumulative and describes all versions of the study.</li>
<li>The Subject Consent (SC) files, Subject Sample Mapping (SSM) files, and Pedigree files should always be <em>cumulative</em>, e.g., all subject and sample IDs included in version 1 should be included in the version 2 SC, SSM and pedigree files. If a subject or sample ID is not included, dbGaP will mark the subject or sample ID as retired and the data will no longer be available in the new version. High throughput sequences belonging to retired samples will also be removed. </li>
<li>For Subject Phenotypes and Sample Attributes datasets (DS), all subjects listed must be consented in the SC and all samples must belong to consented subjects in the SSM. dbGaP will not concatenate multiple datasets into a single dataset. When adding data, consider how users might best use the data -- should the data from all versions be in a single Subject Phenotypes DS and single Sample Attributes DS or split into many Subject Phenotypes DS and Sample Attributes DS? For more guidance on whether to update a previously submitted dataset or add a brand new dataset, see "<a href="#lotspheno">Can I submit multiple subject phenotypes DS files?</a>" and "<a href="#lotsattr">Can I submit multiple sample attributes DS files?</a>". </li>
<li>There are 3 options to update the phenotype components in the Submission Portal. Notify your phenotype curator of the changes.<ul>
<li>Option 1: <strong>Replace</strong> previously released datasets (DS) or data dictionaries (DD). These DS or DD will be cumulative including subjects, samples, and variables from prior versions. Any subjects, samples, or variables removed from these DS and DD will be considered retired. The phenotype table accession (pht#) will remain the same, and the pht version will be incremented.</li>
<li>Option 2: <strong>Add</strong> new DS and DD pairs. Previously released subject phenotypes and sample attributes datasets will be kept, and there will be additional DS and DD added. Do not use the "replace" button in the Submission Portal, but rather add pairs of datasets (DS) and data dictionaries (DD). New phenotype table accessions (pht#) will be assigned.</li>
<li>Option 3: <strong>Delete</strong> previously released DS and DD pairs. There are no replacements. The phenotype table accession (pht#) will be retired for this version.</li>
</ul>
</li>
</ol>
</li>
<li><strong>Molecular Data</strong><ol>
<li>Submit molecular data with your phenotype component submission, so that this component enters the genotype curator's queue as soon as possible.</li>
<li>There are 2 options to update molecular data in the Submission Portal. Notify your genotype curator of the changes.<ul>
<li>Option 1: <strong>Add</strong> new data. In the Submission Portal, upload under "Other files" with type "Molecular Data".</li>
<li>Option 2: <strong>Delete</strong> previously released molecular data.</li>
<li>To replace previously released molecular data, first <strong>Delete</strong> then <strong>Add</strong>. The genotype accession (phg#) version will be incremented.</li>
</ul>
</li>
<li>If consents have been updated, a genotype curator will re-split the molecular data files according to the new consents, so that you will not need to resubmit for consent updates.</li>
<li>Review the instructions for the Molecular Data section: <a href="#ageno">here</a>.</li>
</ol>
</li>
<li><strong>Sequence Data</strong><ol>
<li>Do not submit sequences (BAM, CRAM, FASTQ) until you have been sent a Sequence Metadata file. You will be emailed a Sequence Metadata file if you selected "yes" for Sequence Data in the <a href="#sdogloss">Study Data Outline (SDO)</a> and once the IDs and consents from the Subject Consent and Subject Sample Mapping datasets have been processed and loaded.</li>
<li>The Sequence Metadata file should only include samples with new sequences that will be submitted. To replace or delete previously submitted sequences, contact an SRA curator: <a href="mailto:sra@ncbi.nlm.nih.gov">sra@ncbi.nlm.nih.gov</a></li>
<li>Fill out the Sequence Metadata and upload to the Submission Portal. The Sequence Metadata file will be validated, and you will be sent a second email to upload sequences.</li>
<li>Review the instructions for High Throughput Sequence Data section: <a href="#aSRA">here</a>.</li>
</ol>
</li>
<li>Retain the format and corrections that were made in the previous version following the Submission Guide. Remaking the same changes will take additional time.<ol>
<li>Check that variable names in the Dataset and the matching Data Dictionary are identical in spelling, i.e. have the same number of spaces, same case, etc.</li>
<li>Check that every variable has a variable description. Check that coded values in the Dataset have code meanings listed in the Data Dictionary.</li>
<li>Check that the sex of a subject remains consistent throughout a single study. If the sex has been changed as a result of a correction, please let dbGaP know via email.</li>
<li>Check that the case control status of a subject remains consistent throughout a single study.</li>
<li>Check that all subjects have been assigned a consent group.</li>
<li>Check that the existing subject and sample ID mappings remain the same between versions, unless there is an error and an ID needs to be remapped. In case of ID remapping, please let dbGaP know which IDs need to be remapped.</li>
<li>Check that all samples are mapped to a subject and therefore to a consent group.</li>
<li>Check that the data files contain the values you expect. Check for truncated values. Compare new files to the final files submitted for the previous version to check for differences and to make sure all changes are intended. If you need more information regarding which files were incorporated into the final release of the previous version, please make a request to dbGaP.</li>
</ol>
</li>
<li>To help us better understand the new version, please let your phenotype curator know:<ol>
<li>How many new subjects have been added? Note: Subjects refer to a person. A person can have many samples.</li>
<li>How many subjects have been deleted? To protect subject identity, if only 1 subject (person) is being deleted or added and there are no additional changes, we ask that either additional subjects are added or 1 additional subject is retired. This minimizes the possibility of comparing variable summaries between versions and identifying the phenotypes for that 1 person.</li>
<li>How many subjects have changed consent groups?</li>
<li>How many samples have been added?</li>
<li>How many samples have been deleted? </li>
<li>How many samples have been remapped to different subjects? List.</li>
<li>Have any samples and subjects been renamed? If yes, provide a 4 column table with the column headers: Old Sample, New Sample, Old Subject, New Subject. If only the samples are being renamed, then provide only the first 2 columns. Submit to the <a href="#spgloss">dbGaP Submission Portal</a> under "Other files" with Type "Special".</li>
<li>Are there updates to the phenotype component? New, replaced, or deleted datasets or variables? Have variables been renamed?</li>
<li>Are there updates to the molecular data? New, replaced or deleted files? What type of molecular data is being added? </li>
<li>Are there updates to the sequence data (BAM, CRAM, FASTQ)? What type of sequence data is being added (WGS, WXS, targeted, etc.)?</li>
<li>Are there any other updates we should be aware of?</li>
</ol>
</li>
</ol>


<p><a name="glssry" id="glssry"></a></p>


<h2 id="glossary-of-terms">GLOSSARY OF TERMS</h2>


<p><a name="aagloss" id="aagloss"></a>
<strong>Authorized Access (AA)</strong></p>


<p>Authorized Access, (<a href="https://dbgap.ncbi.nlm.nih.gov/aa/wga.cgi?page=login">https://dbgap.ncbi.nlm.nih.gov/aa/wga.cgi?page=login</a>), is the management portal for individual-level data. This site can be used to submit a <a href="#dargloss">Data Access Request (DAR)</a>, manage access requests, and download approved datasets.</p>


<p><a name="alfagloss" id="alfagloss"></a>
<strong>ALFA</strong></p>


<p>NCBI's Allele Frequency Aggregator (ALFA) pipeline computes allele frequencies for variants in dbGaP across approved unrestricted studies and provides the data as open-access to the public through <a href="https://www.ncbi.nlm.nih.gov/snp/">dbSNP</a>. Studies must be registered as <a href="#gsrgloss">GSR</a> insensitive in order to be included in <a href="https://www.ncbi.nlm.nih.gov/snp/docs/gsr/alfa/">ALFA</a>.</p>


<p><a name="collectiongloss" id="collectiongloss"></a>
<strong>Collection</strong></p>


<p>A dbGaP Collection is a virtual study under which other studies are grouped. A dbGaP Collection provides streamlined access to data across dbGaP studies or portions of dbGaP studies that share the same consent group, disease, or funding project. Data access for a collection is controlled by a single data access committee. The data in a collection is not harmonized across studies or otherwise altered from the original study. Investigators using data within a dbGaP collection are required to follow the use restrictions and acknowledgement instructions from the original dataset.</p>


<p>To search for dbGaP Collections, go to <a href="https://www.ncbi.nlm.nih.gov/gap/advanced_search/?OBJ=study&amp;COND=%7B%22is_host_of_collection%22:%5B%22yes%22%5D%7D">dbGaP Advanced Search</a>.</p>


<p>To create a new dbGaP Collection, see <a href="/gap/docs/specialstudies#CollectionSpecial">Special Studies</a>.</p>


<p><a name="consentgloss" id="consentgloss"></a>
<strong>Consent</strong></p>


<p><em>Study-level</em><br />
Consents in dbGaP refer to the <em>NIH Standard Data Use Limitations (DULs)</em> or how the study's data can be used in the future. One might consider if there will be additional data types or modalities generated and added to your dbGaP study. When filling out the Institutional Certification (IC), the DULs should be determined by your Institutional Review Board (IRB) or equivalent body based on your study's informed consent. These DULs are then registered in the <a href="#ssgloss">Submission System</a> by your study's <a href="#gpagloss">Genomic Program Administrator (GPA)</a> as consent group titles and abbreviations.</p>


<p>See <em>NIH Standard Data Use Limitations</em>: <a href="https://sharing.nih.gov/genomic-data-sharing-policy/institutional-certifications/completing-an-institutional-certification-form#step-5">https://sharing.nih.gov/genomic-data-sharing-policy/institutional-certifications/completing-an-institutional-certification-form#step-5</a></p>


<p><em>Individual-level data</em><br />
Each person (subject) should belong to a single consent group within a dbGaP study. Therefore, all samples of a person in the study are also associated with the same consent group. If a subject belongs to two or more consent groups in your study, pick the more stringent of the two consent groups, so that each person belongs to a single consent group. If you are the study investigator, you can see the consent groups in the <a href="#ssgloss">dbGaP Submission System</a>. If you are a submitter, you can see the consent groups in the dbGaP <a href="#spgloss">Submission Portal</a> for your study by clicking "View consent group" in the box on the upper right. dbGaP <a href="#aagloss">Authorized Access</a> users request studies by consent. For questions regarding the registered consent group and DUL, please contact your <a href="#gpagloss">GPA</a>. </p>


<p>See the NIH Guidance for Genomic Sharing Plan: <a href="https://sharing.nih.gov/genomic-data-sharing-policy/developing-genomic-data-sharing-plans">https://sharing.nih.gov/genomic-data-sharing-policy/developing-genomic-data-sharing-plans</a> </p>


<p>A study should be designated with at least one NIH consent group title.</p>


<blockquote>
<table>
<thead>
<tr>
<th>Consent Group Titles</th>
<th>Consent Group Abbreviations</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>General Research Use</td>
<td>GRU</td>
<td>Use of the data is limited only by the terms of the Data Use Certification.</td>
</tr>
<tr>
<td>Health/Medical/Biomedical</td>
<td>HMB</td>
<td>The dataset can only be used for studying health, medical or biomedical conditions, and does not include the study of population origins or ancestry.</td>
</tr>
<tr>
<td>Disease-Specific (Disease/Trait/Exposure)</td>
<td>DS-xxx</td>
<td>The dataset can be used only for research on a specific disease or related condition.</td>
</tr>
</tbody>
</table>
</blockquote>


<p>Additional modifiers can be added if applicable.</p>


<blockquote>
<table>
<thead>
<tr>
<th>Consent Group Limitations</th>
<th>Consent Group Abbreviations</th>
<th>Description</th>
</tr>
</thead>
<tbody>
<tr>
<td>IRB approval required</td>
<td>IRB</td>
<td>The requesting institution's IRB or equivalent body must approve the requested use.</td>
</tr>
<tr>
<td>Publication required</td>
<td>PUB</td>
<td>The requestor must share their results with the larger scientific community.</td>
</tr>
<tr>
<td>Collaboration required</td>
<td>COL</td>
<td>The requestor must provide a letter of collaboration with the primary study investigator(s).</td>
</tr>
<tr>
<td>Not-for-profit use only</td>
<td>NPU</td>
<td>The dataset can only be used by not-for-profit organizations. State specifically if the data should not be made available to commercial organizations.</td>
</tr>
<tr>
<td>Methods</td>
<td>MDS</td>
<td>The dataset can be used for methods research and development (e.g., development of statistical software or algorithms).</td>
</tr>
<tr>
<td>Genetic studies only</td>
<td>GSO</td>
<td>The dataset can be used only for genetic studies.</td>
</tr>
</tbody>
</table>
</blockquote>


<p>For example, a study might have two consent groups: 1) General Research Use with IRB approval and Not-for-profit use and 2) General Research Use. Therefore, a subset of the subjects would have the GRU-IRB-NPU designation, while the remaining subjects would be GRU. There should be no overlapping subjects between the two consent groups.</p>


<p>Note: "Other" may be selected when it is definitive that no standardized consent group and modifier listed above can be used as the data use limitation of a study. "Other" is not an official designation and should not be used as the Consent Group Title or Abbreviation. The GPA and PI should determine a Consent Group Title and Abbreviation that best represents the data use limitation. Since Abbreviations are used in file names, and file names have character limits, please choose a concise Abbreviation. </p>


<p><a name="dacgloss" id="dacgloss"></a>
<strong>Data Access Committee (DAC)</strong></p>


<p>What is the DAC? NIH Data Access Committees (DACs) review requests to access data in the Database of Genotypes and Phenotypes (dbGaP). See <a href="https://osp.od.nih.gov/scientific-sharing/data-access-request-dar-approvals-and-disapprovals-by-data-access-committee-dac/">https://osp.od.nih.gov/scientific-sharing/data-access-request-dar-approvals-and-disapprovals-by-data-access-committee-dac/</a></p>


<p>DAC Chairs and Emails: <a href="https://sharing.nih.gov/genomic-data-sharing-policy/resources/contacts-and-help#gds_support">https://sharing.nih.gov/genomic-data-sharing-policy/resources/contacts-and-help#gds_support</a></p>


<p>Genomic Data Sharing (GDS): <a href="https://sharing.nih.gov/genomic-data-sharing-policy">https://sharing.nih.gov/genomic-data-sharing-policy</a></p>


<p><a name="dargloss" id="dargloss"></a>
<strong>Data Access Request (DAR)</strong></p>


<p>How to make a Data Access Request: <a href="https://sharing.nih.gov/accessing-data/accessing-genomic-data/how-to-request-and-access-datasets-from-dbgap">https://sharing.nih.gov/accessing-data/accessing-genomic-data/how-to-request-and-access-datasets-from-dbgap</a></p>


<p>The Signing Official should confirm that the individual listed as the <a href="#itdirgloss">IT Director</a> has a background in computer security, has institutional (and not just departmental) authority, and can confirm that your institution has the capacity to protect shared data and will comply with the NIH Genomic Data Sharing Policy.</p>


<p>dbGaP Data Access and Use Report: <a href="https://ncbi.nlm.nih.gov/projects/gap/cgi-bin/DataUseSummary.cgi">https://ncbi.nlm.nih.gov/projects/gap/cgi-bin/DataUseSummary.cgi</a></p>


<p><a name="dulgloss" id="dulgloss"></a>
<strong>Data Use Limitations (DUL)</strong></p>


<p><a href="https://sharing.nih.gov/genomic-data-sharing-policy/institutional-certifications/completing-an-institutional-certification-form#step-5">https://sharing.nih.gov/genomic-data-sharing-policy/institutional-certifications/completing-an-institutional-certification-form#step-5</a></p>


<p>See <a href="#consentgloss">Consents</a> for examples.</p>


<p><a name="phsgloss" id="phsgloss"></a>
<strong>dbGaP Accession Numbers</strong></p>


<p><em>Study Accession Number</em> - Once the Study Data Outline (SDO) is completed, a study accession is assigned: <strong>phs######.v#.p#</strong>.
The study accession is a unique, stable, and versioned identifier (ID) that can be used in publications. It is prefixed by "phs," indicating a phenotype study.</p>


<p>The version number (.v#) and participant set number (.p#) do not change during iterations within a release cycle; they change only following a release, and only after changes have been made to existing data or new data has been added. The Study v# is always incremented, while the v# for each of its components is only incremented when there are changes to that specific component. The p# is incremented when subjects in an existing study set change consent status. The p# is never incremented when only new subjects are added and existing subjects have not changed consents. </p>


<p><em>Dataset Accession Number</em> - Each phenotype table (SC, SSM, pedigree, subject phenotypes, and sample attributes) is assigned a pht######.v#.</p>


<p><em>Variable Accession Number</em> - Each variable in a phenotype table (SC, SSM, pedigree, subject phenotypes, and sample attributes) is assigned a phv########.v#.</p>


<p><em>Document Accession Number</em> - Each study document (e.g. protocols, questionnaires, manuals of procedures and operations) is assigned a phd######.#, where .# is the version number.</p>


<p><em>Molecular Data Accession Number</em> - Each grouping of molecular data is assigned a phg######.v#.</p>


<p><em>Analysis Accession Number</em> - Each analysis is assigned a pha#######.v#.</p>


<p><a name="dummyidgloss" id="dummyidgloss"></a>
<strong>Dummy IDs</strong></p>


<p>Dummy IDs are IDs created by the submitter to fill in unknown mother and father IDs when establishing a sibling relationship in the pedigree file. It is important that the dummy ID for the mother and father ID be unique. It is assumed that the dummy mother ID and father ID are identical for full sibling pairs. </p>


<p><a name="dumpgloss" id="dumpgloss"></a>
<strong>Dump Files</strong></p>


<p>Dump files is the term used to describe the individual-level phenotype data (SC, SSM, pedigree, subject phenotypes, and sample attributes) generated and distributed through controlled access. Dump file names have the study accession (phs), table accession (pht), a short dataset name, and consent designations. Each file has variable accessions and dbGaP-assigned subject IDs and/or sample IDs in addition to data submitted. The SSM dataset dump file also has BioSample IDs. </p>


<p><a name="edsgloss" id="edsgloss"></a>
<strong>External Data Source (EDS)</strong></p>


<p>External Data Source (EDS) is a non-dbGaP entity that is a public or private, national or international organization that is able to meet core NIH standards for establishing data quality and data management service protocols for NIH, based on the programmatic need of an NIH funding Institute or Center (IC). EDS are NIH Institute and Center Supported Repositories. Studies with data in the EDS will require credentialed users to apply for access to the data through dbGaP Authorized Access. For more information, see NIH's Scientific Data Sharing page: <a href="https://sharing.nih.gov/data-management-and-sharing-policy/sharing-scientific-data/repositories-for-sharing-scientific-data">https://sharing.nih.gov/data-management-and-sharing-policy/sharing-scientific-data/repositories-for-sharing-scientific-data</a>.<br />
To create a dbGaP study with an EDS, see <a href="/gap/docs/specialstudies#EDSSpecial">Special Studies</a>.</p>


<p><a name="gpagloss" id="gpagloss"></a>
<strong>Genomic Program Administrator (GPA)</strong></p>


<p>GPAs work with investigators to facilitate study registration and data submission for controlled-access data repositories. Each NIH IC has designated one or more senior staff as GPAs to support genomic data sharing implementation activities.
<a href="https://sharing.nih.gov/genomic-data-sharing-policy/resources/contacts-and-help#gds_support">https://sharing.nih.gov/genomic-data-sharing-policy/resources/contacts-and-help#gds_support</a></p>


<p><a name="gsrgloss" id="gsrgloss"></a>
<strong>Genomic Summary Results (GSR)</strong></p>


<p><a href="https://osp.od.nih.gov/wp-content/uploads/What_are_Genomic_Summary_Results.pdf">https://osp.od.nih.gov/wp-content/uploads/What_are_Genomic_Summary_Results.pdf</a><br />
<em>GSR Sensitive</em> - This is a designation made by the <a href="#gpagloss">GPA</a> in the <a href="#ssgloss">Submission System</a> when a study is registered. The full analysis results are available for download through the dbGaP Authorized Access System upon approval of the Data Access Request (DAR). The publicly available analysis results on the public FTP are truncated (top hits) and have potentially identifiable information (frequencies and direction of effect) redacted.<br />
<em>GSR Insensitive</em> - This is a designation made by the <a href="#gpagloss">GPA</a> in the <a href="#ssgloss">Submission System</a> when a study is registered. Both full (unredacted) association results with frequency of alleles/direction of effect and top hits are available on the public FTP site.</p>


<p><a name="grafgloss" id="grafgloss"></a>
<strong>GRAF</strong></p>


<p>GRAF (<strong>G</strong>enetic <strong>R</strong>elationship <strong>a</strong>nd <strong>F</strong>ingerprinting) is a C++ program that quickly finds closely related subjects using SNP genotype data. Access GRAF at <a href="https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/Software.cgi">https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/Software.cgi</a></p>


<p><a name="hipaagloss" id="hipaagloss"></a>
<strong>HIPAA - algorithm to detect HIPAA sensitive dates</strong></p>


<ol>
<li>Two 1 or 2-digit numbers and a 2 or 4-digit number, in this order, separated by "/", "-" or ".", e.g., "3/5/1994" or "12-28-03".</li>
<li>One 4-digit number and two 1 or 2-digit numbers separated by "/", "-" or ".", e.g., "1994.2.13".</li>
<li>A 1 or 2-digit number and a 4-digit number starting with 19 or 20 separated by "/", e.g., "10/1994" (but not "10.1994").</li>
<li>A 1 or 2-digit number followed by a "/" and a 2-digit number starting with 0, e.g., "3/04" (but not "3/94").</li>
<li>A month name or abbreviation and a 1, 2, or 4-digit number, in either order, separated by some non-letter, non-number characters or not separated, e.g., "JAN '93", "FEB64", "May 3rd" (but not "May be 14").</li>
<li>A 6-digit number is considered to be a potential date value if its first four digits make a valid date in mmdd format (i.e., first two digits read as month and second two as day of the month). For example, 112876 is considered to be a potential date value since 1128 is a valid date (Nov. 28) in mmdd format; 231208 or 113198 is not a potential date since 23/12 or 11/31 is not a valid date in month/day format. If all of the values, or first 10 values, of a variable are 6-digit potential dates, this variable together with its potential date values will be reported by the scripts.</li>
<li>An 8-digit number is considered to be a potential date value if it makes a valid date in the 20th or 21st century in either mmddyyyy or yyyymmdd format. For example, "19940822" is considered to be a potential date since it can be read as 1994/08/22 (Aug. 22, 1994). "10312005" is a potential date value since it can be read as 10/31/2005 (Oct. 31, 2005). "19080230" is not considered to be a potential date since neither 1908/02/30 nor 19/08/0230 is a valid date in the 20th or 21st century. If all of the values or the first 10 values of a variable are 8-digit numbers of potential date values, the variable will be reported as containing potential HIPAA violations.</li>
</ol>


<p>In addition to date values, the QC scripts also report data values that look like social security numbers (e.g., "123-45-6789" or "123456789"), phone numbers (e.g., "321-456-7890" or "(301)456-7890"), zip codes (e.g., "MD 20892"), etc. A few cases of this kind of sensitive information have been detected by the QC scripts. However, other cases like names of people are not reported by the QC scripts. A few cases of names of patients and providers have been detected by visual inspection.</p>


<p><a name="instcertgloss" id="instcertgloss"></a>
<strong>Institutional Certification (Institution Cert)</strong></p>


<p><a href="https://sharing.nih.gov/genomic-data-sharing-policy/institutional-certifications">https://sharing.nih.gov/genomic-data-sharing-policy/institutional-certifications</a></p>


<p><a name="itdirgloss" id="itdirgloss"></a>
<strong>IT Director</strong></p>


<p>The IT Director is a person who has the institutional (and not just a department) authority and can confirm that your institution has the capacity to protect shared data, and will comply with NIH Genomic Data Sharing Policy. The IT Director should have a background in computer security and should not be the same person as the PI, any of the collaborators, the Signing Official, or the IRB review board. For example, your Chief Information Officer would be appropriate.</p>


<p><strong>Login</strong></p>


<p>The <a href="/gap/docs/login">Login Guide for dbGaP PIs and Submitters for dbGaP Submission System and Submission Portal</a> addresses login through the Authenticator App, NIH smart card, eRA Commons account, and other third-party accounts. There is an FAQ that includes common questions, including merging multiple accounts and webpage error messages.</p>


<p><a name="parentchildgloss" id="parentchildgloss"></a>
<strong>Parent-Child Study / Umbrella Study and Substudies / Cohort</strong></p>


<p>See <a href="/gap/docs/specialstudies#ParentChildSpecial">Special Studies</a></p>


<p><a name="piassistgloss" id="piassistgloss"></a>
<strong>PI Assistant</strong></p>


<p>The study investigator may designate an individual to be the PI Assistant in the <a href="#ssgloss">dbGaP Submission System</a>. This individual will have "manager" and "submitter" permissions in the dbGaP <a href="#spgloss">Submission Portal</a> and will be the primary contact for dbGaP. This individual will be able to provide final approval for the study release. </p>


<p><a name="sampidgloss" id="sampidgloss"></a>
<strong>SAMPLE_ID</strong></p>


<p>A dbGaP Sample is defined as the ID of the final preps submitted to dbGaP by a genotyping center, runs from high throughput sequencing by a sequencing group, or data submitted to an NCBI resource, such as GEO or GenBank. A single subject may be mapped to multiple samples, but a single sample should not be mapped to multiple subjects unless the samples are pooled.* For example, if one subject (SUBJECT_ID) provided one sample, and that sample was processed to generate 2 sequencing runs or 1 sequencing and 1 genotyping array run, the data file would show two rows, both using the same subject ID, but having 2 unique sample IDs.</p>


<p>*Please inquire about pooled samples if applicable. This would only apply to pooled samples that belong to consented subjects. If the samples are pooled from controls that are publicly available, there is no need for marking the pooled samples, and a single sample ID may be assigned.</p>


<p>Each sample should be submitted with a single, unique, de-identified sample ID. Sample IDs should be an integer or string value. Integers should not have zero padding. IDs should not have spaces. Specifically, only the following characters can be included in the ID: English letters, Arabic numerals, period (.), hyphen (-), underscore (_), at symbol (@), and the pound sign (#). Once a variable name for the sample ID has been chosen, please use the same variable name throughout all the phenotype files for consistency. For example, please do not use SAMPLE_ID in one file and SAMPLE_NAME in another file. Please also do not use "dbGaP" in your submitted ID name, since dbGaP will assign a dbGaP sample ID that will be included in the final dump files along with the submitted sample ID.</p>


<p><a name="streamlinedgloss" id="streamlinedgloss"></a>
<strong>Streamlined Access</strong></p>


<p>Study investigators can be granted "Investigators with streamlined access" in the <a href="#ssgloss">Submission System</a> where the study is registered. The system will automatically create a project for the submitting investigators in <a href="#aagloss">Authorized Access</a>. This project will not be provisioned with a DAR, or require SO or DAC approval. It will also not expire. </p>


<p><a name="sdogloss" id="sdogloss"></a>
<strong>Study Data Outline (SDO)</strong></p>


<p>The Study Data Outline (SDO) is filled out by the submitter in the <a href="#spgloss">Submission Portal</a> and informs dbGaP curators what data is expected to be uploaded and released for the current study version. The SDO is based on the <a href="#achecklist">File Submission Checklist and File Applicability</a>. The SDO must be filled out to obtain the study accession (phs#) or create a new version after release. The study accession can be used in publications. After the SDO is submitted, a submitter may begin to create/edit the study config and submit data. To edit the SDO, go to the box on the upper right and click on "Study data outline". A few of the common questions we have after reviewing the SDO are whether VCFs called from sequence data will be submitted and whether expression counts from RNASeq will be submitted. </p>


<p><strong>Study Registration</strong></p>


<p><a href="https://osp.od.nih.gov/scientific-sharing/study-registration-and-data-submission-to-an-nih-designated-controlled-access-data-repository/">https://osp.od.nih.gov/scientific-sharing/study-registration-and-data-submission-to-an-nih-designated-controlled-access-data-repository/</a></p>


<p><a name="studystatusgloss" id="studystatusgloss"></a>
<strong>Study Statuses</strong></p>


<p><u><a href="#ssgloss">Submission System (SS)</a> Statuses</u></p>


<p><em>Incomplete</em> - This status denotes that the registration in the SS is missing key items, such as names, emails, Institutional Certifications (ICs), consents, acknowledgements, etc.<br />
<em>Awaiting GPA's Approval</em> - This status denotes that the registration in the SS has been filled out, but not yet approved by the GPA. When version 2 and later are created, the system is automatically set to "Awaiting GPA's Approval". The GPA can modify or accept existing entries. Changes to consents can significantly delay the study release.<br />
<em>Review by PI</em> - This status denotes that the registration in the SS is awaiting review by the PI.<br />
<em>Completed by GPA</em> - This status denotes that the registration in the SS has been completed by the GPA. For processing, the admin IC and consents must be finalized. Changes to the consents once processing begins can significantly delay the study release.<br />
<em>Deleted</em> - This status denotes that the study was once registered in the SS but now no longer should be processed for release.<br />
<em>Release Postponed</em> - For substudies that are registered but not to be released with the current version of Parent-Child studies, the status will be marked postponed. We strongly suggest not registering substudies ahead, but to register them when they can be released in the parent version that they were registered in.<br />
Once the study is released, the SS will take on the Authorized Access Statuses below.</p>


<p><u>Study Components Statuses</u></p>


<p><em>In Queue</em> - The study queue is ordered by submission date. For studies that resubmit, we will prioritize the first iteration. Thereafter, the study will be worked on in the order it has been submitted.<br />
<em>In Process</em> - A curator is completing <a href="#aqcchecks">manual and automated QC checks</a>, validating consents and data values for consistency, correcting errors, adding search features, and packaging files by consents.<br />
<em>Waiting on Submitter</em> - A curator is waiting on submitters to submit data, correct reported issues, modify or submit additional data, or reconcile consents.<br />
<em>Preview</em> - This status is used solely for the phenotype component to denote that the study is on the dbGaP preview site and needs to be reviewed by the submitters.<br />
<em>Completed</em> - This status denotes that a component (phenotypes, molecular data, high throughput sequences) has passed manual and automated QC checks, been validated, and been split by consents.<br />
<em>Postponed</em> - This status denotes that the study is completed and ready for release, but is postponed to accommodate publication schedules and other agreements between the funding agency and the study. No new data or modifications should be submitted during this time.</p>


<p><u><a href="#aagloss">Authorized Access (AA)</a> Statuses</u></p>


<p><em>Released</em> - A released study is available on <a href="https://www.ncbi.nlm.nih.gov/gap/">public pages</a>, through <a href="#aagloss">Authorized Access</a>, and <a href="https://ftp.ncbi.nlm.nih.gov/dbgap/studies/">public FTP sites</a>. For more details, see: <a href="#arelease">When and what will be released?</a><br />
<em>Withdrawn</em> - A study may be withdrawn permanently from AA after study release. The study and its Data Access Requests (DARs) are not available, except for admins in some interfaces.<br />
<em>Suspended</em> - A study may be suspended temporarily from AA after study release. The study is not available for PIs, but DAC members can still see existing DARs for this study and approve or reject them.</p>


<p><a name="ssrgloss" id="ssrgloss"></a>
<strong>Study Status Report (SSR)</strong></p>


<p>A Study Status Report (SSR) is used to track the progress of your study processing, and includes contact emails for your phenotype curator, genotype curator, Program Officer (PO), and <a href="#gpagloss">GPA</a>. There is a link to your study's SSR from the Submission System, Submission Portal, preview site instructions, and preview site.</p>


<p><a name="sstrgloss" id="sstrgloss"></a>
<strong>Subject Sample Telemetry Report (SSTR)</strong></p>


<p>The SSTR is a public study level report that displays loaded subject and sample IDs, consents, summary counts, processing status, and molecular and sequence sample uses. This report is populated at multiple time points when: 1) subject IDs, sample IDs, and consents are loaded, 2) BioSample assigns BioSample IDs, 3) molecular data or sequences are loaded. Note: Sequence data refers to all high throughput sequence data, while Molecular data is all other molecular data except for high throughput sequence data. Submitters are able to track when the sequence metadata has been accepted and see if there are errors with the submitted sequence data or if the sequence data is ready for release or is already public. Submitters should verify that the number of samples with molecular data and sequences matches what they expect the count to be when reviewing the preview site. Once a study is released, <a href="https://www.ncbi.nlm.nih.gov/gap/advanced_search/">dbGaP Advanced Search</a> can be used to select a specific study to access the SSTR link on the study report page.</p>


<p><strong>Subject Sample Telemetry Report (SSTR) API</strong></p>


<p>The SSTR APIs provide programmatic access to public summary and metadata level telemetry for subjects and samples submitted and processed for a dbGaP study. The API responses are in JSON format, and conform to the dbGaP study schema: <a href="https://www.ncbi.nlm.nih.gov/gap/sstr/schema/dbgap_study.v1.schema.json">https://www.ncbi.nlm.nih.gov/gap/sstr/schema/dbgap_study.v1.schema.json</a>. The swagger pages can be found here: <a href="https://www.ncbi.nlm.nih.gov/gap/sstr/swagger/">https://www.ncbi.nlm.nih.gov/gap/sstr/swagger/</a></p>


<p><a name="subjidgloss" id="subjidgloss"></a>
<strong>SUBJECT_ID</strong></p>


<p>A dbGaP Subject is defined as a single human person/individual/patient that arises from a single germline. Each subject should be submitted with a single, unique, de-identified subject ID. Subject IDs should be an integer or string value. Integers should not have zero padding. IDs should not have spaces. Specifically, only the following characters can be included in the ID: English letters, Arabic numerals, period (.), hyphen (-), underscore (_), at symbol (@), and the pound sign (#). Once a variable name for the subject ID has been chosen, please use the same variable name throughout all the phenotype files for consistency. For example, please do not use SUBJECT_ID in one file and INDIVIDUAL_ID in another file. Please also do not use "dbGaP" in your submitted ID name, since dbGaP will assign a dbGaP subject ID that will be included in the final dump files along with the submitted subject ID. Subjects that are known to be the same person across dbGaP studies will be assigned the same dbGaP subject ID.</p>


<p><a name="spgloss" id="spgloss"></a>
<strong>Submission Portal (SP)</strong></p>


<p>The Submission Portal (SP) link is <a href="https://submit.ncbi.nlm.nih.gov/dbgap/">https://submit.ncbi.nlm.nih.gov/dbgap/</a>. Login using the same email address that was used to accept the SP invitation. See SP Login Instructions: <a href="/gap/docs/login">here</a>. The SP is a secure way to upload and track study data to dbGaP. The <a href="#sdogloss">Study Data Outline (SDO)</a> tracks the type of data that will be submitted per study version. The files accepted in the SP are: <a href="#aconfig">Study Config</a>, <a href="#scds">Subject Consents</a>, <a href="#ssmds">Subject Sample Mapping</a>, <a href="#pdds">Pedigree</a>, <a href="#spds">Subject Phenotypes</a>, <a href="#sads">Sample Attributes</a>, <a href="#aphd">Documents: Phenotypes</a>, <a href="#ageno">Molecular Data¹</a>, <a href="#aSRA">Sequence Metadata</a>, <a href="#medimage">Medical Images</a>, <a href="#apha">Association Analyses (Genomic Summary Results)</a>, special files requested by the study curator, and Exchange Area files. Do not submit sequence data (BAM, CRAM, or FASTQ) through the SP. The SP can be accessed by submitters who have been sent an invitation to submit, and have accepted the invitation within 7 days. Initially, the study investigator and the PI submitter are sent invitations. Any person with a "Manager" role in the SP can add or remove submitters. The SP is not the same as the <a href="#ssgloss">dbGaP Submission System (SS)</a>.</p>


<hr />


<p>¹<a href="#aSRA">Sequence data</a> (e.g. BAM, CRAM, FASTQ) should be submitted only after: 1) you have received an email with an attached sequence metadata file containing the registered subject and sample IDs, and consents. This process ensures that submitted sequences are tied to sample IDs that belong to consented subjects. 2) The sequence metadata has been processed and you have received an email to upload sequences.</p>


<p><a name="ssgloss" id="ssgloss"></a>
<strong>Submission System (SS) aka Registration System</strong></p>


<p>The dbGaP Submission System (SS) is also known as the registration system. The link is <a href="https://dbgap.ncbi.nlm.nih.gov/dbgap/ss/dbgapss.cgi?login">https://dbgap.ncbi.nlm.nih.gov/dbgap/ss/dbgapss.cgi?login</a>. The <a href="#gpagloss">GPA</a> works with the study investigator to determine the following: study principal investigator (PI), study project officer (PO), NIH administration and funding, target data delivery date, target public release date, release type, types of data submission expected, inclusion in CADA (Compilation of Aggregate Genomic Data - a collection of analyses across many dbGaP studies that can be accessed with a single Data Access Request), estimated study participants, SRA submission expected, and PI assistant for study submissions. The GPA will upload the Submission Certification, <a href="#instcertgloss">Institutional Certifications</a>, and Data Use Certification, which specifies the <a href="#dulgloss">Data Use Limitations (DUL)</a>. The DULs form the consent groups that will be used to parse the study data, and also determine which <a href="#dargloss">Data Access Requests (DAR)</a> can be approved through dbGaP Authorized Access. BioProjects are created for each new study registered in the SS. The SS is only accessible by the GPA, PO, and PI. The SS is not the same as the dbGaP <a href="#spgloss">Submission Portal (SP)</a>. To make changes to the registration entry in the Submission System, contact your <a href="#gpagloss">GPA</a>. If you are a PI and have been given access, but have trouble logging in, see instructions: <a href="/gap/docs/login">here</a>.</p>


<p><strong>Submission System (SS) Reference for GPAs</strong></p>


<p>This guide provides an overview of the dbGaP Submission System (SS) and steps to register a study: <a href="https://www.ncbi.nlm.nih.gov/gap/docs/gpareference/">https://www.ncbi.nlm.nih.gov/gap/docs/gpareference/</a></p>


<p><a name="vargloss" id="vargloss"></a>
<strong>Variable</strong></p>


<p>A dbGaP Variable is defined as the variable name and associated column of data in a phenotype table (SC, SSM, pedigree, subject phenotypes, and sample attributes). The variable's metadata, such as the variable name, description, units, type, and encoded values are defined in its respective phenotype Data Dictionary file. The variable accession is a <strong>phv########.v#.p#</strong>, where the version number (.v#) is incremented when changes occur to the data columns (phenotype values) following a release.</p>


<p><a name="appdx" id="appdx"></a></p>


<h2 id="appendix-for-data-dictionary-dd-">APPENDIX for Data Dictionary (DD) File Descriptions and Specifications</h2>


<p><strong>(*indicates required)</strong></p>


<table>
<thead>
<tr>
<th><strong>Column Headers</strong></th>
<th><strong>Description</strong></th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>VARNAME*</strong></td>
<td><strong>Var</strong>iable <strong>name</strong>. The VARNAME must not contain backward slashes (\). Do not use "dbGaP" in the variable name. "dbGaP" is reserved for dbGaP generated items.</td>
</tr>
<tr>
<td><strong>VARDESC*</strong></td>
<td><strong>Var</strong>iable <strong>desc</strong>ription. The description should be understandable and enable users to replicate the variable. For example, "blood pressure" is useful, but "brachial blood pressure while sitting" provides more context. Alternatively, study documents with detail are also acceptable.</td>
</tr>
<tr>
<td><strong>DOCFILE</strong></td>
<td>Study document name associated with the variable. To list multiple documents, add a semicolon (;) between documents. Please list only study document filenames that are submitted to dbGaP.</td>
</tr>
<tr>
<td><strong>TYPE</strong></td>
<td>Data value type: <strong>integer</strong> (1,2,3,4,…), <strong>encoded value</strong> (integers or strings are coded for non-numerical meaning, ex. 1=Control; 2=Case, see VALUES), <strong>decimal</strong> (0.5,2.5,…), <strong>string</strong> (African American, Asian, Caucasian, Hispanic, Non-Hispanic). For <strong>mixed values</strong> (any combination of string, integers, decimals and/or encoded values) in a single data column, list all types present.</td>
</tr>
<tr>
<td><strong>UNITS*</strong></td>
<td>Units of measurement of variable</td>
</tr>
<tr>
<td><strong>MIN</strong></td>
<td>The logical minimum value of the variable. If a separate code such as -1 is used for a missing field, this should not be considered as the MIN value.</td>
</tr>
<tr>
<td><strong>MAX</strong></td>
<td>The logical maximum value for the variable. If a separate code such as 9999 is used for a missing field, this should not be considered as the MAX value.</td>
</tr>
<tr>
<td><strong>RESOLUTION</strong></td>
<td>Measurement resolution – the number of decimal places to which a measured value is presented in the data. For example, in 54.321 the resolution is 3.</td>
</tr>
<tr>
<td><strong>COMMENT1</strong>, <strong>COMMENT2</strong></td>
<td>Additional information not included in the VARDESC that will further define the variable. If additional comments are needed beyond COMMENT2, insert new columns (COMMENT3, COMMENT4, etc.) before the column "ORDER."</td>
</tr>
<tr>
<td><strong>VARIABLE_SOURCE</strong></td>
<td>Source of controlled vocabularies. Ex. PhenX, MeSH, SNOMED, NCI. If there is no match, leave blank. (Must be submitted as a group with <strong>SOURCE_VARIABLE_ID</strong> and <strong>VARIABLE_MAPPING</strong>).</td>
</tr>
<tr>
<td><strong>SOURCE_VARIABLE_ID</strong></td>
<td>A unique identifier from the VARIABLE_SOURCE or a unique text concept/term from various controlled vocabularies. (Must be submitted as a group with <strong>VARIABLE_SOURCE</strong> and <strong>VARIABLE_MAPPING</strong>).</td>
</tr>
<tr>
<td><strong>VARIABLE_MAPPING</strong></td>
<td>The mapping relationship between the variable and the source variable. For example, a variable from the source could be Identical, Related, or Comparable. (Must be submitted as a group with <strong>VARIABLE_SOURCE</strong> and <strong>SOURCE_VARIABLE_ID</strong>).</td>
</tr>
<tr>
<td><a name="uniqkey" id="uniqkey"></a> <strong>UNIQUEKEY</strong></td>
<td>Unique key is a combination of variables that is designed to uniquely identify a row in a longitudinal dataset or rows that have repeating SUBJECT_IDs or SAMPLE_IDs. Mark "X" for variables that constitute the unique keys, and leave other values blank. Ex. SUBJECT_ID and VISIT_NUMBER. UNIQUEKEYs can only be used in the subject phenotypes file and some cases of the sample attributes file. The SC, SSM, and pedigree files should never have UNIQUEKEYs marked, since there should be a unique identifier appearing once in each file.</td>
</tr>
<tr>
<td><strong>COLLINTERVAL</strong></td>
<td>Collection interval is the time frame in which the data for the variable or dataset was collected.</td>
</tr>
<tr>
<td><strong>ORDER</strong></td>
<td>The order in which VALUES appear on the variable summary report page. If VALUES of a single variable/column of data are integers or decimals, leave blank. If VALUES are encoded values, string, or mixed, define the order. VALUES can be ordered by <strong>Frequency</strong> (highest to lowest frequency of VALUES) or by <strong>List</strong> (user specifies order through placement in VALUES columns). For mixed values within a single variable/column of data, see examples: "age" and "weight" in example file <a target="_blank" href="https://ftp.ncbi.nlm.nih.gov/dbgap/dbGaP_Submission_Guide_Templates/Individual_Submission_Templates/Phenotype_Data/5b_SubjectPhenotypes_DD.xlsx">5b_SubjectPhenotypes_DD.xlsx</a>.</td>
</tr>
<tr>
<td><a name="avalues" id="avalues"></a><strong>VALUES*</strong></td>
<td>List of all unique values and/or descriptions of all encoded values, one value per cell. Encoded values are defined as a value and its meaning. For example, if a data file contains a variable named "EDUCATION" and its data values are "1, 2, 3, and 99," these coded values will need to be defined in the data dictionary. The format of an encoded value is <strong>VALUE=MEANING</strong>. Therefore, in the data dictionary, there should be 4 <u>separate data cells</u> filled out with the following: 1=Completed High School, 2=Completed College, 3=Completed Graduate School, 99=Unknown. The "VALUES" header must be the last column header (farthest right in the table). It should appear only in the column above the first encoded value that is listed. The remaining column header cells should be left blank. The script will identify the first code meanings and continue right until there are no more code meanings. For example, if the variable "SEX" has 3 encoded values: 1=Male, 2=Female and 3=Unknown, the column header "VALUES" will appear only above the cell that contains 1=Male. 1=Male, 2=Female and 3=Unknown will be listed in three separate cells next to each other. The header column cells above "2=Female" and "3=Unknown" should be left blank.</td>
</tr>
</tbody>
</table>


<p>Example of VALUES:</p>


<table>
<thead>
<tr>
<th><em>Last column with header</em></th>
<th><em>Leave header blank</em></th>
<th><em>Leave header blank</em></th>
<th><em>Leave header blank</em></th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>VALUES</strong></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>10=Elementary</td>
<td>20=High School</td>
<td>40=College</td>
<td>4=Graduate School</td>
</tr>
<tr>
<td>1=2-4 drinks per day</td>
<td>2=5-7 drinks per day</td>
<td>3=&gt;7 drinks per day</td>
<td></td>
</tr>
</tbody>
</table>


<p><a name="newshistory" id="newshistory"></a></p>


<h2 data-heading="h2" data-no-toc="true">Previous Updates</h2>


<ul>
<li>There are updated <a href="#ncbidb">guidance</a> and <a href="#asgtemplates">templates</a> to link subject/sample IDs to samples in NCBI databases: GEO, GenBank, SRA. (January 2022)</li>
<li>The <a href="#sdogloss">Study Data Outline</a> has replaced the Study Questionnaire in the <a href="https://submit.ncbi.nlm.nih.gov/dbgap/">Submission Portal</a> to collect pertinent information for study processing and releases. (October 2021)</li>
<li>High throughput <a href="#aSRA">sequence metadata</a> should now be uploaded to the dbGaP <a href="https://submit.ncbi.nlm.nih.gov/dbgap/">Submission Portal</a> under section "Sequence metadata" instead of through email. (May 2021)</li>
<li>We are offering pre-validation tools, <a target="_blank" href="https://github.com/ncbi/gaptools/blob/master/GaPTools.md">GaPTools</a>, that you can run on your own system to check your data before submitting to dbGaP. (February 2021)</li>
<li>The <a href="#aconfig">study config</a> can now be filled out online in your study's <a href="https://submit.ncbi.nlm.nih.gov/dbgap/">Submission Portal</a>. (October 2020)</li>
<li><a href="#aqcchecks">Automated Preprocessing Validation Checks</a> are being run on all studies submitting PLINK or VCF files. This system will provide feedback within a few days of submission for ID errors and inconsistencies between PLINK, VCFs, Subject Consent, SSM, and Pedigree datasets (DS). For active studies pre-dating this new system, curators will work with you to update your files, so that this automated check can be run. (July 2019)</li>
<li>Biological sex is required in the <a href="#asc">Subject Consent</a> files in order to run the <a href="#aqcchecks">Automated Preprocessing Validation Checks</a>. (July 2019)</li>
<li>SAMPLE_USE has been discontinued from the <a href="#assm">Subject Sample Mapping</a> files. Please remove it before submitting. (April 2018)</li>
</ul>
                                </div>
                                <!--/.col1-->
                                <div class="col2">

                                </div>
                                <!--/.col2-->
                                <div class="col3">

                                </div>
                                <!--/.col3-->
                                <div class="col4">

                                </div>
                                <!--/.col4-->
                                <div class="col5">

                                </div>
                                <div class="col6">

                                </div>
                                <div class="col7">

                                </div>
                                <div class="col8">

                                </div>
                                <div class="col9">

                                </div>
                            </div><!--/.content-->
                        </div><!--/.container-->
                        <div id="NCBIFooter_dynamic">
    <div class="breadcrumbs">You are here:
            <span id="breadcrumb_text"><a href="/guide/">NCBI</a></span></div>
    <a id="help-desk-link" class="help_desk" href="https://support.ncbi.nlm.nih.gov/ics/support/default.asp?Time=2025-03-05T05:13:24-05:00&amp;Snapshot=%2Fprojects%2FdbGap%2Fgapdocs@1.2&amp;Host=portal106&amp;ncbi_phid=CE8D5FFC7C8153E10000000000E800B6&amp;ncbi_session=CE8B5AF87C7FFCB1_0191SID&amp;from=https%3A%2F%2Fwww.ncbi.nlm.nih.gov%2Fgap%2Fdocs%2Fsubmissionguide%2F&amp;Ncbi_App=dbgapdocs&amp;Page=static&amp;style=classic&amp;deptID=28049" target="_blank">Support Center</a>
    <noscript><img alt="" src="/stat?jsdisabled=true&amp;ncbi_app=dbgapdocs&amp;ncbi_db=&amp;ncbi_pdid=static&amp;ncbi_phid=CE8D5FFC7C8153E10000000000E800B6" /></noscript>
</div>


<div xmlns:xi="http://www.w3.org/2001/XInclude">
    <div xmlns="http://www.w3.org/1999/xhtml" class="footer" id="footer" xml:base="http://127.0.0.1/sites/static/header_footer">
	<section class="icon-section">
		<div id="icon-section-header" class="icon-section_header">Follow NCBI</div>
		<div class="grid-container container">
			<div class="icon-section_container">
				<a class="footer-icon" id="footer_twitter" href="https://twitter.com/ncbi" aria-label="Twitter">
					<svg xmlns="http://www.w3.org/2000/svg" width="40" height="40" viewBox="0 0 40 40" fill="none">
						<title>Twitter</title>
						<g id="twitterx1008">
							<path id="path1008" d="M6.06736 7L16.8778 20.8991L6.00001 32.2H10.2L18.6 23.1L25.668 32.2H34L22.8 17.5L31.9 7H28.4L20.7 15.4L14.401 7H6.06898H6.06736ZM9.66753 8.73423H12.9327L29.7327 30.4658H26.5697L9.66753 8.73423Z" fill="#5B616B"></path>
						</g>
					</svg>
				</a>
				<a class="footer-icon" id="footer_facebook" href="https://www.facebook.com/ncbi.nlm" aria-label="Facebook"><svg xmlns="http://www.w3.org/2000/svg" data-name="Layer 1" viewBox="0 0 300 300">
					<title>Facebook</title>
					<path class="cls-11" d="M210.5,115.12H171.74V97.82c0-8.14,5.39-10,9.19-10h27.14V52l-39.32-.12c-35.66,0-42.42,26.68-42.42,43.77v19.48H99.09v36.32h27.24v109h45.41v-109h35Z">
					</path>
				</svg></a>
				<a class="footer-icon" id="footer_linkedin" href="https://www.linkedin.com/company/ncbinlm" aria-label="LinkedIn"><svg xmlns="http://www.w3.org/2000/svg" data-name="Layer 1" viewBox="0 0 300 300">
						<title>LinkedIn</title>
						<path class="cls-11" d="M101.64,243.37H57.79v-114h43.85Zm-22-131.54h-.26c-13.25,0-21.82-10.36-21.82-21.76,0-11.65,8.84-21.15,22.33-21.15S101.7,78.72,102,90.38C102,101.77,93.4,111.83,79.63,111.83Zm100.93,52.61A17.54,17.54,0,0,0,163,182v61.39H119.18s.51-105.23,0-114H163v13a54.33,54.33,0,0,1,34.54-12.66c26,0,44.39,18.8,44.39,55.29v58.35H198.1V182A17.54,17.54,0,0,0,180.56,164.44Z">
						</path>
					</svg></a>
				<a class="footer-icon" id="footer_github" href="https://github.com/ncbi" aria-label="GitHub"><svg xmlns="http://www.w3.org/2000/svg" data-name="Layer 1" viewBox="0 0 300 300">
					<defs>
						<style>
							.cls-11,
							.cls-12 {
							fill: #737373;
							}

							.cls-11 {
							fill-rule: evenodd;
							}
						</style>
					</defs>
					<title>GitHub</title>
					<path class="cls-11" d="M151.36,47.28a105.76,105.76,0,0,0-33.43,206.1c5.28,1,7.22-2.3,7.22-5.09,0-2.52-.09-10.85-.14-19.69-29.42,6.4-35.63-12.48-35.63-12.48-4.81-12.22-11.74-15.47-11.74-15.47-9.59-6.56.73-6.43.73-6.43,10.61.75,16.21,10.9,16.21,10.9,9.43,16.17,24.73,11.49,30.77,8.79,1-6.83,3.69-11.5,6.71-14.14C108.57,197.1,83.88,188,83.88,147.51a40.92,40.92,0,0,1,10.9-28.39c-1.1-2.66-4.72-13.42,1-28,0,0,8.88-2.84,29.09,10.84a100.26,100.26,0,0,1,53,0C198,88.3,206.9,91.14,206.9,91.14c5.76,14.56,2.14,25.32,1,28a40.87,40.87,0,0,1,10.89,28.39c0,40.62-24.74,49.56-48.29,52.18,3.79,3.28,7.17,9.71,7.17,19.58,0,14.15-.12,25.54-.12,29,0,2.82,1.9,6.11,7.26,5.07A105.76,105.76,0,0,0,151.36,47.28Z">
					</path>
					<path class="cls-12" d="M85.66,199.12c-.23.52-1.06.68-1.81.32s-1.2-1.06-.95-1.59,1.06-.69,1.82-.33,1.21,1.07.94,1.6Zm-1.3-1">
					</path>
					<path class="cls-12" d="M90,203.89c-.51.47-1.49.25-2.16-.49a1.61,1.61,0,0,1-.31-2.19c.52-.47,1.47-.25,2.17.49s.82,1.72.3,2.19Zm-1-1.08">
					</path>
					<path class="cls-12" d="M94.12,210c-.65.46-1.71,0-2.37-.91s-.64-2.07,0-2.52,1.7,0,2.36.89.65,2.08,0,2.54Zm0,0"></path>
					<path class="cls-12" d="M99.83,215.87c-.58.64-1.82.47-2.72-.41s-1.18-2.06-.6-2.7,1.83-.46,2.74.41,1.2,2.07.58,2.7Zm0,0">
					</path>
					<path class="cls-12" d="M107.71,219.29c-.26.82-1.45,1.2-2.64.85s-2-1.34-1.74-2.17,1.44-1.23,2.65-.85,2,1.32,1.73,2.17Zm0,0">
					</path>
					<path class="cls-12" d="M116.36,219.92c0,.87-1,1.59-2.24,1.61s-2.29-.68-2.3-1.54,1-1.59,2.26-1.61,2.28.67,2.28,1.54Zm0,0">
					</path>
					<path class="cls-12" d="M124.42,218.55c.15.85-.73,1.72-2,1.95s-2.37-.3-2.52-1.14.73-1.75,2-2,2.37.29,2.53,1.16Zm0,0"></path>
				</svg></a>
				<a class="footer-icon" id="footer_blog" href="https://ncbiinsights.ncbi.nlm.nih.gov/" aria-label="Blog">
					<svg xmlns="http://www.w3.org/2000/svg" id="Layer_1" data-name="Layer 1" viewBox="0 0 40 40">
						<defs><style>.cls-1{fill:#737373;}</style></defs>
						<title>NCBI Insights Blog</title>
						<path class="cls-1" d="M14,30a4,4,0,1,1-4-4,4,4,0,0,1,4,4Zm11,3A19,19,0,0,0,7.05,15a1,1,0,0,0-1,1v3a1,1,0,0,0,.93,1A14,14,0,0,1,20,33.07,1,1,0,0,0,21,34h3a1,1,0,0,0,1-1Zm9,0A28,28,0,0,0,7,6,1,1,0,0,0,6,7v3a1,1,0,0,0,1,1A23,23,0,0,1,29,33a1,1,0,0,0,1,1h3A1,1,0,0,0,34,33Z"></path>
					</svg>
				</a>
			</div>
		</div>
	</section>

	<section class="container-fluid bg-primary">
		<div class="container pt-5">
			<div class="row mt-3">
				<div class="col-lg-3 col-12">
					<p><a class="text-white" href="https://www.nlm.nih.gov/socialmedia/index.html">Connect with NLM</a></p>
					<ul class="list-inline social_media">
						<li class="list-inline-item"><a href="https://twitter.com/NLM_NIH" aria-label="Twitter" target="_blank" rel="noopener noreferrer">
							<svg xmlns="http://www.w3.org/2000/svg" width="35" height="35" viewBox="0 0 36 35" fill="none">
								<title>Twitter</title>
								<g id="twitterx1009" clip-path="url(#clip0_65276_3946)">
									<path id="Vector_Twitter" d="M17.5006 34.6565C26.9761 34.6565 34.6575 26.9751 34.6575 17.4996C34.6575 8.02416 26.9761 0.342773 17.5006 0.342773C8.02514 0.342773 0.34375 8.02416 0.34375 17.4996C0.34375 26.9751 8.02514 34.6565 17.5006 34.6565Z" fill="#205493" stroke="white" stroke-width="1.0" stroke-miterlimit="10"></path>
									<path id="path1009" d="M8.54811 8.5L16.2698 18.4279L8.50001 26.5H11.5L17.5 20L22.5486 26.5H28.5L20.5 16L27 8.5H24.5L19 14.5L14.5007 8.5H8.54927H8.54811ZM11.1197 9.73873H13.4519L25.4519 25.2613H23.1926L11.1197 9.73873Z" fill="white"></path>
								</g>
								<defs>
									<clipPath id="clip0_65276_3946">
										<rect width="35" height="35" fill="white"></rect>
									</clipPath>
								</defs>
							</svg>
						</a></li>
						<li class="list-inline-item"><a href="https://www.facebook.com/nationallibraryofmedicine" aria-label="Facebook" rel="noopener noreferrer" target="_blank">
							<svg xmlns="http://www.w3.org/2000/svg" width="35" height="35" viewBox="0 0 36 35" fill="none">
								<title>Facebook</title>
								<g id="Facebook" clip-path="url(#clip0_1717_1086)">
									<path id="Vector_Facebook" d="M15.1147 29.1371C15.1147 29.0822 15.1147 29.0296 15.1147 28.9747V18.9414H11.8183C11.6719 18.9414 11.6719 18.9414 11.6719 18.8018C11.6719 17.5642 11.6719 16.3289 11.6719 15.0937C11.6719 14.9793 11.7062 14.9518 11.816 14.9518C12.8683 14.9518 13.9206 14.9518 14.9751 14.9518H15.1215V14.8329C15.1215 13.8057 15.1215 12.774 15.1215 11.7492C15.1274 10.9262 15.3148 10.1146 15.6706 9.37241C16.1301 8.38271 16.9475 7.60378 17.9582 7.19235C18.6492 6.90525 19.3923 6.76428 20.1405 6.7783C21.0029 6.79202 21.8653 6.83091 22.7278 6.86065C22.8879 6.86065 23.048 6.89496 23.2082 6.90182C23.2974 6.90182 23.3271 6.94071 23.3271 7.02993C23.3271 7.54235 23.3271 8.05477 23.3271 8.5649C23.3271 9.16882 23.3271 9.77274 23.3271 10.3767C23.3271 10.4819 23.2974 10.5139 23.1921 10.5116C22.5379 10.5116 21.8814 10.5116 21.2271 10.5116C20.9287 10.5184 20.6316 10.5528 20.3395 10.6146C20.0822 10.6619 19.8463 10.7891 19.6653 10.9779C19.4842 11.1668 19.3672 11.4078 19.3307 11.6669C19.2857 11.893 19.2612 12.1226 19.2575 12.3531C19.2575 13.1904 19.2575 14.0299 19.2575 14.8695C19.2575 14.8946 19.2575 14.9198 19.2575 14.9564H23.0229C23.1807 14.9564 23.183 14.9564 23.1624 15.1074C23.0778 15.7662 22.9885 16.425 22.9039 17.0816C22.8322 17.6321 22.7636 18.1827 22.698 18.7332C22.6729 18.9437 22.6797 18.9437 22.4693 18.9437H19.2644V28.8992C19.2644 28.9793 19.2644 29.0593 19.2644 29.1394L15.1147 29.1371Z" fill="white"></path>
									<path id="Vector_2_Facebook" d="M17.5006 34.657C26.9761 34.657 34.6575 26.9756 34.6575 17.5001C34.6575 8.02465 26.9761 0.343262 17.5006 0.343262C8.02514 0.343262 0.34375 8.02465 0.34375 17.5001C0.34375 26.9756 8.02514 34.657 17.5006 34.657Z" stroke="white" stroke-width="1.0" stroke-miterlimit="10"></path>
								</g>
								<defs>
									<clipPath id="clip0_1717_1086">
										<rect width="35" height="35" fill="white"></rect>
									</clipPath>
								</defs>
							</svg>
						</a></li>
						<li class="list-inline-item"><a href="https://www.youtube.com/user/NLMNIH" aria-label="YouTube" target="_blank" rel="noopener noreferrer">
							<svg xmlns="http://www.w3.org/2000/svg" width="35" height="35" viewBox="0 0 36 35" fill="none">
								<title>YouTube</title>
								<g id="YouTube" clip-path="url(#clip0_1717_1101)">
									<path id="Vector_Youtube" d="M26.2571 11.4791C25.9025 11.1589 25.5709 10.9576 24.228 10.834C22.5512 10.6785 20.2797 10.6556 18.564 10.6533H16.4365C14.7208 10.6533 12.4493 10.6785 10.7725 10.834C9.43196 10.9576 9.09798 11.1589 8.7434 11.4791C7.81464 12.321 7.6202 14.6268 7.59961 16.8938C7.59961 17.3178 7.59961 17.741 7.59961 18.1635C7.62706 20.4121 7.82837 22.686 8.7434 23.521C9.09798 23.8412 9.42967 24.0425 10.7725 24.1661C12.4493 24.3216 14.7208 24.3445 16.4365 24.3468H18.564C20.2797 24.3468 22.5512 24.3216 24.228 24.1661C25.5686 24.0425 25.9025 23.8412 26.2571 23.521C27.1722 22.6929 27.3735 20.451 27.4009 18.2206C27.4009 17.7402 27.4009 17.2599 27.4009 16.7795C27.3735 14.5491 27.1699 12.3072 26.2571 11.4791ZM15.5604 20.5311V14.652L20.561 17.5001L15.5604 20.5311Z" fill="white"></path>
									<path id="Vector_2_Youtube" d="M17.5006 34.657C26.9761 34.657 34.6575 26.9756 34.6575 17.5001C34.6575 8.02465 26.9761 0.343262 17.5006 0.343262C8.02514 0.343262 0.34375 8.02465 0.34375 17.5001C0.34375 26.9756 8.02514 34.657 17.5006 34.657Z" stroke="white" stroke-width="1.0" stroke-miterlimit="10"></path>
								</g>
								<defs>
									<clipPath id="clip0_1717_1101">
										<rect width="35" height="35" fill="white"></rect>
									</clipPath>
								</defs>
							</svg>
						</a></li>
					</ul>
				</div>
				<div class="col-lg-3 col-12">
					<p class="address_footer text-white">National Library of Medicine<br />
						<a href="https://www.google.com/maps/place/8600+Rockville+Pike,+Bethesda,+MD+20894/@38.9959508,-77.101021,17z/data=!3m1!4b1!4m5!3m4!1s0x89b7c95e25765ddb:0x19156f88b27635b8!8m2!3d38.9959508!4d-77.0988323" class="text-white" target="_blank" rel="noopener noreferrer">8600 Rockville Pike<br />
							Bethesda, MD 20894</a></p>
				</div>
				<div class="col-lg-3 col-12 centered-lg">
					<p><a href="https://www.nlm.nih.gov/web_policies.html" class="text-white">Web Policies</a><br />
						<a href="https://www.nih.gov/institutes-nih/nih-office-director/office-communications-public-liaison/freedom-information-act-office" class="text-white">FOIA</a><br />
						<a href="https://www.hhs.gov/vulnerability-disclosure-policy/index.html" class="text-white" id="vdp">HHS Vulnerability Disclosure</a></p>
				</div>
				<div class="col-lg-3 col-12 centered-lg">
					<p><a class="supportLink text-white" href="https://support.nlm.nih.gov/">Help</a><br />
						<a href="https://www.nlm.nih.gov/accessibility.html" class="text-white">Accessibility</a><br />
						<a href="https://www.nlm.nih.gov/careers/careers.html" class="text-white">Careers</a></p>
				</div>
			</div>
			<div class="row">
				<div class="col-lg-12 centered-lg">
					<nav class="bottom-links">
						<ul class="mt-3">
							<li>
								<a class="text-white" href="//www.nlm.nih.gov/">NLM</a>
							</li>
							<li>
								<a class="text-white" href="https://www.nih.gov/">NIH</a>
							</li>
							<li>
								<a class="text-white" href="https://www.hhs.gov/">HHS</a>
							</li>
							<li>
								<a class="text-white" href="https://www.usa.gov/">USA.gov</a>
							</li>
						</ul>
					</nav>
				</div>
			</div>
		</div>
	</section>
	<script type="text/javascript" src="/portal/portal3rc.fcgi/rlib/js/InstrumentOmnitureBaseJS/InstrumentNCBIConfigJS/InstrumentNCBIBaseJS/InstrumentPageStarterJS.js?v=1"> </script>
	<script type="text/javascript" src="/portal/portal3rc.fcgi/static/js/hfjs2.js"> </script>
</div>
</div>
                        <!--/.footer-->
                        <p class="last-updated small">Last updated: 2025-01-08T15:08:23Z</p>
                    </div>
                    <!--/.page-->
                </div>
                <!--/.wrap-->
                <span class="PAFAppResources"></span>


            </div><!-- /.twelve_col -->
        </div>
        <!-- /.grid -->


        <!-- usually for JS scripts at page bottom -->
        <span class="pagefixtures"></span>


<!-- CE8B5AF87C7FFCB1_0191SID /projects/dbGap/gapdocs@1.2 portal106 v4.1.r689238 Tue, Oct 22 2024 16:10:51 -->
<span id="portal-csrf-token" style="display:none" data-token="CE8B5AF87C7FFCB1_0191SID"></span>

<script type="text/javascript" src="//static.pubmed.gov/portal/portal3rc.fcgi/4176647/js/3879255/4121861/4175147/4087685.js" snapshot="gap"></script></body>
</html>