nih-gov/www.ncbi.nlm.nih.gov/pathogens/docs/data_processing

1078 lines
No EOL
70 KiB
Text
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!DOCTYPE html>
<html lang="en" >
<head >
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<!-- Mobile properties -->
<meta name="HandheldFriendly" content="True">
<meta name="MobileOptimized" content="320">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<!-- Stylesheets -->
<link href="/pathogens/static/django_uswds/uswds/css/uswds.css" rel="stylesheet" />
<link rel="stylesheet" href="/pathogens/static/nwds/css/nwds.css" />
<link rel="stylesheet" href="/pathogens/static/nwds/css/header.css" />
<link rel="stylesheet" href="/pathogens/static/nwds/css/footer.css" />
<link rel="stylesheet" href="/pathogens/static/nwds/css/form.css" />
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.7.0/css/font-awesome.min.css"/>
<title>
NCBI -
Pathogen Detection - NCBI
</title>
<!-- extra head -->
<!-- Favicons -->
<link rel="shortcut icon" type="image/ico" href="https://www.ncbi.nlm.nih.gov/coreutils/nwds/img/favicons/favicon.ico" />
<link rel="icon" type="image/png" href="https://www.ncbi.nlm.nih.gov/coreutils/nwds/img/favicons/favicon.png" />
<!-- 192x192, as recommended for Android
http://updates.html5rocks.com/2014/11/Support-for-theme-color-in-Chrome-39-for-Android
-->
<link rel="icon" type="image/png" sizes="192x192" href="https://www.ncbi.nlm.nih.gov/coreutils/nwds/img/favicons/favicon-192.png" />
<!-- 57x57 (precomposed) for iPhone 3GS, pre-2011 iPod Touch and older Android devices -->
<link rel="apple-touch-icon-precomposed" href="https://www.ncbi.nlm.nih.gov/coreutils/nwds/img/favicons/favicon-57.png">
<!-- 72x72 (precomposed) for 1st generation iPad, iPad 2 and iPad mini -->
<link rel="apple-touch-icon-precomposed" sizes="72x72" href="https://www.ncbi.nlm.nih.gov/coreutils/nwds/img/favicons/favicon-72.png">
<!-- 114x114 (precomposed) for iPhone 4, 4S, 5 and post-2011 iPod Touch -->
<link rel="apple-touch-icon-precomposed" sizes="114x114" href="https://www.ncbi.nlm.nih.gov/coreutils/nwds/img/favicons/favicon-114.png">
<!-- 144x144 (precomposed) for iPad 3rd and 4th generation -->
<link rel="apple-touch-icon-precomposed" sizes="144x144" href="https://www.ncbi.nlm.nih.gov/coreutils/nwds/img/favicons/favicon-144.png">
<link rel="stylesheet" href="/pathogens/static/main/styles/app.css" type="text/css" media="screen">
<link rel="stylesheet" href="/pathogens/static/main/styles/breadcrumbs.css" type="text/css" media="screen">
<style type="text/css">
a { text-decoration: underline }
</style>
<!-- Logging params: Pinger defaults -->
<meta name="ncbi_app" content="labs_pathogens" />
<meta name="ncbi_pdid" content="static" />
<meta name="ncbi_phid" content="D0BD13BDF4F7203500004E8EC341E445.1.m_5" />
<!-- /extra head -->
<link rel="stylesheet" href="/pathogens/static/main/styles/page-search.css" type="text/css">
</head>
<body >
<a class="usa-skipnav" href="#main_content">Skip to main page content</a>
<!-- ========== BEGIN HEADER ========== -->
<section class="usa-banner">
<div class="usa-accordion">
<header class="usa-banner-header">
<div class="usa-grid usa-banner-inner">
<img src="https://www.ncbi.nlm.nih.gov/coreutils/uswds/img/favicons/favicon-57.png" alt="U.S. flag" />
<p>An official website of the United States government</p>
<button
class="usa-accordion-button usa-banner-button"
aria-expanded="false"
aria-controls="gov-banner-top"
>
<span class="usa-banner-button-text">Here's how you know</span>
</button>
</div>
</header>
<div
class="usa-banner-content usa-grid usa-accordion-content"
id="gov-banner-top"
>
<div class="usa-banner-guidance-gov usa-width-one-half">
<img
class="usa-banner-icon usa-media_block-img"
src="https://www.ncbi.nlm.nih.gov/coreutils/uswds/img/icon-dot-gov.svg"
alt="Dot gov"
/>
<div class="usa-media_block-body">
<p>
<strong>The .gov means it's official.</strong>
<br />
Federal government websites often end in .gov or .mil. Before
sharing sensitive information, make sure you're on a federal
government site.
</p>
</div>
</div>
<div class="usa-banner-guidance-ssl usa-width-one-half">
<img
class="usa-banner-icon usa-media_block-img"
src="https://www.ncbi.nlm.nih.gov/coreutils/uswds/img/icon-https.svg"
alt="Https"
/>
<div class="usa-media_block-body">
<p>
<strong>The site is secure.</strong>
<br />
The <strong>https://</strong> ensures that you are connecting to the
official website and that any information you provide is encrypted
and transmitted securely.
</p>
</div>
</div>
</div>
</div>
</section>
<div class="usa-overlay"></div>
<header class="ncbi-header" role="banner" data-section="Header">
<div class="usa-grid">
<div class="usa-width-one-whole">
<div class="ncbi-header__logo">
<a href="https://www.ncbi.nlm.nih.gov/" class="logo" aria-label="NCBI Logo" data-ga-action="click_image" data-ga-label="NIH NLM Logo">
<img src="https://www.ncbi.nlm.nih.gov/coreutils/nwds/img/logos/AgencyLogo.svg" alt="NIH NLM Logo" />
</a>
</div>
<div class="ncbi-header__account">
<a id="account_login" href="https://account.ncbi.nlm.nih.gov" class="usa-button header-button" style="display:none" data-ga-action="open_menu" data-ga-label="account_menu">Log in</a>
<button id="account_info" class="header-button" style="display:none"
aria-controls="account_popup">
<span class="fa fa-user" aria-hidden="true"></span>
<span class="username desktop-only" aria-hidden="true" id="uname_short"></span>
<span class="sr-only">Show account info</span>
</button>
</div>
<div class="ncbi-popup-anchor">
<div class="ncbi-popup account-popup" id="account_popup" aria-hidden="true">
<div class="ncbi-popup-head">
<button class="ncbi-close-button" data-ga-action="close_menu" data-ga-label="account_menu"><span class="fa fa-times"></span><span class="usa-sr-only">Close</span></button>
<h4>Account</h4>
</div>
<div class="account-user-info">
Logged in as:<br/>
<b><span class="username" id="uname_long">username</span></b>
</div>
<div class="account-links">
<ul class="usa-unstyled-list">
<li><a id="account_myncbi" href="/myncbi/" class="set-base-url" data-ga-action="click_menu_item" data-ga-label="account_myncbi">Dashboard</a></li>
<li><a id="account_pubs" href="/myncbi/collections/bibliography/" class="set-base-url" data-ga-action="click_menu_item" data-ga-label="account_pubs">Publications</a></li>
<li><a id="account_settings" href="/account/settings/" class="set-base-url" data-ga-action="click_menu_item" data-ga-label="account_settings">Account settings</a></li>
<li><a id="account_logout" href="/account/signout/" class="set-base-url" data-ga-action="click_menu_item" data-ga-label="account_logout">Log out</a></li>
</ul>
</div>
</div>
</div>
</div>
</div>
</header>
<div role="navigation" aria-label="access keys">
<a id="nws_header_accesskey_0" href="https://www.ncbi.nlm.nih.gov/guide/browsers/#ncbi_accesskeys" class="usa-sr-only" accesskey="0" tabindex="-1">Access keys</a>
<a id="nws_header_accesskey_1" href="https://www.ncbi.nlm.nih.gov" class="usa-sr-only" accesskey="1" tabindex="-1">NCBI Homepage</a>
<a id="nws_header_accesskey_2" href="/myncbi/" class="set-base-url usa-sr-only" accesskey="2" tabindex="-1">MyNCBI Homepage</a>
<a id="nws_header_accesskey_3" href="#main_content" class="usa-sr-only" accesskey="3" tabindex="-1">Main Content</a>
<a id="nws_header_accesskey_4" href="#" class="usa-sr-only" accesskey="4" tabindex="-1">Main Navigation</a>
</div>
<section data-section="Alerts">
<div class="ncbi-alerts-placeholder"></div>
</section>
<!-- ========== END HEADER ========== -->
<main id="main_content" accesskey="3">
<section class="pathogens-content-page usa-grid">
<div class="usa-width-one-whole ptg-top">
<div><ul id="breadcrumbs" class="usa-unstyled-list breadcrumbs">
<li><a href="https://www.ncbi.nlm.nih.gov/home/health.shtml">Health</a></li>
<li><a href="/pathogens/">Pathogen Detection</a></li>
<li><a href="/pathogens/pathogens_help/">Help</a></li>
<li>NCBI</li>
</ul>
</div>
<div class="ptg-search">
<input placeholder="Search page" type="search"/><button title="Previous" disabled="disabled"></button><button title="Next" disabled="disabled"></button>
</div>
</div>
<div> <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<!-- ============== HEAD =============== -->
<head><meta http-equiv="Content-Type" content="text/html; charset=ASCII" />
<title>NCBI Pathogen Detection Project Help Document - Data Processing</title>
<meta name="description" content="Help document for the Pathogen Detection Project, a resource of the National Center for Biotechnology Information (NCBI) for microbial genome sequence analysis for epidemiologic surveillance." />
<meta name="keywords" content="pipeline, quality control, theory of operation, validation, pathogens, genomes, genome sequences, antimicrobial resistance, AMR, antibiotic resistance, Reference Gene Catalog, genes, alleles, bacteria, microbes, Escherichia coli, E coli, Salmonella, Listeria, food borne illness, outbreaks, CDC, FDA, NCBI, National Center for Biotechnology Information" />
<meta name="robots" content="index,follow,noarchive" />
<style>
table, th, td {
border: 1px solid black;
}
</style>
</head>
<!-- ============= END_HEAD ============= -->
<!-- ============== BODY =============== -->
<body>
<!-- ########## H2_SECTION:DATA_PROCESSING ########## -->
<h2 id="data-processing">Data Processing Pipeline<img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/spacer.gif" width="25" height="1" border="0" /><a href="/pathogens/pathogens_help/#data-processing"><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/arrowup_blue.gif" width="12" height="12" border="0" alt="back to top" /></a></h2>
<!-- ========== MINI-TOC =========== -->
<div style="padding-left: 2em">
<ul>
<li><a href="#data-processing-overview">Overview</a></li>
<li><a href="#data-processing-assembly">Assembly pipeline</a></li>
<li><a href="#data-processing-clustering">Clustering</a></li>
<li><a href="#data-processing-phylogenetic-tree-construction">Phylogenetic tree reconstruction</a></li>
<li><a href="#data-processing-annotation">Annotation and antimicrobial gene/protein identification</a></li>
<ul>
<li><a href="#genotype-categories">Genotype categories</a></li>
</ul>
<li><a href="#data-processing-qc">Quality control (QC)</a></li>
<ul>
<li><a href="#qc-validation-types">QC validation types</a></li>
<li><a href="#qc-exceptions-report">QC Exceptions Report</a></li>
<li><a href="#qc-validation-criteria">Validation criteria and thresholds</a></li>
</ul>
</ul>
</div>
<!-- ========== END_MINI-TOC =========== -->
<div style="padding-left: 2em">
<!-- ========== DATA_PROCESSING_OVERVIEW =========== -->
<h3 id="data-processing-overview">Overview <img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/spacer.gif" width="25" height="1" border="0" /><a href="#data-processing"><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/buttons/toc_icon.png" width="15" height="12" border="0" alt="Data Processing, topic list" /></a><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/spacer.gif" width="5" height="1" border="0" /><a href="#top"><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/arrowup_blue.gif" width="12" height="12" border="0" alt="back to top" /></a></h3>
<p class="text-center">
<img src="/core/assets/pathogens/images/simple_pipeline_diagram.png" width="500" alt="Simplified diagram of the data processing pipeline" />
</p>
NCBI has developed a multi-stage pipeline with two goals: 1) clustering closely related pathogen isolates and 2) identifying antimicrobial resistance genes/proteins in pathogen genomes. The pipeline first assembles the <a href="/pathogens/pathogens_help#data-type-sequence-read">short read sequence data</a> for an isolate into a <a href="/pathogens/pathogens_help#data-type-genome">genome sequence</a>. This includes targeted assembly for certain genes of interest (such as <a href="/pathogens/pathogens_help#data-type-genotype">AMR genes</a>) for increased sensitivity. Second, the pipeline clusters the genomes from the assembly process along with the genomes found in GenBank for each organism (<a href="/pathogens/organisms/">see Organism Table for current list</a>). Third, phylogenetic trees are reconstructed after SNP calling within each cluster. The fourth step involves annotation and identification of AMR genes. Details on the full pipeline will be published at a later date. Note: there is a small pilot project pipeline that simply assembles and uses the wgMLST scheme to generate a table of nearest neighbors. That pipeline currently only runs for Listeria and Salmonella.<br />
<br />
More details about the data processing pipeline can be found in <a href="https://ftp.ncbi.nlm.nih.gov/pathogen/Methods.txt">Methods.txt</a> on FTP.
<br />
<!-- ========== DATA_PROCESSING_ASSEMBLY =========== -->
<h3 id="data-processing-assembly">Assembly pipeline <img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/spacer.gif" width="25" height="1" border="0" /><a href="#data-processing"><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/buttons/toc_icon.png" width="15" height="12" border="0" alt="Data Processing, topic list" /></a><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/spacer.gif" width="5" height="1" border="0" /><a href="#top"><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/arrowup_blue.gif" width="12" height="12" border="0" alt="back to top" /></a></h3>
<ul>
<li>The assembly pipeline uses <a href="https://www.ncbi.nlm.nih.gov/pubmed/30286803">SKESA</a> to generate <i>de novo</i> assemblies as well as the guided assembler <a href="https://pubmed.ncbi.nlm.nih.gov/34289805">SAUTE</a> to sensitively and comprehensively catalog antimicrobial resistance genes. The current pipeline only assembles Illumina data; assemblies from other sequencing technologies are included when uploaded to GenBank. Note that the <i>de novo</i> and guided assembler pipelines may both independently assemble the same region of the genome, so there will often be duplicated sequence in the final assembly.</li>
</ul>
For any BioProject that is flagged to monitor for incoming data, the assembly process automatically initiates as data are submitted. Not all BioProjects are flagged, and not all SRA data are automatically added to the system. Note that the assemblies generated by this process are submitted to GenBank when possible.<br />
<!-- ========== DATA_PROCESSING_CLUSTERING =========== -->
<h3 id="data-processing-clustering">Clustering <img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/spacer.gif" width="25" height="1" border="0" /><a href="#data-processing"><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/buttons/toc_icon.png" width="15" height="12" border="0" alt="Data Processing, topic list" /></a><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/spacer.gif" width="5" height="1" border="0" /><a href="#top"><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/arrowup_blue.gif" width="12" height="12" border="0" alt="back to top" /></a></h3>
There are also two different clustering pipelines in operation. Clustering automatically starts once a day for each organism only if new data are submitted.
<ol type="a">
<li>The first uses a reference wgMLST scheme (one for each organism if one exists), identifies the loci and alleles in each assembled genome, and uses a 25-allele cut-off to cluster related isolates. This system is gradually being rolled out. Most of the taxgroups with large numbers of isolates submitted are using the wgMLST method. A hard cut-off of 1000 isolates is in place before a reference wgMLST scheme is developed, therefore not all organisms will be switched to this system.</li>
<li>The second uses k-mer distances to first cluster related isolates, then a first pass SNP analysis. Clusters are created using 50-SNP single-linkage clustering. This system is gradually being replaced by the wgMLST method but will remain for those organisms that have fewer than 1000 isolates.</li>
</ol>
For BOTH pipelines, once clusters are created, within each cluster of closely related isolates, a reference assembly is chosen, assemblies are aligned, SNPs are called, and phylogenetic trees are inferred. For each organism group there will be isolates that do not end up in a cluster. For those that do end up in a cluster, the cluster sizes can be from size two to several thousand.<br />
<!-- ====== DATA_PROCESSING_PHYLOGENETIC_TREE_CONSTRUCTION ===== -->
<h3 id="data-processing-phylogenetic-tree-construction">Phylogenetic tree reconstruction <img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/spacer.gif" width="25" height="1" border="0" /><a href="#data-processing"><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/buttons/toc_icon.png" width="15" height="12" border="0" alt="Data Processing, topic list" /></a><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/spacer.gif" width="5" height="1" border="0" /><a href="#top"><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/arrowup_blue.gif" width="12" height="12" border="0" alt="back to top" /></a></h3>
For each cluster, a phylogenetic tree is reconstructed from the SNPs for that cluster by using the <a href="https://www.ncbi.nlm.nih.gov/pubmed/28231758">maximum compatibility criteria</a>.<br />
<!-- ========== DATA_PROCESSING_ANNOTATION =========== -->
<h3 id="data-processing-annotation">Annotation and antimicrobial gene/protein identification <img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/spacer.gif" width="25" height="1" border="0" /><a href="#data-processing"><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/buttons/toc_icon.png" width="15" height="12" border="0" alt="Data Processing, topic list" /></a><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/spacer.gif" width="5" height="1" border="0" /><a href="#top"><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/arrowup_blue.gif" width="12" height="12" border="0" alt="back to top" /></a></h3>
<div style="padding-left: 2em">
Annotation of assembled genomes uses the <a href="https://www.ncbi.nlm.nih.gov/genome/annotation_prok/">NCBI Prokaryotic Genome Annotation Pipeline (PGAP) system</a>. Antimicrobial resistance (AMR) genes are identified using <a href="/pathogens/antimicrobial-resistance/AMRFinder/">AMRFinderPlus</a> <i>(additional details are provided in an <a href="#amr-finder">overview about AMRFinderPlus</a> and a publication by <a href="https://www.ncbi.nlm.nih.gov/pubmed/31427293">Feldgarden M, et al., 2019</a>)</i>. Genes are grouped into <a href="#genotype-categories">genotype categories</a>, as described below.<br /><br />
Each assembled genome that passes validation criteria will end up in the <a href="/pathogens/isolates#/search/">NCBI Pathogen Detection Isolates Browser</a>. Each SNP cluster is also available, both on <a href="https://ftp.ncbi.nlm.nih.gov/pathogen/Results/">FTP</a> as well as in the <a href="/pathogens/isolates#/search/">NCBI Pathogen Detection Isolates Browser</a>. AMR results are available both on FTP and in the browser as a separate column. Rapid Reports are only available on FTP.<br /><br />
New isolates are analyzed using the latest version of the AMRFinderPlus software and the latest version of the <a href="/pathogens/isolates#/refgene/">Pathogen Detection Reference Gene Catalog</a> <i>(<a href="/pathogens/pathogens_help#reference-gene-catalog">read about the Reference Gene Catalog</a>)</i>. Older isolates may have been analyzed with earlier versions of the AMRFinderPlus software and the Reference Gene Catalog. There might be occasional updates to annotation on all isolates in special circumstances, such as the identification of new genes (e.g., mobilized colistin resistance (<i>mcr</i>) genes). <a href="/pathogens/pathogens_help#isolates-browser-data-fields">Data fields in the Isolates Browser</a> indicate the analysis type (<a href="/pathogens/pathogens_help#isolates-browser-data-field-amrfinderplus-analysis-type">amrfinderplus_analysis_type</a>), AMRFinderPlus version (<a href="/pathogens/pathogens_help#isolates-browser-data-field-amrfinderplus-version">amrfinderplus_version</a>), and Reference Gene Catalog version (<a href="/pathogens/pathogens_help#isolates-browser-data-field-refgene-db-version">refgene_db_version</a>) that were used in the analysis of a given isolate.<br /><br />
<i>(Separate sections of this file provide <a href="/pathogens/pathogens_help#isolates-browser">Isolates Browser help documentation</a> and an <a href="/pathogens/pathogens_help#ftp">overview of the data available on the FTP site</a>. The <a href="https://github.com/ncbi/amr/wiki">AMRFinderPlus wiki</a> provides details about installing and running the program, interpreting the results, and methods used for isolate genome analysis.)</i><br />
</div>
<!-- ========= GENOTYPE_CATEGORIES =========== -->
<div style="padding-left: 2em">
<h4 id="genotype-categories">Genotype Categories <img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/spacer.gif" width="25" height="1" border="0" /><a href="#data-processing"><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/buttons/toc_icon.png" width="15" height="12" border="0" alt="Data Processing, topic list" /></a><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/spacer.gif" width="5" height="1" border="0" /><a href="#top"><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/arrowup_blue.gif" width="12" height="12" border="0" alt="back to top" /></a></h4>
The genes identified in an isolate's genome by the NCBI Pathogen Detection data processing pipeline are grouped into <b>genotype categories</b>.<br /><br />
The stand-alone <b>AMRFinderPlus</b> software produces a <b>detailed categorization</b>, based on the method used to identify the genotypes. <i>(The <a href="https://github.com/ncbi/amr/wiki">AMRFinderPlus wiki</a> provides details about the methods, under "Running AMRFinderPlus &gt; <a href="https://github.com/ncbi/amr/wiki/Running-AMRFinderPlus#output-format">Output Format</a> &gt; Fields &gt; Method".)</i><br /><br />
The <b>Isolates Browser web interface</b> displays a <b>simplified categorization</b> of genotypes. <i>(The genotype categories appear when you use the <a href="/pathogens/pathogens_help#isolates-browser-display-options">choose columns</a> function to display data such as <a href="/pathogens/pathogens_help#isolates-browser-data-field-AMRGenotypes">AMR genotypes (AMR_genotypes)</a>, <a href="/pathogens/pathogens_help#isolates-browser-data-field-stress-genotypes">Stress genotypes (stress_genotypes)</a>, and/or <a href="/pathogens/pathogens_help#isolates-browser-data-field-virulence-genotypes">Virulence genotypes (virulence_genotypes)</a>.)</i><br /><br />
The table below shows the <b>correspondences</b> between the <b>AMRFinderPlus methods</b> used to identify genotypes and the <b>simplified genotype categories</b> displayed by the Isolates Browser web interface:<br /><br />
<!-- ====== GENOTYPE_CATEGORIES_TABLE_16_ROWS_3_COLUMNS ===== -->
<table width="100%">
<tr>
<td width="25%" valign="top" bgcolor="#A8C5E1"><b>AMRFinderPlus Method</b></td>
<td width="25%" valign="top" bgcolor="#A8C5E1"><b>Genotype Category<br />in the Isolates Browser<br />web display</b></td>
<td width="50%" valign="top" bgcolor="#A8C5E1"><b>Notes</b></td>
</tr>
<tr>
<td width="25%" valign="top" bgcolor="#FFFFCC">ALLELEP</td>
<td width="25%" valign="top" bgcolor="#FFFFCC" rowspan="6">COMPLETE</td>
<td width="50%" valign="top" bgcolor="#FFFFCC" rowspan="6">"Complete" genes are sequences that have BLAST alignments that cover &#8805; 90% of the reference protein in the <a href="/pathogens/isolates#/refgene/">Pathogen Detection Reference Gene Catalog</a> (sometimes referred to as the AMRFinderPlus database).<br /><br />
Specifically:
<ul>
<li>Those identified by the ALLELEP or ALLELEX method have a 100% sequence match to 100% of length to a protein named at the allele level in the Pathogen Detection Reference Gene Catalog.</li>
<li>Those identified by the EXACTP or EXACTX method have a 100% sequence match to 100% of length to a protein in the Pathogen Detection Reference Gene Catalog that is not a named allele.</li>
<li>Those identified by the BLASTP or BLASTX method have a BLAST alignment that covers &gt; 90% of the length, and a sequence identity of &gt; 90% (default cutoff), to a protein in the Pathogen Detection Reference Gene Catalog. For some genes, however, the sequence identity cutoff may be higher or lower, based on manual curation.</li>
</ul>
The suffix "P" refers to Protein BLAST (protein vs protein sequence comparisons), and the suffix "X" refers to Translated BLAST (nucleotide vs protein sequence comparisons).
<!-- from PD-3235: "Complete" genes are identified by blast to cover > 90% of a reference gene. -->
<!-- From AMRFinderPlus wiki page about output format/methods:
ALLELE - 100% sequence match over 100% of length to a protein named at the allele level in the AMRFinderPlus database.
EXACT - 100% sequence match over 100% of length to a protein in the database that is not a named allele.
BLAST - BLAST alignment is > 90% of length and > 90% identity to a protein in the AMRFinderPlus database. -->
</td>
</tr>
<tr>
<td width="25%" valign="top" bgcolor="#FFFFCC">ALLELEX</td>
<!-- td width="25%" valign="top" bgcolor="#FFFFCC">COMPLETE</td -->
</tr>
<tr>
<td width="25%" valign="top" bgcolor="#FFFFCC">BLASTP</td>
<!-- td width="25%" valign="top" bgcolor="#FFFFCC">COMPLETE</td -->
</tr>
<tr>
<td width="25%" valign="top" bgcolor="#FFFFCC">BLASTX</td>
<!-- td width="25%" valign="top" bgcolor="#FFFFCC">COMPLETE</td -->
</tr>
<tr>
<td width="25%" valign="top" bgcolor="#FFFFCC">EXACTP</td>
<!-- td width="25%" valign="top" bgcolor="#FFFFCC">COMPLETE</td -->
</tr>
<tr>
<td width="25%" valign="top" bgcolor="#FFFFCC">EXACTX</td>
<!-- td width="25%" valign="top" bgcolor="#FFFFCC">COMPLETE</td -->
</tr>
<tr>
<td width="25%" valign="top" bgcolor="#F0F8FF">HMM</td>
<td width="25%" valign="top" bgcolor="#F0F8FF">HMM</td>
<td width="50%" valign="top" bgcolor="#F0F8FF">These are proteins that were found by HMM only, more distant to reference proteins than our BLAST cutoffs. (The HMM was hit above the cutoff, but there was not a BLAST hit that met standards for BLAST or PARTIAL. This does not have a suffix of "P" or "X" because only protein sequences are searched by HMM.)
<!-- from PD-3235: These are proteins that were found by HMM only, more distant to reference proteins then our blast cutoffs -->
<!-- From AMRFinderPlus wiki page about output format/methods:
HMM was hit above the cutoff, but there was not a BLAST hit that met standards for BLAST or PARTIAL. This does not have a suffix because only protein sequences are searched by HMM. -->
</td>
</tr>
<tr>
<td width="25%" valign="top" bgcolor="#FFFFCC">INTERNAL_STOP</td>
<td width="25%" valign="top" bgcolor="#FFFFCC">MISTRANSLATION</td>
<td width="50%" valign="top" bgcolor="#FFFFCC"><!-- from PD-3235: -->Indicates a stop codon was found within the BLASTX alignment of the nucleotide sequence to the reference protein. In the future this may be extended to include frame shifts (which are currently not directly detected by AMRFinderPlus).
<!-- From AMRFinderPlus wiki page about output format/methods: Translated BLAST (BLASTX) reveals a stop codon that occurred before the end of the protein. This can only be assessed if the -n <nucleotide_fasta> option is used. --></td>
</tr>
<tr>
<td width="25%" valign="top" bgcolor="#F0F8FF">PARTIALP</td>
<td width="25%" valign="top" bgcolor="#F0F8FF" rowspan="2">PARTIAL</td>
<td width="50%" valign="top" bgcolor="#F0F8FF" rowspan="2">"Partial" genes are identified by BLAST to cover &gt; 50% but &lt; 90% of the length of the reference sequence, and the BLAST alignment does not end at a contig boundary. The aligned region has &gt; 90% identity to the reference protein (default cutoff). For some genes, however, the sequence identity cutoff may be higher or lower, based on manual curation.
<!-- from PD-3235: "Partial" genes are identified by BLAST to cover &#0060; 90%, but &#0062; 50% of a reference gene. -->
<!-- From AMRFinderPlus wiki page about output format/methods: BLAST alignment is > 50% of length, but < 90% of length and > 90% identity to the reference, and does not end at a contig boundary. --></td>
</tr>
<tr>
<td width="25%" valign="top" bgcolor="#F0F8FF">PARTIALX</td>
<!-- td width="25%" valign="top" bgcolor="#F0F8FF">PARTIAL</td -->
</tr>
<tr>
<td width="25%" valign="top" bgcolor="#FFFFCC">PARTIAL_CONTIG_ENDP</td>
<td width="25%" valign="top" bgcolor="#FFFFCC" rowspan="3">PARTIAL_END_OF_CONTIG</td>
<td width="50%" valign="top" bgcolor="#FFFFCC" rowspan="3">"Partial end of contig" genes are "partial" alignments that end at contig boundaries, indicating that they are more likely to have been split by a sequencing or assembly issue. Like "partial" genes, these are identified by BLAST to cover &gt; 50% but &lt; 90% of the length of the reference sequence. The aligned region has &gt; 90% sequence identity to the reference (default cutoff). For some genes, however, the sequence identity cutoff may be higher or lower, based on manual curation.
<!-- From AMRFinderPlus wiki page about output format/methods:
BLAST alignment is > 50% of length, but < 90% of length and > 90% identity to the reference, and the break occurs at a contig boundary, indicating that this gene is more likely to have been split by an assembly issue. -->
</td>
</tr>
<tr>
<td width="25%" valign="top" bgcolor="#FFFFCC">PARTIAL_CONTIG_ENDX</td>
<!-- td width="25%" valign="top" bgcolor="#FFFFCC">PARTIAL_END_OF_CONTIG</td -->
</tr>
<tr>
<td width="25%" valign="top" bgcolor="#FFFFCC">PARTIAL_CONTIG_END</td>
<!-- td width="25%" valign="top" bgcolor="#FFFFCC">PARTIAL_END_OF_CONTIG</td -->
</tr>
<tr>
<td width="25%" valign="top" bgcolor="#F0F8FF">POINTN</td>
<td width="25%" valign="top" bgcolor="#F0F8FF" rowspan="3">POINT</td>
<td width="50%" valign="top" bgcolor="#F0F8FF" rowspan="3">Point mutation identified by BLAST:
<ul>
<li>POINTN mutations were identified by nucleotide BLAST (BLASTN)</li>
<li>POINTP mutations were identified by protein BLAST (BLASTP)</li>
<li>POINTX mutations were identified by translated BLAST (BLASTX)</li>
</ul>
</td>
</tr>
<tr>
<td width="25%" valign="top" bgcolor="#F0F8FF">POINTP</td>
<!-- td width="25%" valign="top" bgcolor="#F0F8FF">POINT</td -->
</tr>
<tr>
<td width="25%" valign="top" bgcolor="#F0F8FF">POINTX</td>
<!-- td width="25%" valign="top" bgcolor="#F0F8FF">POINT</td -->
</tr>
<!-- tr>
<td width="25%" valign="top">______________</td>
<td width="25%" valign="top">______________</td>
<td width="50%" valign="top">______________</td>
</tr -->
</table>
<!-- ====== END_GENOTYPE_CATEGORIES_TABLE_16_ROWS_3_COLUMNS ===== -->
</div>
<br />
<!-- ========= END_GENOTYPE_CATEGORIES ========= -->
<!-- ========== DATA_PROCESSING_QC =========== -->
<h3 id="data-processing-qc">Quality control (QC) <img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/spacer.gif" width="25" height="1" border="0" /><a href="#data-processing"><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/buttons/toc_icon.png" width="15" height="12" border="0" alt="Data Processing, topic list" /></a><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/spacer.gif" width="5" height="1" border="0" /><a href="#top"><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/arrowup_blue.gif" width="12" height="12" border="0" alt="back to top" /></a></h3>
<div style="padding-left: 2em">
The Pathogen Detection pipeline applies quality control tests with robust validation rules applied at every stage.
Instances where pipeline validation fails (called "exceptions") are communicated through the <a href="/pathogens/pathogens_help/#isolates-browser-exceptions-table">Exceptions report</a> which appears both in the Isolates Browser and on FTP.
<br />
<h4 id="qc-validation-types">QC validation types<img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/spacer.gif" width="25" height="1" border="0" /><a href="#data-processing"><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/buttons/toc_icon.png" width="15" height="12" border="0" alt="Data Processing, topic list" /></a><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/spacer.gif" width="5" height="1" border="0" /><a href="#top"><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/arrowup_blue.gif" width="12" height="12" border="0" alt="back to top" /></a></h4>
Quality control is applied with the following validation types:
<br />
<ul>
<li><b>Duplication check</b> - Each run deposited in the SRA is assigned a checksum that uniquely identifies it based on its content.
The goal is to identify and avoid unintended submission of the same isolate data from the same submitter at different times, or from different submitters.
When a "new" run is recognized by Pathogen Detection for processing, it is first tested against the existing database of run checksums. If there is a match, the "new" run is not processed.
</li>
<li><b>GenBank validity check</b> - NCBI GenBank continually checks assemblies for adherence to GenBank quality criteria.
When an assembly deposited in GenBank that could be used by Pathogen Detection is determined to fail minimum quality checks, the assembly is marked "anomalous" and removed from consideration by Pathogen Detection. See <a href="https://www.ncbi.nlm.nih.gov/datasets/docs/v2/policies-annotation/genome-processing/genome_notes/">Genome Notes</a> for more details.
</li>
<li><b>Readset validation</b> - On intake and prior to assembly, SRA runs are checked at their individual read level for sizing criteria and consistency.
For example, reads are tested for minimal length and minimal coverage (submitter-identified-species dependent), and the submitted <a href="https://www.ncbi.nlm.nih.gov/sra/docs/sra-cloud-based-metadata-table/">LibraryLayout</a> is checked against actual mate pairing in the data.
These checks prevent bad runs from being used by downstream sections of the pipeline including assembly and SNP clustering.
Note that identification and contamination are <b>not</b> tested at this stage because no assembly yet exists.
</li>
<li><b>Assembly validation</b> - If readset validation is successful, the isolate's data is assembled into a genome assembly, which is then validated according to a number of tests.
Thresholds applied in the tests are specific to the species that the submitter has identified for the isolate.
These tests determine whether the assembly can be included in SNP clustering, reported for AMR, and submitted to GenBank (subject to further validation).
</li>
<li><b>Foreign contamination check</b> - Assemblies are checked for "foreign contamination" using a standard GenBank assay.
This assay tests for technical adapters in sequencing data, eukaryote organism contamination, viral contamination (including SARS-CoV-2), and phage contamination (including Phi-X). See <a href="https://github.com/ncbi/fcs/">FCS pipeline</a> for more details.
Assemblies that are found to have contamination are reported in the Exceptions channel. They are still included in SNP clustering and AMR, but are not submitted to GenBank.
</li>
<li><b>wgMLST validation</b> - For those organism groups that use wgMLST for cluster formation, an additional test checks whether the minimum number of loci has been found for the organism group.
This test prevents assemblies that are likely mis-identified from being used in SNP clustering, AMR reporting, or GenBank submission.
</li>
<li><b>kmer validation</b> - For those organism groups that use kmer distance for cluster formation, a "triangle inequality" test of kmer distance is applied to each subset of three isolate assemblies in a cluster.
This test prevents assemblies that are likely contaminated or mis-identified from being used in SNP clustering.
</li>
<li><b>ANI species check</b> - An <a href="https://www.ncbi.nlm.nih.gov/datasets/docs/v2/policies-annotation/quality/ani/">average nucleotide identity (ANI)</a> test is applied to each genome assembly to determine whether the assembly is consistent with type assemblies for the submitter-identified
species. If this result doesn't match then it is likely that the isolate is mis-identified. This test has greater resolution than the <b>wgMLST validation</b> test but it does not
prevent the assembly from being included in SNP clustering and AMR. It does prevent GenBank submission.
</li>
<li><b>GenBank QC check</b> - For those assemblies passing all other validation, further tests determine whether the assembly is suitable for submission to GenBank, either on behalf of the
primary data submitter (by prior agreement), or as a "third party annotation" (TPA).
The <a href="https://www.ncbi.nlm.nih.gov/refseq/annotation_prok/">PGAP annotation</a> of the assembly is validated using GenBank criteria for assembly sizing, annotation consistency, and presence of strain or isolate identifier. See <a href="https://www.ncbi.nlm.nih.gov/datasets/docs/v2/policies-annotation/genome-processing/genome_notes/">Genome Notes</a> for more details.
This test outcome does not affect SNP clustering or AMR calling.
</li>
</ul>
</div>
<br />
<h4 id="qc-exceptions-report">QC Exceptions Report<img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/spacer.gif" width="25" height="1" border="0" /><a href="#data-processing"><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/buttons/toc_icon.png" width="15" height="12" border="0" alt="Data Processing, topic list" /></a><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/spacer.gif" width="5" height="1" border="0" /><a href="#top"><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/arrowup_blue.gif" width="12" height="12" border="0" alt="back to top" /></a></h4>
<br />
QC validation exceptions are reported in the <a href="/pathogens/pathogens_help/#isolates-browser-exceptions-table">Exceptions report</a>, which is available both on FTP and in the Isolates Browser.
<br /><br />
On FTP, a file is produced that presents those isolates which fail validation and the reasons for the failure.
Submitters can find out why their isolate didn't get published from this file. The file has the following format:
<ul>
<li><b>exception type</b> -
<ul>
<li><b>ANI species check</b> - The biosample's species is checked against a database of type strains using average nucleotide identity (ANI) on the assembled sequence.
</li>
<li><b>Readset validation failure</b> - The SRA run was not valid and could not be used.
</li>
<li><b>Assembly validation failure</b> - The pathogen assembly was not valid and could not be used.
</li>
<li><b>wgMLST validation failure</b> - The assembly (pathogen or GenBank) could not be used for wgMLST analysis.
</li>
<li><b>Bad triples</b> - The isolate failed the triangle inequality test in the legacy kmer clustering step.
</li>
</ul>
</li>
<li><b>exception</b> - Short message indicating the reason for failing validation.
</li>
<li><b>consequence</b> -
<ul>
<li><b>Not published</b> - The isolate will not appear in any published organism group (PDG).
</li>
<li><b>Not clustered</b> - The isolate will appear in a published organism group (PDG) but will be presented as a singleton (i.e., no clustering attempted).
</li>
<li><b>Not submitted</b> - The isolate will appear in a published organism group (PDG) and will be clustered, but its assembled sequence will not be submitted to GenBank.
</li>
</ul>
</li>
<li><b>lower limit</b> - Lower limit of the valid range (as relevant).
</li>
<li><b>upper limit</b> - Upper limit of the valid range (as relevant). In some contexts this is the submitted value of the field.
</li>
<li><b>actual value</b> - Actual value recorded by the system. In some contexts this is the actual result of an assay.
</li>
<li><b><a href="/pathogens/pathogens_help/#isolates-browser-data-field-BioSample">biosample_acc</a></b> - INSDC accession of the isolate's biosample record.
</li>
<li><b><a href="https://www.ncbi.nlm.nih.gov/pathogens/pathogens_help/#isolates-browser-data-field-Run">run(s)</a></b> - INSDC accession(s) of the isolate's SRA run record. If there is more than one run for the isolate, only the "representative" run is reported (the run that is best among earliest candidates).
</li>
<li><b><a href="/pathogens/pathogens_help/#isolates-browser-data-field-Isolate">pathogen target</a></b> - Pathogen target accession (PDT) for this isolate.
</li>
<li><b><a href="/pathogens/pathogens_help/#isolates-browser-data-field-Assembly">Assembly</a></b> - GenBank assembly accession.
</li>
<li><b><a href="/pathogens/pathogens_help/#isolates-browser-data-field-ScientificName">organism</a></b> - NCBI taxonomy (scientific_name) of the isolate.
</li>
<li><b><a href="/pathogens/pathogens_help/#isolates-browser-data-field-Strain">strain</a></b> - Submitter provided strain name for isolate.
</li>
<li><b><a href="/pathogens/pathogens_help/#isolates-browser-data-field-SRACenter">sra center</a></b> - SRA submitter lab name.
</li>
</ul>
<br />
In the Isolates Browser, exceptions are reported using the same fields as in FTP, but only for those isolates specifically queried.
For a query of an entire organism group, exceptions are not returned for every member of the group in the result (because there are many exceptions).
The report gives one row per exception found for an isolate as far as it got in the pipeline. An isolate can have multiple exceptions.
<h4 id="qc-validation-criteria">Validation criteria and thresholds<img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/spacer.gif" width="25" height="1" border="0" /><a href="#data-processing"><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/buttons/toc_icon.png" width="15" height="12" border="0" alt="Data Processing, topic list" /></a><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/spacer.gif" width="5" height="1" border="0" /><a href="#top"><img src="https://www.ncbi.nlm.nih.gov/Structure/IMG/arrowup_blue.gif" width="12" height="12" border="0" alt="back to top" /></a></h4>
<br />
The following table shows validation criteria and thresholds for each validation type supported by Pathogen Detection.
<br />
<!-- ====== QC Validation Criteria Table 5 columns ===== -->
<div style="padding-left: 2em">
<table width="100%">
<tr>
<td width="5%" valign="top" bgcolor="#A8C5E1"><b>QC stage</b></td>
<td width="15%" valign="top" bgcolor="#A8C5E1"><b>exception type</b></td>
<td width="40%" valign="top" bgcolor="#A8C5E1"><b>exception</b></td>
<td width="15%" valign="top" bgcolor="#A8C5E1"><b>consequence</b></td>
<td width="25%" valign="top" bgcolor="#A8C5E1"><b>criteria or thresholds</b></td>
</tr>
<tr>
<td valign="top" bgcolor="#F0F8FF">Duplication pre-check</td>
<td valign="top" bgcolor="#F0F8FF">not reported</td>
<td valign="top" bgcolor="#F0F8FF">not reported</td>
<td valign="top" bgcolor="#F0F8FF">prevents use in Pathogen Detection</td>
<td valign="top" bgcolor="#F0F8FF">New run checksum must not match that of one already tracked in Pathogen Detection.</td>
</tr>
<tr>
<td valign="top" bgcolor="#F0F8FF">GenBank validity pre-check</td>
<td valign="top" bgcolor="#F0F8FF">not reported</td>
<td valign="top" bgcolor="#F0F8FF">not reported</td>
<td valign="top" bgcolor="#F0F8FF">prevents use in Pathogen Detection</td>
<td valign="top" bgcolor="#F0F8FF">GenBank assembly must not be marked "anomalous". <a href="https://www.ncbi.nlm.nih.gov/datasets/docs/v2/policies-annotation/genome-processing/genome_notes">Documentation</a></td>
</tr>
<tr>
<td valign="top" bgcolor="#F0F8FF">Readset validation</td>
<td valign="top" bgcolor="#F0F8FF">Readset validation failure</td>
<td valign="top" bgcolor="#F0F8FF">Base imbalance; A/T to C/G ratio too small</td>
<td valign="top" bgcolor="#F0F8FF">prevents use in Pathogen Detection</td>
<td valign="top" bgcolor="#F0F8FF">Ratio of AT to GC counts within range <tt>[0.7, 1.43)</tt></td>
</tr>
<tr>
<td valign="top" bgcolor="#F0F8FF">Readset validation</td>
<td valign="top" bgcolor="#F0F8FF">Readset validation failure</td>
<td valign="top" bgcolor="#F0F8FF">Insufficient coverage</td>
<td valign="top" bgcolor="#F0F8FF">prevents use in Pathogen Detection</td>
<td valign="top" bgcolor="#F0F8FF">Ratio of run bases to expected genome size must be 20X or greater.</td>
</tr>
<tr>
<td valign="top" bgcolor="#F0F8FF">Readset validation</td>
<td valign="top" bgcolor="#F0F8FF">Readset validation failure</td>
<td valign="top" bgcolor="#F0F8FF">Insufficient or inconsistent metadata</td>
<td valign="top" bgcolor="#F0F8FF">prevents use in Pathogen Detection</td>
<td valign="top" bgcolor="#F0F8FF">Submitted <a href="https://www.ncbi.nlm.nih.gov/sra/docs/sra-cloud-based-metadata-table/">LibraryLayout</a> and actual SRA content must be consistent.</td>
</tr>
<tr>
<td valign="top" bgcolor="#F0F8FF">Readset validation</td>
<td valign="top" bgcolor="#F0F8FF">Readset validation failure</td>
<td valign="top" bgcolor="#F0F8FF">Read length too high</td>
<td valign="top" bgcolor="#F0F8FF">prevents use in Pathogen Detection</td>
<td valign="top" bgcolor="#F0F8FF">Read length must be less than 1024 bp</td>
</tr>
<tr>
<td valign="top" bgcolor="#F0F8FF">Readset validation</td>
<td valign="top" bgcolor="#F0F8FF">Readset validation failure</td>
<td valign="top" bgcolor="#F0F8FF">Read length too low</td>
<td valign="top" bgcolor="#F0F8FF">prevents use in Pathogen Detection</td>
<td valign="top" bgcolor="#F0F8FF">Read length must be greater than 41 bp</td>
</tr>
<tr>
<td valign="top" bgcolor="#F0F8FF">Readset validation</td>
<td valign="top" bgcolor="#F0F8FF">Readset validation failure</td>
<td valign="top" bgcolor="#F0F8FF">Run platforms don't allow selection of de-novo assemblers</td>
<td valign="top" bgcolor="#F0F8FF">prevents use in Pathogen Detection</td>
<td valign="top" bgcolor="#F0F8FF">Sequencing platform must be ILLUMINA.</td>
</tr>
<tr>
<td valign="top" bgcolor="#F0F8FF">Readset validation</td>
<td valign="top" bgcolor="#F0F8FF">Readset validation failure</td>
<td valign="top" bgcolor="#F0F8FF">SRA Run metadata library layout issue</td>
<td valign="top" bgcolor="#F0F8FF">prevents use in Pathogen Detection</td>
<td valign="top" bgcolor="#F0F8FF">Submitted <tt>LibraryLayout</tt> (i.e., PAIRED vs SINGLE) and actual SRA content must be consistent.</td>
</tr>
<tr>
<td valign="top" bgcolor="#F0F8FF">Readset validation</td>
<td valign="top" bgcolor="#F0F8FF">Readset validation failure</td>
<td valign="top" bgcolor="#F0F8FF">Serotype must be submitted in the serovar field (for <i>Salmonella</i> biosamples only)</td>
<td valign="top" bgcolor="#F0F8FF">prevents use in Pathogen Detection</td>
<td valign="top" bgcolor="#F0F8FF">BioSample <tt>serovar</tt> field must be used instead of <tt>serotype</tt> for <i>Salmonella</i> isolates (only).</td>
</tr>
<tr>
<td valign="top" bgcolor="#F0F8FF">Readset validation</td>
<td valign="top" bgcolor="#F0F8FF">Readset validation failure</td>
<td valign="top" bgcolor="#F0F8FF">invalid biosample record</td>
<td valign="top" bgcolor="#F0F8FF">prevents use in Pathogen Detection</td>
<td valign="top" bgcolor="#F0F8FF">BioSample record must be valid according to NCBI BioSample.</td>
</tr>
<tr>
<td valign="top" bgcolor="#F0F8FF">Assembly validation</td>
<td valign="top" bgcolor="#F0F8FF">Assembly validation failure</td>
<td valign="top" bgcolor="#F0F8FF">Genome length too large (<i>species</i>)</td>
<td valign="top" bgcolor="#F0F8FF">prevents use in Pathogen Detection</td>
<td valign="top" bgcolor="#F0F8FF">Assembled size of reads must not exceed upper limit for species. See <a href="https://www.ncbi.nlm.nih.gov/genbank/genome-size-check">Genome Size Check</a></td>
</tr>
<tr>
<td valign="top" bgcolor="#F0F8FF">Assembly validation</td>
<td valign="top" bgcolor="#F0F8FF">Assembly validation failure</td>
<td valign="top" bgcolor="#F0F8FF">Genome length too small (<i>species</i>)</td>
<td valign="top" bgcolor="#F0F8FF">prevents use in Pathogen Detection</td>
<td valign="top" bgcolor="#F0F8FF">Assembled size of reads must not fall below lower limit for species. See <a href="https://www.ncbi.nlm.nih.gov/genbank/genome-size-check">Genome Size Check</a></td>
</tr>
<tr>
<td valign="top" bgcolor="#F0F8FF">Assembly validation</td>
<td valign="top" bgcolor="#F0F8FF">Assembly validation failure</td>
<td valign="top" bgcolor="#F0F8FF">Insufficient number of loci</td>
<td valign="top" bgcolor="#F0F8FF">prevents use in Pathogen Detection</td>
<td valign="top" bgcolor="#F0F8FF">wgMLST loci found must exceed the minimum established for the species. The exception report indicates the threshold value for the species.</td>
</tr>
<tr>
<td valign="top" bgcolor="#F0F8FF">Assembly validation</td>
<td valign="top" bgcolor="#F0F8FF">Assembly validation failure</td>
<td valign="top" bgcolor="#F0F8FF">Low contig N50</td>
<td valign="top" bgcolor="#F0F8FF">prevents use in Pathogen Detection</td>
<td valign="top" bgcolor="#F0F8FF">Assembly contig N50 bases must be at least 10000 bp.</td>
</tr>
<tr>
<td valign="top" bgcolor="#F0F8FF">Assembly validation</td>
<td valign="top" bgcolor="#F0F8FF">Assembly validation failure</td>
<td valign="top" bgcolor="#F0F8FF">No assembly produced</td>
<td valign="top" bgcolor="#F0F8FF">prevents use in Pathogen Detection</td>
<td valign="top" bgcolor="#F0F8FF">Reads must be assemblable.</td>
</tr>
<tr>
<td valign="top" bgcolor="#F0F8FF">Assembly validation</td>
<td valign="top" bgcolor="#F0F8FF">Assembly validation failure</td>
<td valign="top" bgcolor="#F0F8FF">Too many assembly contigs</td>
<td valign="top" bgcolor="#F0F8FF">prevents use in Pathogen Detection</td>
<td valign="top" bgcolor="#F0F8FF">Assembly number of contigs must not exceed 500 (except <i>Escherichia, Shigella, Candidozyma spp.</i> which have a larger maximum).</td>
</tr>
<tr>
<td valign="top" bgcolor="#F0F8FF">Foreign contamination</td>
<td valign="top" bgcolor="#F0F8FF">Contamination check</td>
<td valign="top" bgcolor="#F0F8FF">contaminated genome assembly</td>
<td valign="top" bgcolor="#F0F8FF">prevents submission to GenBank</td>
<td valign="top" bgcolor="#F0F8FF">Assembly must not exhibit significant contamination. <a href="https://github.com/ncbi/fcs/">Documentation.</a></td>
</tr>
<tr>
<td valign="top" bgcolor="#F0F8FF">kmer validation</td>
<td valign="top" bgcolor="#F0F8FF">Bad triples ERD</td>
<td valign="top" bgcolor="#F0F8FF">Number of SNPs in comparison to two other assemblies is indicative of mixed samples</td>
<td valign="top" bgcolor="#F0F8FF">prevents SNP clustering</td>
<td valign="top" bgcolor="#F0F8FF">Isolate kmer distances must pass triangle inequality test for any three candidate members of a cluster.</td>
</tr>
<tr>
<td valign="top" bgcolor="#F0F8FF">wgMLST validation</td>
<td valign="top" bgcolor="#F0F8FF">wgMLST validation failure</td>
<td valign="top" bgcolor="#F0F8FF">Too few wgMLST loci found</td>
<td valign="top" bgcolor="#F0F8FF">prevents SNP clustering</td>
<td valign="top" bgcolor="#F0F8FF">wgMLST loci found must exceed the minimum established for the species. The exception report indicates the threshold value for the species.</td>
</tr>
<tr>
<td valign="top" bgcolor="#F0F8FF">ANI species check</td>
<td valign="top" bgcolor="#F0F8FF">ANI species check</td>
<td valign="top" bgcolor="#F0F8FF">contaminated</td>
<td valign="top" bgcolor="#F0F8FF">prevents submission to GenBank</td>
<td valign="top" bgcolor="#F0F8FF">Assembly must not exhibit high-confidence contamination. <a href="https://www.ncbi.nlm.nih.gov/datasets/docs/v2/policies-annotation/quality/ani/">Documentation.</a></td>
</tr>
<tr>
<td valign="top" bgcolor="#F0F8FF">ANI species check</td>
<td valign="top" bgcolor="#F0F8FF">ANI species check</td>
<td valign="top" bgcolor="#F0F8FF">species misidentified</td>
<td valign="top" bgcolor="#F0F8FF">prevents submission to GenBank</td>
<td valign="top" bgcolor="#F0F8FF">Assembly must not exhibit high-confidence mis-identification. <a href="https://www.ncbi.nlm.nih.gov/datasets/docs/v2/policies-annotation/quality/ani/">Documentation.</a></td>
</tr>
<tr>
<td valign="top" bgcolor="#F0F8FF">GenBank submission</td>
<td valign="top" bgcolor="#F0F8FF">GenBank QC check</td>
<td valign="top" bgcolor="#F0F8FF">assembly missing both strain and isolate information</td>
<td valign="top" bgcolor="#F0F8FF">prevents submission to GenBank</td>
<td valign="top" bgcolor="#F0F8FF">BioSample must have a value for <tt>strain</tt> or <tt>isolate</tt> attributes. <a href="https://ncbi.nlm.nih.gov/biosample/docs/submission/validation-errors/">Documentation.</a></td>
</tr>
<tr>
<td valign="top" bgcolor="#F0F8FF">GenBank submission</td>
<td valign="top" bgcolor="#F0F8FF">GenBank QC check</td>
<td valign="top" bgcolor="#F0F8FF">atypical genome annotation</td>
<td valign="top" bgcolor="#F0F8FF">prevents submission to GenBank</td>
<td valign="top" bgcolor="#F0F8FF">PGAP annotation must pass validation. <a href="https://www.ncbi.nlm.nih.gov/datasets/docs/v2/policies-annotation/genome-processing/genome_notes">Documentation.</a> </td>
</tr>
<tr>
<td valign="top" bgcolor="#F0F8FF">GenBank submission</td>
<td valign="top" bgcolor="#F0F8FF">GenBank QC check</td>
<td valign="top" bgcolor="#F0F8FF">atypical genome assembly</td>
<td valign="top" bgcolor="#F0F8FF">prevents submission to GenBank</td>
<td valign="top" bgcolor="#F0F8FF">Assembly must conform to GenBank sizing thresholds. See <a href="https://www.ncbi.nlm.nih.gov/datasets/docs/v2/policies-annotation/genome-processing/genome_notes">Documentation.</a> </td>
</tr>
</table>
</div>
</div>
<!-- ======== END_DATA_PROCESSING ========= -->
</body>
<!-- ============ END_BODY ============= -->
</html></div>
</section>
</main>
<!-- ========== BEGIN FOOTER ========== -->
<footer>
<section class="icon-section">
<div id="icon-section-header" class="icon-section_header">Follow NCBI</div>
<div class="grid-container container">
<div class="icon-section_container">
<a class="footer-icon" id="footer_twitter" href="https://twitter.com/ncbi" aria-label="Twitter">
<svg width="40" height="40" viewBox="0 0 40 37" fill="none" xmlns="http://www.w3.org/2000/svg">
<title>Twitter</title>
<g id="twitterx1008">
<path id="path1008"
d="M6.06736 7L16.8778 20.8991L6.00001 32.2H10.2L18.6 23.1L25.668 32.2H34L22.8 17.5L31.9 7H28.4L20.7 15.4L14.401 7H6.06898H6.06736ZM9.66753 8.73423H12.9327L29.7327 30.4658H26.5697L9.66753 8.73423Z"
fill="#5B616B"/>
</g>
</svg>
</a>
<a class="footer-icon" id="footer_facebook" href="https://www.facebook.com/ncbi.nlm" aria-label="Facebook"><svg
data-name="Layer 1" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 300 300">
<title>Facebook</title>
<path class="cls-11"
d="M210.5,115.12H171.74V97.82c0-8.14,5.39-10,9.19-10h27.14V52l-39.32-.12c-35.66,0-42.42,26.68-42.42,43.77v19.48H99.09v36.32h27.24v109h45.41v-109h35Z">
</path>
</svg></a>
<a class="footer-icon" id="footer_linkedin"
href="https://www.linkedin.com/company/ncbinlm"
aria-label="LinkedIn"><svg data-name="Layer 1" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 300 300">
<title>LinkedIn</title>
<path class="cls-11"
d="M101.64,243.37H57.79v-114h43.85Zm-22-131.54h-.26c-13.25,0-21.82-10.36-21.82-21.76,0-11.65,8.84-21.15,22.33-21.15S101.7,78.72,102,90.38C102,101.77,93.4,111.83,79.63,111.83Zm100.93,52.61A17.54,17.54,0,0,0,163,182v61.39H119.18s.51-105.23,0-114H163v13a54.33,54.33,0,0,1,34.54-12.66c26,0,44.39,18.8,44.39,55.29v58.35H198.1V182A17.54,17.54,0,0,0,180.56,164.44Z">
</path>
</svg></a>
<a class="footer-icon" id="footer_github" href="https://github.com/ncbi" aria-label="GitHub"><svg
data-name="Layer 1" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 300 300">
<defs>
<style>
.cls-11,
.cls-12 {
fill: #737373;
}
.cls-11 {
fill-rule: evenodd;
}
</style>
</defs>
<title>GitHub</title>
<path class="cls-11"
d="M151.36,47.28a105.76,105.76,0,0,0-33.43,206.1c5.28,1,7.22-2.3,7.22-5.09,0-2.52-.09-10.85-.14-19.69-29.42,6.4-35.63-12.48-35.63-12.48-4.81-12.22-11.74-15.47-11.74-15.47-9.59-6.56.73-6.43.73-6.43,10.61.75,16.21,10.9,16.21,10.9,9.43,16.17,24.73,11.49,30.77,8.79,1-6.83,3.69-11.5,6.71-14.14C108.57,197.1,83.88,188,83.88,147.51a40.92,40.92,0,0,1,10.9-28.39c-1.1-2.66-4.72-13.42,1-28,0,0,8.88-2.84,29.09,10.84a100.26,100.26,0,0,1,53,0C198,88.3,206.9,91.14,206.9,91.14c5.76,14.56,2.14,25.32,1,28a40.87,40.87,0,0,1,10.89,28.39c0,40.62-24.74,49.56-48.29,52.18,3.79,3.28,7.17,9.71,7.17,19.58,0,14.15-.12,25.54-.12,29,0,2.82,1.9,6.11,7.26,5.07A105.76,105.76,0,0,0,151.36,47.28Z">
</path>
<path class="cls-12"
d="M85.66,199.12c-.23.52-1.06.68-1.81.32s-1.2-1.06-.95-1.59,1.06-.69,1.82-.33,1.21,1.07.94,1.6Zm-1.3-1">
</path>
<path class="cls-12"
d="M90,203.89c-.51.47-1.49.25-2.16-.49a1.61,1.61,0,0,1-.31-2.19c.52-.47,1.47-.25,2.17.49s.82,1.72.3,2.19Zm-1-1.08">
</path>
<path class="cls-12"
d="M94.12,210c-.65.46-1.71,0-2.37-.91s-.64-2.07,0-2.52,1.7,0,2.36.89.65,2.08,0,2.54Zm0,0"></path>
<path class="cls-12"
d="M99.83,215.87c-.58.64-1.82.47-2.72-.41s-1.18-2.06-.6-2.7,1.83-.46,2.74.41,1.2,2.07.58,2.7Zm0,0">
</path>
<path class="cls-12"
d="M107.71,219.29c-.26.82-1.45,1.2-2.64.85s-2-1.34-1.74-2.17,1.44-1.23,2.65-.85,2,1.32,1.73,2.17Zm0,0">
</path>
<path class="cls-12"
d="M116.36,219.92c0,.87-1,1.59-2.24,1.61s-2.29-.68-2.3-1.54,1-1.59,2.26-1.61,2.28.67,2.28,1.54Zm0,0">
</path>
<path class="cls-12"
d="M124.42,218.55c.15.85-.73,1.72-2,1.95s-2.37-.3-2.52-1.14.73-1.75,2-2,2.37.29,2.53,1.16Zm0,0"></path>
</svg></a>
<a class="footer-icon" id="footer_blog" href="https://ncbiinsights.ncbi.nlm.nih.gov/" aria-label="Blog">
<svg id="Layer_1" data-name="Layer 1" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 40 40"><defs><style>.cls-1{fill:#737373;}</style></defs><path class="cls-1" d="M14,30a4,4,0,1,1-4-4,4,4,0,0,1,4,4Zm11,3A19,19,0,0,0,7.05,15a1,1,0,0,0-1,1v3a1,1,0,0,0,.93,1A14,14,0,0,1,20,33.07,1,1,0,0,0,21,34h3a1,1,0,0,0,1-1Zm9,0A28,28,0,0,0,7,6,1,1,0,0,0,6,7v3a1,1,0,0,0,1,1A23,23,0,0,1,29,33a1,1,0,0,0,1,1h3A1,1,0,0,0,34,33Z"/></svg>
</a>
</div>
</div>
</section>
<section class="container-fluid bg-primary">
<div class="container pt-5">
<div class="row mt-3">
<div class="col-lg-3 col-12">
<p><a class="text-white" href="https://www.nlm.nih.gov/socialmedia/index.html">Connect with NLM</a></p>
<ul class="list-inline social_media">
<li class="list-inline-item"><a href="https://twitter.com/NLM_NIH" aria-label="Twitter"
target="_blank" rel="noopener noreferrer">
<svg width="35" height="35" viewBox="0 0 38 35" fill="none" xmlns="http://www.w3.org/2000/svg">
<title>Twitter</title>
<g id="twitterx1009" clip-path="url(#clip0_65276_3946)">
<path id="Vector" d="M17.5006 34.6565C26.9761 34.6565 34.6575 26.9751 34.6575 17.4996C34.6575 8.02416 26.9761 0.342773 17.5006 0.342773C8.02514 0.342773 0.34375 8.02416 0.34375 17.4996C0.34375 26.9751 8.02514 34.6565 17.5006 34.6565Z" fill="#205493" stroke="white" stroke-width="1.2" stroke-miterlimit="10"></path>
<path id="path1009" d="M8.54811 8.5L16.2698 18.4279L8.50001 26.5H11.5L17.5 20L22.5486 26.5H28.5L20.5 16L27 8.5H24.5L19 14.5L14.5007 8.5H8.54927H8.54811ZM11.1197 9.73873H13.4519L25.4519 25.2613H23.1926L11.1197 9.73873Z" fill="white"></path>
</g>
<defs>
<clipPath id="clip0_65276_3946">
<rect width="38" height="38" fill="white"></rect>
</clipPath>
</defs>
</svg></a>
</li>
<li class="list-inline-item"><a href="https://www.facebook.com/nationallibraryofmedicine"
aria-label="Facebook" rel="noopener noreferrer" target="_blank">
<svg version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px"
y="0px" viewBox="0 0 249 249" style="enable-background:new 0 0 249 249;" xml:space="preserve">
<style type="text/css">
.st10 {
fill: #FFFFFF;
}
.st110 {
fill: none;
stroke: #FFFFFF;
stroke-width: 8;
stroke-miterlimit: 10;
}
</style>
<title>SM-Facebook</title>
<g>
<g>
<path class="st10" d="M159,99.1h-24V88.4c0-5,3.3-6.2,5.7-6.2h16.8V60l-24.4-0.1c-22.1,0-26.2,16.5-26.2,27.1v12.1H90v22.5h16.9
v67.5H135v-67.5h21.7L159,99.1z"></path>
</g>
</g>
<circle class="st110" cx="123.6" cy="123.2" r="108.2"></circle>
</svg>
</a></li>
<li class="list-inline-item"><a href="https://www.youtube.com/user/NLMNIH" aria-label="Youtube"
target="_blank" rel="noopener noreferrer"><svg version="1.1" xmlns="http://www.w3.org/2000/svg"
xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px" viewBox="0 0 249 249"
style="enable-background:new 0 0 249 249;" xml:space="preserve">
<title>SM-Youtube</title>
<style type="text/css">
.st4 {
fill: none;
stroke: #FFFFFF;
stroke-width: 8;
stroke-miterlimit: 10;
}
.st5 {
fill: #FFFFFF;
}
</style>
<circle class="st4" cx="124.2" cy="123.4" r="108.2"></circle>
<g transform="translate(0,-952.36218)">
<path class="st5"
d="M88.4,1037.4c-10.4,0-18.7,8.3-18.7,18.7v40.1c0,10.4,8.3,18.7,18.7,18.7h72.1c10.4,0,18.7-8.3,18.7-18.7
v-40.1c0-10.4-8.3-18.7-18.7-18.7H88.4z M115.2,1058.8l29.4,17.4l-29.4,17.4V1058.8z"></path>
</g>
</svg></a></li>
</ul>
</div>
<div class="col-lg-3 col-12">
<p class="address_footer text-white">National Library of Medicine<br>
<a href="https://www.google.com/maps/place/8600+Rockville+Pike,+Bethesda,+MD+20894/@38.9959508,-77.101021,17z/data=!3m1!4b1!4m5!3m4!1s0x89b7c95e25765ddb:0x19156f88b27635b8!8m2!3d38.9959508!4d-77.0988323"
class="text-white" target="_blank" rel="noopener noreferrer">8600 Rockville Pike<br>
Bethesda, MD 20894</a></p>
</div>
<div class="col-lg-3 col-12 centered-lg">
<p><a href="https://www.nlm.nih.gov/web_policies.html" class="text-white">Web Policies</a><br>
<a href="https://www.nih.gov/institutes-nih/nih-office-director/office-communications-public-liaison/freedom-information-act-office"
class="text-white">FOIA</a><br>
<a href="https://www.hhs.gov/vulnerability-disclosure-policy/index.html" class="text-white" id="vdp">HHS Vulnerability Disclosure</a></p>
</div>
<div class="col-lg-3 col-12 centered-lg">
<p><a class="supportLink text-white" href="https://support.nlm.nih.gov/">Help</a><br>
<a href="https://www.nlm.nih.gov/accessibility.html" class="text-white">Accessibility</a><br>
<a href="https://www.nlm.nih.gov/careers/careers.html" class="text-white">Careers</a></p>
</div>
</div>
<div class="row">
<div class="col-lg-12 centered-lg">
<!-- Footer navigation: links to the parent organizations (NLM, NIH, HHS, USA.gov).
     Every link uses an explicit https:// scheme; the NLM link previously used a
     protocol-relative URL, which inherits whatever scheme the page was loaded over. -->
<nav class="bottom-links">
<ul class="mt-3">
<li>
<a class="text-white" href="https://www.nlm.nih.gov/">NLM</a>
</li>
<li>
<a class="text-white" href="https://www.nih.gov/">NIH</a>
</li>
<li>
<a class="text-white" href="https://www.hhs.gov/">HHS</a>
</li>
<li>
<a class="text-white" href="https://www.usa.gov/">USA.gov</a>
</li>
</ul>
</nav>
</div>
</div>
</div>
</section>
</footer>
<!-- ========== END FOOTER ========== -->
<!-- javascript to inject NWDS meta tags. Note: value of nwds_version is updated by "npm version" command -->
<script type="text/javascript">
// NWDS version string. Keep this literal on its own line: per the note above
// this script, its value is rewritten by the "npm version" command.
var nwds_version = "1.2.3";

// Build the two NWDS marker meta elements up front ...
var meta_nwds_ver = document.createElement('meta');
var meta_nwds = document.createElement('meta');
meta_nwds_ver.name = 'ncbi_nwds_ver';
meta_nwds_ver.content = nwds_version;
meta_nwds.name = 'ncbi_nwds';
meta_nwds.content = 'yes';

// ... then attach them to the document head, version tag first.
[meta_nwds_ver, meta_nwds].forEach(function (metaTag) {
  document.getElementsByTagName('head')[0].appendChild(metaTag);
});

// Path of the alerts script; prefixed with ncbiBaseUrl only when some earlier
// script has defined that global.
var alertsUrl = (typeof ncbiBaseUrl !== 'undefined')
  ? ncbiBaseUrl + "/core/alerts/alerts.js"
  : "/core/alerts/alerts.js";
</script>
<!-- JavaScript -->
<script src="/pathogens/static/django_uswds/uswds/js/uswds.js"></script>
<script src="https://code.jquery.com/jquery-3.5.0.min.js"
integrity="sha256-xNzN2a4ltkB44Mc/Jz3pT4iU1cmeR0FkXs4pru/JxaQ="
crossorigin="anonymous">
</script>
<script>
// Locally hosted jQuery, written into the page only when window.jQuery is
// still unset after the preceding script tag has run.
var fallbackJquery = "/pathogens/static/base/js/jquery-3.5.0.min.js";
if (!window.jQuery) {
  // The closing tag is spelled with \x3C so the HTML parser does not treat it
  // as the end of this inline script.
  document.write("<script src=" + fallbackJquery + ">\x3C/script>");
}
</script>
<script src="/pathogens/static/nwds/js/nwds.js" type="text/javascript"> </script>
<script src="/pathogens/static/nwds/js/header.js" type="text/javascript"> </script>
<script src="/pathogens/static/nwds/js/ncbipopup.js" type="text/javascript"> </script>
<script src="/pathogens/static/nwds/js/ncbiclearbutton.js" type="text/javascript"> </script>
<script src="/pathogens/static/nwds/js/override-uswds.js" type="text/javascript"> </script>
<script src="/pathogens/static/nwds/js/ncbifeedback.js" type="text/javascript"> </script>
<script type="text/javascript" src="https://www.ncbi.nlm.nih.gov/core/pinger/pinger.js"> </script>
<script type="text/javascript" src="/pathogens/static/main/scripts/page-search.js" charset="utf-8"></script>
</body>
</html>