504 lines
No EOL
23 KiB
Text
504 lines
No EOL
23 KiB
Text
<!DOCTYPE html>
|
|
|
|
|
|
|
|
|
|
<html lang="en" >
|
|
<head >
|
|
<meta charset="UTF-8">
|
|
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
|
|
|
<!-- Mobile properties -->
|
|
<meta name="HandheldFriendly" content="True">
|
|
<meta name="MobileOptimized" content="320">
|
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
|
|
|
|
<!-- Stylesheets -->
|
|
|
|
<link href="/research/bionlp/static/django_uswds/uswds/css/uswds.css" rel="stylesheet" />
|
|
|
|
|
|
|
|
<title>
|
|
tmVar: A text mining approach for extracting sequence variants in biomedical literature
|
|
</title>
|
|
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="/research/bionlp/static/main/css/uswds.css">
|
|
<link rel="stylesheet" href="/research/bionlp/static/main/css/header.css">
|
|
<link rel="stylesheet" href="/research/bionlp/static/main/css/footer.css">
|
|
<link rel="stylesheet" href="/research/bionlp/static/main/css/form.css">
|
|
|
|
<!-- Labs template -->
|
|
<link rel="stylesheet" href="/research/bionlp/static/main/css/atoms.css">
|
|
<link rel="stylesheet" href="/research/bionlp/static/main/css/docsum.css">
|
|
<link rel="stylesheet" href="/research/bionlp/static/main/css/media.css">
|
|
|
|
<!-- Additional template -->
|
|
<link rel="stylesheet" href="/research/bionlp/static/main/css/journals.molecules.css">
|
|
<link rel="stylesheet" href="/research/bionlp/static/main/css/custom.css">
|
|
<link rel="stylesheet" href="/research/bionlp/static/main/css/journals.journal-page.css">
|
|
<link rel="stylesheet" href="/research/bionlp/static/main/css/iconic-glyphs.css">
|
|
<link rel="stylesheet" href="/research/bionlp/static/main/css/featherlight.min.css">
|
|
<link rel="stylesheet" href="/research/bionlp/static/main/css/styles.css">
|
|
<!--[if lt IE 9]>
|
|
<link rel="stylesheet" href="/research/bionlp/static/main/css/iconic-glyphs-legacy.css">
|
|
<![endif]-->
|
|
|
|
<!-- Some JS -->
|
|
<script src="/research/bionlp/static/main/js/jquery.js"></script>
|
|
<script src="/research/bionlp/static/main/js/modernizr.js"></script>
|
|
<script src="/research/bionlp/static/main/js/featherlight.min.js"></script>
|
|
<script src="/research/bionlp/static/main/js/custom.js"></script>
|
|
|
|
|
|
|
|
|
|
</head>
|
|
<body >
|
|
|
|
|
|
<div>
|
|
<a class="skipnav" href="#maincontent">
|
|
Skip to main page content
|
|
</a>
|
|
<header class="ncbi-page-header" role="banner">
|
|
<div class="prefix">
|
|
<span class="nih" title="National Institutes of Health">
|
|
<a href="https://www.nih.gov/" title="To NIH homepage">
|
|
<img src="/research/bionlp/static/base/images/nih-logo-header.svg" alt="NIH">
|
|
</a>
|
|
</span>
|
|
<span class="nlm">
|
|
<a href="https://www.nlm.nih.gov/" title="To NLM homepage">U.S. National Library of Medicine</a>
|
|
</span>
|
|
</div>
|
|
|
|
<div class="ncbi">
|
|
<!-- <abbr class="abbr">
|
|
<a href="https://www.ncbi.nlm.nih.gov/" title="To NCBI homepage">NCBI</a>
|
|
</abbr>
|
|
<span class="name">
|
|
<a href="https://www.ncbi.nlm.nih.gov/" accesskey="1" title="To NCBI homepage">National Center for Biotechnology Information</a>
|
|
</span> -->
|
|
<!-- <abbr class="abbr">
|
|
<a href="https://www.nlm.nih.gov/research/index.html" title="To DIR homepage">DIR</a>
|
|
</abbr> -->
|
|
<span class="name">
|
|
<a href="https://www.nlm.nih.gov/research/index.html" accesskey="1" title="To DIR homepage">Division of Intramural Research</a>
|
|
</span>
|
|
<div class="right">
|
|
|
|
<a id="in" href="/research/bionlp/accounts/login/?next=/research/bionlp/">Log in</a>
|
|
|
|
</div>
|
|
</div>
|
|
</header>
|
|
|
|
<!--app-specific header, something that might want to take full width of screen -->
|
|
|
|
<a class="skipnav" href="#maincontent">
|
|
Skip to main page content
|
|
</a>
|
|
|
|
<div class="breadcrumbs-container menu">
|
|
<div class="usa-grid-full">
|
|
<ul class="topnav" accesskey="4">
|
|
<li class="current">
|
|
<a href="/research/bionlp/" title="Home">
|
|
Home
|
|
</a>
|
|
</li>
|
|
<li class="separator"></li>
|
|
<li>
|
|
<a href="/research/bionlp/Zhiyong-Lu" title="Zhiyong Lu">
|
|
Zhiyong Lu
|
|
</a>
|
|
</li>
|
|
<li class="separator"></li>
|
|
<li>
|
|
<a href="/research/bionlp/News" title="Media">
|
|
Media
|
|
</a>
|
|
</li>
|
|
<li class="separator"></li>
|
|
<li>
|
|
<a href="/research/bionlp/Team" title="Team">
|
|
Team
|
|
</a>
|
|
</li>
|
|
<li class="separator"></li>
|
|
<li>
|
|
<a href="/research/bionlp/Research" title="Research">
|
|
Research
|
|
</a>
|
|
</li>
|
|
<li class="separator"></li>
|
|
<li>
|
|
<a href="/research/bionlp/Publications/" title="Publications">
|
|
Publications
|
|
</a>
|
|
</li>
|
|
<li class="separator"></li>
|
|
<li>
|
|
<a href="/research/bionlp/Tools/" title="Tools">
|
|
Tools
|
|
</a>
|
|
</li>
|
|
<li>
|
|
<a href="/research/bionlp/APIs/" title="Web APIs">
|
|
Web APIs
|
|
</a>
|
|
</li>
|
|
<li class="separator"></li>
|
|
<li>
|
|
<a href="/research/bionlp/Data/" title="Data">
|
|
AI Datasets
|
|
</a>
|
|
</li>
|
|
<li>
|
|
<a href="/research/bionlp/Visiting-us" title="Visiting us">
|
|
Visiting us
|
|
</a>
|
|
</li>
|
|
|
|
<li class="icon">
|
|
<a href="#">☰</a>
|
|
</li>
|
|
</ul>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
<!-- assign CSS class in case app will need to alter styles of this div -->
|
|
<div id="maincontent" class="usa-grid-full ncbi-base-page-container">
|
|
<div class="labs-pagecontent">
|
|
<div class="usa-width-one-whole">
|
|
<main class="usa-grid journals-lists">
|
|
|
|
|
|
|
|
<h3>tmVar: A text mining approach for extracting sequence variants in biomedical literature</h3>
|
|
<main class="usa-width-one-whole journal-container">
|
|
<div>
|
|
<div class="issue labs-docsums labs-content-box wrappall">
|
|
<h4>Authors: <a href="https://sites.google.com/site/chihhsuanwei/" target="_blank">Chih-Hsuan Wei</a>, <a
|
|
href="http://www.faculty.uci.edu/profile.cfm?faculty_id=5931" target="_blank">Bethany R. Harris</a>,
|
|
<a href="http://myweb.ncku.edu.tw/~hykao/" target="_blank">Hung-Yu Kao</a> and <a
|
|
href="/bionlp/" target="_blank">Zhiyong Lu</a> (PI)</h4>
|
|
<h4>Research highlights (<a href="/CBBresearch/Lu/Demo/tmTools/demo/tmVar/"
|
|
target="_blank">demo</a>)</h4>
|
|
<div class="usa-width-one-whole">
|
|
<p>
|
|
Text-mining mutation information from the literature becomes a critical part of the bioinformatics
|
|
approach for the analysis and interpretation of sequence variations in complex diseases in the
|
|
post-genomic era. Current approaches are mostly rule-based and focus on limited types of sequence
|
|
variations such as protein point mutations. Here we report tmVar, a text-mining approach based on
|
|
conditional random field (CRF) for extracting a wide range of sequence variants in both protein and
|
|
gene levels according to a standard sequence variants nomenclature developed by the human genome
|
|
variation society (HGVS). By doing so, we cover several important types of mutations that were not
|
|
considered in past studies. Using a novel CRF label model with a set of customized features, our
|
|
method achieves high performance of over 90% in F-measure on both our own corpus and a publicly
|
|
available benchmarking data set and compares favorably to the state of the art methods.
|
|
</p>
|
|
<p>
|
|
<img src="/research/bionlp/static/main/images/new.jpg" width="30" style="float:left;" alt="New" />
|
|
tmVar is now able to normalize extracted variant mentions to unique identifiers (dbSNP RSIDs). In benchmarking results, tmVar achieves state-of-the-art performance (~90% in F-measure). See the article below for more details.
|
|
</p>
|
|
<ul class="dot-list">
|
|
<li>Wei C-H, Phan L, Feltz J, Maiti R, Hefferon T, Lu Z. tmVar 2.0: Integrating genomic variant information from literature with dbSNP and ClinVar for precision medicine. Bioinformatics, 2017.</li>
|
|
</ul>
|
|
</div>
|
|
</div>
|
|
|
|
<div class="issue labs-docsums labs-content-box wrappall">
|
|
<h4>Method overview</h4>
|
|
<div class="usa-width-one-whole">
|
|
<p>
|
|
As shown in the figure below, our method first performs tokenization on the input text as
|
|
pre-processing. Next, our method extracts mutation mentions from text using a CRF-based approach,
|
|
followed by some post-processing steps. As illustrated in the figure, instead of extracting a
|
|
mutation mention such as c.2708_2711delTTAG as a whole, our CRF module identifies each mutation
|
|
component (e.g., 'del' as the mutation type) individually. Finally, we have implemented a
|
|
post-processing module to handle some rare mutation formulas and natural language mentions that are
|
|
not curated in our own corpus.
|
|
</p>
|
|
<div class="figure">
|
|
<a href="#" data-featherlight="/research/bionlp/static/main/images/tools/tmVar.png">
|
|
<img src="/research/bionlp/static/main/images/tools/tmVar.png" width="350" alt="Overview of the tmVar workflow"/></a>
|
|
<span><b>Figure 1.</b> An overview of the tmVar workflow.</span>
|
|
</div>
|
|
<p>
|
|
<img src="/research/bionlp/static/main/images/new.jpg" width="30" style="float:left;" alt="New" />
|
|
Next, we developed a new module for mapping each previously detected mutation mention to a corresponding RS number as shown in Figure 2. By using two main strategies, pattern matching and dictionary lookup, tmVar can find the corresponding RSIDs for variants. We first developed a set of patterns (e.g., “[Gene/Protein] ([DNAMutation] with [RSID])”) to detect a pair of mutation and RSID co-occurring in the same sentence. For the remaining mentions, we generated a list of candidate RSIDs by searching our lexicon.
|
|
</p>
|
|
<div class="figure">
|
|
<a href="#" data-featherlight="/research/bionlp/static/main/images/tools/tmVar2.png">
|
|
<img src="/research/bionlp/static/main/images/tools/tmVar2.png" width="350" alt="Overall workflow of the mutation normalization process" /></a>
|
|
<span><b>Figure 2.</b> The overall workflow of our mutation normalization process.</span>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
<div class="issue labs-docsums labs-content-box wrappall">
|
|
<h4>Results</h4>
|
|
<div class="usa-width-one-whole">
|
|
<table class="customtable">
|
|
<tbody>
|
|
<tr>
|
|
<td align="center"> </td>
|
|
<td align="center"><strong>Methods</strong></td>
|
|
<td align="center"><strong>Precision</strong></td>
|
|
<td align="center"><strong>Recall</strong></td>
|
|
<td align="center"><strong>F-measure</strong></td>
|
|
</tr>
|
|
<tr>
|
|
<td rowspan="3" align="center">All Mutations</td>
|
|
<td align="center">MutationFinder</td>
|
|
<td align="center">91.66%</td>
|
|
<td align="center">33.21%</td>
|
|
<td align="center">48.76%</td>
|
|
</tr>
|
|
<tr>
|
|
<td align="center">MutationFinder+</td>
|
|
<td align="center">89.66%</td>
|
|
<td align="center">69.15%</td>
|
|
<td align="center">78.08%</td>
|
|
</tr>
|
|
<tr>
|
|
<td align="center" class="best">tmVar</td>
|
|
<td align="center" class="best">91.38%</td>
|
|
<td align="center" class="best">91.40%</td>
|
|
<td align="center" class="best">91.39%</td>
|
|
</tr>
|
|
<tr>
|
|
<td rowspan="3" align="center">Normalized<br>
|
|
Mutations
|
|
</td>
|
|
<td align="center">MutationFinder</td>
|
|
<td align="center">84.21%</td>
|
|
<td align="center">25.29%</td>
|
|
<td align="center">38.90%</td>
|
|
</tr>
|
|
<tr>
|
|
<td align="center">MutationFinder+</td>
|
|
<td align="center">84.09%</td>
|
|
<td align="center">63.25%</td>
|
|
<td align="center">72.20%</td>
|
|
</tr>
|
|
<tr>
|
|
<td align="center" class="best">tmVar</td>
|
|
<td align="center" class="best">87.74%</td>
|
|
<td align="center" class="best">87.46%</td>
|
|
<td align="center" class="best">87.60%</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
<span><b>Table 1.</b> Results on the test set of our corpus for mutation individual component.</span>
|
|
|
|
<table class="customtable">
|
|
<tbody>
|
|
<tr>
|
|
<td align="center"> </td>
|
|
<td align="center" nowrap="nowrap">Methods</td>
|
|
<td align="center">Precision</td>
|
|
<td align="center" nowrap="nowrap">Recall</td>
|
|
<td align="center" nowrap="nowrap">F-measure</td>
|
|
</tr>
|
|
<tr>
|
|
<td rowspan="2" align="center">All Mutations</td>
|
|
<td align="center" nowrap="nowrap">MutationFinder</td>
|
|
<td align="center">98.41%</td>
|
|
<td align="center" nowrap="nowrap">81.92%</td>
|
|
<td align="center" nowrap="nowrap">89.41%</td>
|
|
</tr>
|
|
<tr>
|
|
<td align="center" nowrap="nowrap" class="best">tmVar</td>
|
|
<td align="center" class="best">98.80%</td>
|
|
<td align="center" nowrap="nowrap" class="best">89.62%</td>
|
|
<td align="center" nowrap="nowrap" class="best">93.98%</td>
|
|
</tr>
|
|
<tr>
|
|
<td rowspan="2" align="center">Normalized<br>Mutations</td>
|
|
<td align="center" nowrap="nowrap">MutationFinder</td>
|
|
<td align="center">98.47%</td>
|
|
<td align="center" nowrap="nowrap">80.63%</td>
|
|
<td align="center" nowrap="nowrap">88.66%</td>
|
|
</tr>
|
|
<tr>
|
|
<td align="center" nowrap="nowrap" class="best">tmVar</td>
|
|
<td align="center" class="best">97.58%</td>
|
|
<td align="center" nowrap="nowrap" class="best">83.96%</td>
|
|
<td align="center" nowrap="nowrap" class="best">90.26%</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
<span><b>Table 2.</b> Results on the <a href="http://mutationfinder.sourceforge.net/" target="_blank">MutationFinder corpus</a> for mutation individual component.</span><br/><br/>
|
|
|
|
<img src="/research/bionlp/static/main/images/new.jpg" width="30" alt="New" />
|
|
<table class="customtable" style="margin-top:0;">
|
|
<tbody>
|
|
<tr>
|
|
<td align="center">Corpus</td>
|
|
<td align="center">Method</td>
|
|
<td align="center">TP</td>
|
|
<td align="center">FP</td>
|
|
<td align="center">FN</td>
|
|
<td align="center">Precision</td>
|
|
<td align="center">Recall</td>
|
|
<td align="center">F-score</td>
|
|
</tr>
|
|
<tr>
|
|
<td rowspan="2" align="center">tmVar2</td>
|
|
<td align="center">tmVar</td>
|
|
<td align="center">565</td>
|
|
<td align="center">16</td>
|
|
<td align="center">60</td>
|
|
<td align="center">97.25%</td>
|
|
<td align="center">90.40%</td>
|
|
<td align="center">93.70%</td>
|
|
</tr>
|
|
<tr>
|
|
<td align="center">SETH</td>
|
|
<td align="center">466</td>
|
|
<td align="center">5</td>
|
|
<td align="center">159</td>
|
|
<td align="center">98.94%</td>
|
|
<td align="center">74.56%</td>
|
|
<td align="center">85.04%</td>
|
|
</tr>
|
|
<tr>
|
|
<td rowspan="2" align="center">OSIRIS</td>
|
|
<td align="center">tmVar</td>
|
|
<td align="center">208</td>
|
|
<td align="center">6</td>
|
|
<td align="center">50</td>
|
|
<td align="center">97.20%</td>
|
|
<td align="center">80.62%</td>
|
|
<td align="center">88.14%</td>
|
|
</tr>
|
|
<tr>
|
|
<td align="center">SETH</td>
|
|
<td align="center">179</td>
|
|
<td align="center">11</td>
|
|
<td align="center">79</td>
|
|
<td align="center">94.21%</td>
|
|
<td align="center">69.38%</td>
|
|
<td align="center">79.91%</td>
|
|
</tr>
|
|
<tr>
|
|
<td rowspan="2" align="center">Thomas</td>
|
|
<td align="center">tmVar</td>
|
|
<td align="center">465</td>
|
|
<td align="center">52</td>
|
|
<td align="center">62</td>
|
|
<td align="center">89.94%</td>
|
|
<td align="center">88.24%</td>
|
|
<td align="center">89.08%</td>
|
|
</tr>
|
|
<tr>
|
|
<td align="center">SETH</td>
|
|
<td align="center">303</td>
|
|
<td align="center">14</td>
|
|
<td align="center">224</td>
|
|
<td align="center">95.58%</td>
|
|
<td align="center">57.50%</td>
|
|
<td align="center">71.80%</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
<span><b>Table 3.</b> Normalization results on the tmVar normalization corpus, OSIRIS and SETH.</span>
|
|
</div>
|
|
</div>
|
|
|
|
<div class="issue labs-docsums labs-content-box wrappall">
|
|
<h4>Downloads</h4>
|
|
<div class="usa-width-one-whole">
|
|
<p>
|
|
tmVar 2.0 Software (Includes normalization) in <a href="https://ftp.ncbi.nlm.nih.gov/pub/lu/Suppl/tmVar2/tmVarJava.zip" target="_blank">Java</a><br/>
|
|
<a href="/CBBresearch/Lu/Demo/tmTools/download/tmVar/tmVarCorpus.zip" target="_blank">tmVar NER Corpus</a> (<a href="/CBBresearch/Lu/Demo/PubTator/tutorial/tmVar.html" target="_blank">Mention forms</a>, <a href="/CBBresearch/Lu/Demo/tmTools/download/tmVar/mutation_annotation_guidelines.txt" target="_blank">Annotation guidelines</a>)<br/>
|
|
<a href="/CBBresearch/Lu/Demo/tmTools/download/tmVar/tmVar.Normalization.txt" target="_blank">tmVar Normalization Corpus</a><br/>
|
|
tmVar-tagged PubMed results in <a href="/CBBresearch/Lu/Demo/PubTator/" target="_blank">PubTator</a><br/>
|
|
<a href="/research/bionlp/APIs/">tmVar RESTful API</a>
|
|
</p>
|
|
</div>
|
|
</div>
|
|
|
|
<div class="issue labs-docsums labs-content-box wrappall">
|
|
<h4>Please cite</h4>
|
|
<div class="usa-width-one-whole">
|
|
<ul class="dot-list">
|
|
<li>Wei C-H, Harris BR, Kao H-Y, Lu Z. <a
|
|
href="http://bioinformatics.oxfordjournals.org/content/early/2013/04/04/bioinformatics.btt156.full.pdf+html"
|
|
target="_blank">tmVar: A text mining approach for extracting sequence variants in biomedical
|
|
literature</a>. Bioinformatics, 29(11) 1433-1439, doi:10.1093/bioinformatics/btt156 (2013)
|
|
</li>
|
|
<li>
|
|
Wei C-H, Phan L, Feltz J, Maiti R, Hefferon T, Lu Z. tmVar 2.0: Integrating genomic variant information from literature with dbSNP and ClinVar for precision medicine. Bioinformatics, 2017.
|
|
</li>
|
|
</ul>
|
|
</div>
|
|
</div>
|
|
|
|
</div>
|
|
</main>
|
|
|
|
</main>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
<footer class="usa-footer usa-footer-big ncbi-footer" role="contentinfo">
|
|
<div class="usa-grid">
|
|
<div class="usa-row">
|
|
<div class="usa-width-one-half">
|
|
<div>
|
|
<div class="org-section">
|
|
<a href="https://www.hhs.gov/"><img class="usa-footer-logo-img hhs-logo"
|
|
src="/research/bionlp/static/base/images/dhhs-logo-white.svg"
|
|
alt="U.S. Department of Health & Human Services">
|
|
<span class="usa-sr-only">Department of Health and Human Services</span></a>
|
|
<a href="https://www.nih.gov/"><img class="usa-footer-logo-img nih-logo"
|
|
src="/research/bionlp/static/base/images/nih-logo-white.svg"
|
|
alt="National Institutes of Health">
|
|
<span class="usa-sr-only">National Institutes of Health</span></a>
|
|
<a href="https://www.nlm.nih.gov/"><img class="usa-footer-logo-img nlm-logo"
|
|
src="/research/bionlp/static/base/images/nlm-logo-letters-white.svg"
|
|
alt="National Library of Medicine">
|
|
<span class="usa-sr-only">National Library of Medicine</span></a>
|
|
<a href="https://www.usa.gov/"><img class="usa-footer-logo-img usagov-logo"
|
|
src="/research/bionlp/static/base/images/usagov-logo-white.svg"
|
|
alt="USA.gov"/>
|
|
<span class="usa-sr-only">USA.gov</span></a>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
<div class="usa-width-one-half">
|
|
<div>
|
|
<p class="about-links">
|
|
<a href="https://www.nlm.nih.gov/research/index.html">About DIR</a>
|
|
<a href="https://www.nlm.nih.gov/web_policies.html">Web Policies</a></p>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</footer>
|
|
</div>
|
|
|
|
|
|
|
|
<!-- JavaScript -->
|
|
|
|
<script src="/research/bionlp/static/django_uswds/uswds/js/uswds.js"></script>
|
|
|
|
|
|
|
|
|
|
<script type="text/javascript" src="/research/bionlp/static/base/header.js"></script>
|
|
|
|
</body>
|
|
</html> |