nih-gov/www.ncbi.nlm.nih.gov/research/bionlp/Research

<!DOCTYPE html>


<html lang="en" >
<head>
    <!-- Encoding and legacy-IE rendering mode -->
    <meta charset="UTF-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">

    <!-- Mobile properties -->
    <meta name="HandheldFriendly" content="True">
    <meta name="MobileOptimized" content="320">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">

    <title>Text Mining Research - NIH</title>

    <!-- Stylesheets. Order matters for the cascade and is kept exactly as before. -->
    <link rel="stylesheet" href="/research/bionlp/static/django_uswds/uswds/css/uswds.css">
    <link rel="stylesheet" href="/research/bionlp/static/main/css/uswds.css">
    <link rel="stylesheet" href="/research/bionlp/static/main/css/header.css">
    <link rel="stylesheet" href="/research/bionlp/static/main/css/footer.css">
    <link rel="stylesheet" href="/research/bionlp/static/main/css/form.css">

    <!-- Labs template -->
    <link rel="stylesheet" href="/research/bionlp/static/main/css/atoms.css">
    <link rel="stylesheet" href="/research/bionlp/static/main/css/docsum.css">
    <link rel="stylesheet" href="/research/bionlp/static/main/css/media.css">

    <!-- Additional template -->
    <link rel="stylesheet" href="/research/bionlp/static/main/css/journals.molecules.css">
    <link rel="stylesheet" href="/research/bionlp/static/main/css/custom.css">
    <link rel="stylesheet" href="/research/bionlp/static/main/css/journals.journal-page.css">
    <link rel="stylesheet" href="/research/bionlp/static/main/css/iconic-glyphs.css">
    <link rel="stylesheet" href="/research/bionlp/static/main/css/featherlight.min.css">
    <link rel="stylesheet" href="/research/bionlp/static/main/css/styles.css">
    <!-- Conditional comment: only Internet Explorer 8 and older load the legacy glyph sheet -->
    <!--[if lt IE 9]>
    <link rel="stylesheet" href="/research/bionlp/static/main/css/iconic-glyphs-legacy.css">
    <![endif]-->

    <!-- Scripts loaded synchronously in the head, in this order -->
    <script src="/research/bionlp/static/main/js/jquery.js"></script>
    <script src="/research/bionlp/static/main/js/modernizr.js"></script>
    <script src="/research/bionlp/static/main/js/featherlight.min.js"></script>
    <script src="/research/bionlp/static/main/js/custom.js"></script>
</head>
<body >


<div>
    <!-- Skip link targeting the #maincontent container further down the page -->
    <a class="skipnav" href="#maincontent">
        Skip to main page content
    </a>

    <!-- Banner: NIH / NLM branding, the division name, and the log-in link -->
    <header class="ncbi-page-header" role="banner">
        <div class="prefix">
            <span class="nih" title="National Institutes of Health">
                <a href="https://www.nih.gov/" title="To NIH homepage">
                    <img src="/research/bionlp/static/base/images/nih-logo-header.svg" alt="NIH">
                </a>
            </span>
            <span class="nlm">
                <a href="https://www.nlm.nih.gov/" title="To NLM homepage">U.S. National Library of Medicine</a>
            </span>
        </div>

        <div class="ncbi">
            <!-- Disabled NCBI branding (abbreviation + full name):
            <abbr class="abbr">
                <a href="https://www.ncbi.nlm.nih.gov/" title="To NCBI homepage">NCBI</a>
            </abbr>
            <span class="name">
                <a href="https://www.ncbi.nlm.nih.gov/" accesskey="1" title="To NCBI homepage">National Center for Biotechnology Information</a>
            </span>
            -->
            <!-- Disabled DIR abbreviation:
            <abbr class="abbr">
                <a href="https://www.nlm.nih.gov/research/index.html" title="To DIR homepage">DIR</a>
            </abbr>
            -->
            <span class="name">
                <a href="https://www.nlm.nih.gov/research/index.html" accesskey="1" title="To DIR homepage">Division of Intramural Research</a>
            </span>
            <div class="right">

                <a id="in" href="/research/bionlp/accounts/login/?next=/research/bionlp/">Log in</a>

            </div>
        </div>
    </header>

    <!-- Slot for an app-specific header that may take the full width of the screen (empty on this page) -->

    <a class="skipnav" href="#maincontent">
        Skip to main page content
    </a>

    <!-- Primary site navigation.
         role/aria-label expose the link list as a navigation landmark without
         changing the element type or classes that the stylesheets may target. -->
    <div class="breadcrumbs-container menu" role="navigation" aria-label="Primary">
        <div class="usa-grid-full">
            <ul class="topnav" accesskey="4">
                <li class="current">
                    <a href="/research/bionlp/" title="Home">
                        Home
                    </a>
                </li>
                <li class="separator"></li>
                <li>
                    <a href="/research/bionlp/Zhiyong-Lu" title="Zhiyong Lu">
                        Zhiyong Lu
                    </a>
                </li>
                <li class="separator"></li>
                <li>
                    <a href="/research/bionlp/News" title="Media">
                        Media
                    </a>
                </li>
                <li class="separator"></li>
                <li>
                    <a href="/research/bionlp/Team" title="Team">
                        Team
                    </a>
                </li>
                <li class="separator"></li>
                <li>
                    <a href="/research/bionlp/Research" title="Research">
                        Research
                    </a>
                </li>
                <li class="separator"></li>
                <li>
                    <a href="/research/bionlp/Publications/" title="Publications">
                        Publications
                    </a>
                </li>
                <li class="separator"></li>
                <li>
                    <a href="/research/bionlp/Tools/" title="Tools">
                        Tools
                    </a>
                </li>
                <li>
                    <!-- title previously read "Tools" (copied from the item above) -->
                    <a href="/research/bionlp/APIs/" title="Web APIs">
                        Web APIs
                    </a>
                </li>
                <li class="separator"></li>
                <li>
                    <a href="/research/bionlp/Data/" title="Data">
                        AI Datasets
                    </a>
                </li>
                <li>
                    <a href="/research/bionlp/Visiting-us" title="Visiting us">
                        Visiting us
                    </a>
                </li>

                <!-- Hamburger glyph; aria-label gives the otherwise symbol-only link an accessible name.
                     NOTE(review): presumably toggles the collapsed menu via script - confirm, and
                     consider a <button> once the CSS/JS hooks are known. -->
                <li class="icon">
                    <a href="#" aria-label="Menu">&#9776;</a>
                </li>
            </ul>
        </div>
    </div>


    <!-- asign css class in case app will need to alter styles of this div -->
    <div id="maincontent" class="usa-grid-full ncbi-base-page-container">
        <div class="labs-pagecontent">
            <div class="usa-width-one-whole">
                <main class="usa-grid journals-lists">


<div>
    <h3>Research</h3>
    <main class="usa-width-one-whole journal-container">
        <div>

            <!-- Overview card: heading plus the word-cloud image -->
            <div class="issue labs-docsums labs-content-box wrappall">
                <div class="usa-width-one-whole">
                    <h3>Overview of Recent R&amp;D Projects</h3>
                    <div class="nlp-wordcloud-wrapper">
                        <img src="/research/bionlp/static/main/images/nlp-wordcloud.png" alt="nlp wordcloud">
                    </div>
                </div>
            </div>


            <!-- Example 1 card: logo linking to PubMed, description, and example publications -->
            <div class="issue labs-docsums labs-content-box wrappall">
                <div class="usa-width-one-whole">
                    <h3>Example 1: PubMed 2.0</h3>
                    <div class="usa-width-one-fourth logobox">
                        <div class="logobox">
                            <!-- rel="noopener" keeps the new tab from reaching back via window.opener -->
                            <a href="https://www.ncbi.nlm.nih.gov/pubmed/" target="_blank" rel="noopener">
                                <img src="/research/bionlp/static/main/images/tools/pubmed_Zx9QQyZ.jpg" alt="Example 1: PubMed 2.0">
                            </a>
                        </div>
                    </div>
                    <div class="usa-width-three-fourths">
                        <p>PubMed, an information system for accessing the biomedical literature, is used billions of times each year by millions of people, both in the US and worldwide. It is built and maintained by NCBI/NLM to serve the scientific and medical communities as well as the public at large. With the rapid growth of the biomedical literature along with its associated biomedical data, exciting opportunities arise to provide access to pertinent biomedical information across data sources in an effective and efficient manner. Our overall goal is to deliver the <b>most relevant results</b> (from 26+ million articles) within a fraction of a second to drive accelerated discovery and better health. Through automatic analysis of PubMed search logs, we have identified various kinds of information needs of our users and the gaps in the current system. To close the gap, our team is currently developing a next-gen intelligent system, namely <a href="https://www.pubmed.gov/labs">PubMed Labs</a>, for literature search with improved user experience, along with new search features and capabilities.</p>
                        <h5>Example publications</h5>
                        <!-- Every item is closed explicitly so the list no longer relies on implied end tags -->
                        <ul class="dot-list">
                            <li>Fiorini et al., <a href="https://journals.plos.org/plosbiology/article?id=10.1371/journal.pbio.2005343">Best Match: New relevance search for PubMed</a> PLoS Biology, 2018</li>
                            <li>Fiorini et al., <a href="https://www.nature.com/articles/nbt.4267">How User Intelligence Is Improving PubMed</a> Nature Biotechnology, 2018.</li>
                            <li>Fiorini et al., <a href="/pubmed/29083299" target="_blank" rel="noopener">Towards PubMed 2.0</a> eLife, 2017.</li>
                        </ul>
                    </div>
                </div>
            </div>

            <!-- Example 2 card: illustration, description, and example publications -->
            <div class="issue labs-docsums labs-content-box wrappall">
                <div class="usa-width-one-whole">
                    <h3>Example 2: Medical AI/LLMs</h3>
                    <div class="usa-width-one-fourth logobox">
                        <div class="logobox">
                            <!-- The image was wrapped in <a href="None" target="_blank">: an empty URL
                                 rendered as the literal text "None", which resolved to a broken relative
                                 link. The image is shown unlinked until a real destination exists.
                                 NOTE(review): the generating template should skip the link when the URL is unset. -->
                            <img src="/research/bionlp/static/main/images/tools/research_example2.png" alt="Example 2: Medical AI/LLMs">
                        </div>
                    </div>
                    <div class="usa-width-three-fourths">
                        <!-- Body text is now a paragraph, matching the other cards, instead of bare text + <br> -->
                        <p>Our recent research has explored the use/limits of large language models (LLMs) in medical text and image analysis for clinical decision support and knowledge discovery. Our investigations into LLMs cover three primary areas: (1) comprehensive evaluations concerning their performance, equity, and associated risks; (2) methods to augment LLMs with domain-specific knowledge and tools (e.g., GeneGPT); (3) novel applications of LLMs in biomedicine (e.g., TrialGPT).</p>
                        <h5>Example publications</h5>
                        <!-- The list and its items are closed explicitly; the original never closed the <ul> -->
                        <ul class="dot-list">
                            <li>Jin Q et al., <a href="https://arxiv.org/abs/2307.15051">TrialGPT: Matching Patients to Clinical Trials with Large Language Models</a>. Nature Communications, 2024</li>
                            <li>Jin Q et al., <a href="https://arxiv.org/abs/2401.08396">Hidden Flaws Behind Expert-Level Accuracy of Multimodal GPT-4 Vision in Medicine.</a> npj Digital Medicine, 2024</li>
                            <li>Tian S et al., <a href="https://academic.oup.com/bib/article/25/1/bbad493/7505071?login=false">Opportunities and challenges for ChatGPT and large language models in biomedicine and health</a>. Briefings in Bioinformatics, 2024</li>
                        </ul>
                    </div>
                </div>
            </div>

            <!-- Example 3 card: logo linking to PubTator, description, and example publications -->
            <div class="issue labs-docsums labs-content-box wrappall">
                <div class="usa-width-one-whole">
                    <h3>Example 3: Literature mining and information extraction</h3>
                    <div class="usa-width-one-fourth logobox">
                        <div class="logobox">
                            <!-- rel="noopener" keeps the new tab from reaching back via window.opener -->
                            <a href="https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/PubTator/" target="_blank" rel="noopener">
                                <img src="/research/bionlp/static/main/images/tools/pubtator_lCjAPx3.png" alt="Example 3: Literature mining and information extraction">
                            </a>
                        </div>
                    </div>
                    <div class="usa-width-three-fourths">
                        <p>Biological database curation (<b>biocuration</b>) is a key human activity to provide high-quality structured information that otherwise would be buried in unstructured text, facilitating both human and computer analyses of published biological knowledge. To achieve this, expert human curators are required to read and extract relevant information from the scholarly publications, a highly tedious and time-consuming task. Indeed, this manual process presents a considerable bottleneck in terms of curation cost, efficiency and productivity, making it difficult to keep pace with the rapid growth of the literature. Hence, our overall goals are to fulfill the practical text-mining needs in biocuration, creating a new paradigm where manual curation is greatly facilitated by automated computer analysis. To this end, we have developed PubTator (<a href="http://nar.oxfordjournals.org/content/41/W1/W518.long" target="_blank" rel="noopener">Wei et al., 2013</a>), a web-based application for assisting document triage and gene indexing. Through collaboration, PubTator is now successfully integrated into production workflows of multiple important biological databases such as SwissProt.</p>
                        <h5>Example publications</h5>
                        <!-- Every item is closed explicitly so the list no longer relies on implied end tags -->
                        <ul class="dot-list">
                            <li>Wei et al. <a href="/pubmed/23703206" target="_blank" rel="noopener">PubTator: a web-based text mining tool for assisting biocuration.</a> Nucleic Acids Res, 2013.</li>
                            <li>Poux et al., <a href="https://www.ncbi.nlm.nih.gov/labs/pubmed/29036270">On expert curation and scalability: UniProtKB/Swiss-Prot as a case study</a> Bioinformatics 2017.</li>
                            <li>Lee et al., <a href="https://www.ncbi.nlm.nih.gov/labs/pubmed/30102703">Scaling Up Data Curation Using Deep Learning: An Application to Literature Triage in Genomic Variation Resources</a> PLoS Computational Biology, 2018.</li>
                        </ul>
                    </div>
                </div>
            </div>

            <!-- Example 4 card: illustration, description, and example publications -->
            <div class="issue labs-docsums labs-content-box wrappall">
                <div class="usa-width-one-whole">
                    <h3>Example 4: Medical Image Analysis (Radiology &amp; Ophthalmology)</h3>
                    <div class="usa-width-one-fourth logobox">
                        <div class="logobox">
                            <!-- The image was wrapped in <a href="None" target="_blank">: an empty URL
                                 rendered as the literal text "None", which resolved to a broken relative
                                 link. The image is shown unlinked until a real destination exists.
                                 NOTE(review): the generating template should skip the link when the URL is unset. -->
                            <img src="/research/bionlp/static/main/images/tools/20170927-lung-mass.jpg" alt="Example 4: Medical Image Analysis (Radiology &amp; Ophthalmology)">
                        </div>
                    </div>
                    <div class="usa-width-three-fourths">
                        <!-- The paragraph is closed explicitly before the heading; the original left it open -->
                        <p>Mining EMRs and medical images has the potential to lead to improvement in patient care as such data contain rich information for large patient populations. We have recently text-mined over 100,000 radiology reports where our algorithm generated “weak” training labels to enable the development of advanced deep learning methods for automatically reading and classifying chest X-ray images. This work has also resulted in the release of <a href="https://nihcc.app.box.com/v/ChestXray-NIHCC">ChestX-ray8</a>: one of the largest publicly available chest x-ray datasets to the scientific community. We have also conducted research to assist in the screening of age-related macular degeneration (AMD): a leading cause of vision loss in Americans 60 and older. By leveraging cutting-edge deep learning techniques and repurposing “big” imaging data from a major AMD clinical trial, we developed a novel data-driven approach (<a href="https://github.com/ncbi-nlp/DeepSeeNet">DeepSeeNet</a>) for autonomous AMD diagnosis with its performance exceeding human ophthalmologists (retinal specialists in this case). Such a result highlights the potential of deep learning systems to assist early disease detection and enhance clinical decision-making processes.</p>
                        <h5>Example publications</h5>
                        <!-- The list and its items are closed explicitly; the original never closed the <ul> -->
                        <ul class="dot-list">
                            <li>Wang X et al., <a href="https://arxiv.org/abs/1705.02315">ChestX-ray8: Hospital-scale Chest X-ray Database and Benchmarks on Weakly-Supervised Classification and Localization of Common Thorax Diseases</a>. Proceedings of 2017 IEEE Computer Vision and Pattern Recognition (CVPR). 2017</li>
                            <li>Peng et al., <a href="https://arxiv.org/abs/1811.07492">DeepSeeNet: A deep learning model for automated classification of patient-based age-related macular degeneration severity from color fundus photographs.</a> Ophthalmology. 2018</li>
                            <li>Wang X et al., <a href="https://arxiv.org/abs/1801.04334">TieNet: Text-Image Embedding Network for Common Thorax Disease Classification and Reporting in Chest X-rays</a>. Proceedings of 2018 IEEE Computer Vision and Pattern Recognition (CVPR), 2018.</li>
                        </ul>
                    </div>
                </div>
            </div>

            <!-- Example 5 card: logo linking to BioCreative, description, organized tasks, and example publications -->
            <div class="issue labs-docsums labs-content-box wrappall">
                <div class="usa-width-one-whole">
                    <h3>Example 5: BioCreative</h3>
                    <div class="usa-width-one-fourth logobox">
                        <div class="logobox">
                            <!-- rel="noopener" keeps the new tab from reaching back via window.opener -->
                            <a href="http://www.biocreative.org" target="_blank" rel="noopener">
                                <img src="/research/bionlp/static/main/images/tools/biocreative_9KkVxD6.gif" alt="Example 5: BioCreative">
                            </a>
                        </div>
                    </div>
                    <div class="usa-width-three-fourths">
                        <!-- A <p> cannot contain a <ul>: the parser closed the paragraph at the list and the
                             later </p> produced a stray empty paragraph. The paragraph now ends before the list. -->
                        <p>
                            <em>Critical Assessment of Information Extraction in Biology</em> (<a href="/research/bionlp/biocreative" target="_blank" rel="noopener">BioCreative</a>) is a community effort for evaluating text mining and information extraction systems applied to the biological domain. Since 2004, the BioCreative Evaluation series has included over ten different tasks such as ranking of relevant documents ("document triage"), extraction of genes and proteins ("gene mention") and their linkage to database identifiers ("gene normalization"), as well as creation of functional annotations in standard ontologies (e.g., GO) and extraction of entity-relations (e.g., protein-protein interaction). As part of the BioCreative executive committee, we have led the organization of multiple shared tasks in recent years such as:
                        </p>
                        <ul class="dot-list">
                            <li><a href="/research/bionlp/biocreative" target="_blank" rel="noopener">Chemical-Disease Relation Extraction</a> - BioCreative 2015</li>
                            <li><a href="http://bioc.sourceforge.net/" target="_blank" rel="noopener">BioC: The BioCreative Interoperability Initiative</a> - BioCreative 2015 &amp; 2013</li>
                            <li><a href="/research/bionlp/biocreative" target="_blank" rel="noopener">Automatic Gene Ontology (GO) Annotation</a> - BioCreative 2013</li>
                            <li>Multi-species Gene Normalization (GN) - BioCreative 2010</li>
                        </ul>
                        <h5>Example publications</h5>
                        <ul class="dot-list">
                            <li>Lu et al. <a href="/pubmed/22151901" target="_blank" rel="noopener">The gene normalization task in BioCreative III.</a> BMC Bioinformatics, 2011.</li>
                            <li>Comeau et al. <a href="/pubmed/24048470" target="_blank" rel="noopener">BioC: a minimalist approach to interoperability for biomedical text processing.</a> Database (Oxford), 2013.</li>
                            <li>Mao et al. <a href="/pubmed/25157073" target="_blank" rel="noopener">Overview of the gene ontology task at BioCreative IV.</a> Database (Oxford), 2014.</li>
                            <li>Wei et al. <a href="/pubmed/26994911" target="_blank" rel="noopener">Assessing the state of the art in biomedical relation extraction: overview of the BioCreative V chemical-disease relation (CDR) task.</a> Database (Oxford), 2016.</li>
                        </ul>
                    </div>
                </div>
            </div>

        </div>
    </main>
</div>

                </main>
            </div>
        </div>
    </div>

    <!-- Site footer: parent-organization logos in the first half, policy links in the second -->
    <footer class="usa-footer usa-footer-big ncbi-footer" role="contentinfo">
        <div class="usa-grid">
            <div class="usa-row">

                <!-- Organization logos; each link holds a logo image plus a usa-sr-only text span -->
                <div class="usa-width-one-half">
                    <div>
                        <div class="org-section">
                            <a href="https://www.hhs.gov/"><img class="usa-footer-logo-img hhs-logo" src="/research/bionlp/static/base/images/dhhs-logo-white.svg" alt="U.S. Department of Health &amp; Human Services">
                                <span class="usa-sr-only">Department of Health and Human Services</span></a>
                            <a href="https://www.nih.gov/"><img class="usa-footer-logo-img nih-logo" src="/research/bionlp/static/base/images/nih-logo-white.svg" alt="National Institutes of Health">
                                <span class="usa-sr-only">National Institutes of Health</span></a>
                            <a href="https://www.nlm.nih.gov/"><img class="usa-footer-logo-img nlm-logo" src="/research/bionlp/static/base/images/nlm-logo-letters-white.svg" alt="National Library of Medicine">
                                <span class="usa-sr-only">National Library of Medicine</span></a>
                            <a href="https://www.usa.gov/"><img class="usa-footer-logo-img usagov-logo" src="/research/bionlp/static/base/images/usagov-logo-white.svg" alt="USA.gov">
                                <span class="usa-sr-only">USA.gov</span></a>
                        </div>
                    </div>
                </div>

                <!-- About / policy links -->
                <div class="usa-width-one-half">
                    <div>
                        <p class="about-links">
                            <a href="https://www.nlm.nih.gov/research/index.html">About DIR</a>
                            <a href="https://www.nlm.nih.gov/web_policies.html">Web Policies</a></p>
                    </div>
                </div>

            </div>
        </div>
    </footer>
</div>


    <!-- JavaScript loaded at the end of the body, in this order: the USWDS bundle, then header.js -->
    <script src="/research/bionlp/static/django_uswds/uswds/js/uswds.js"></script>
    <script src="/research/bionlp/static/base/header.js"></script>

</body>
</html>