nih-gov/www.ncbi.nlm.nih.gov/research/bionlp/APIs/BioC-PMC/index.html

376 lines
No EOL
16 KiB
HTML

<!DOCTYPE html>
<html lang="en" >
<head>
  <!-- Document metadata: character encoding, legacy-IE rendering mode, mobile/viewport hints. -->
  <meta charset="UTF-8">
  <meta http-equiv="X-UA-Compatible" content="IE=edge">
  <meta name="HandheldFriendly" content="True">
  <meta name="MobileOptimized" content="320">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">

  <title>BioC API for PMC Open Access</title>

  <!-- Stylesheets. The order below is the original load order and must be kept,
       because later sheets can override earlier ones in the cascade. -->
  <link rel="stylesheet" href="/research/bionlp/static/django_uswds/uswds/css/uswds.css">
  <link rel="stylesheet" href="/research/bionlp/static/main/css/uswds.css">
  <link rel="stylesheet" href="/research/bionlp/static/main/css/header.css">
  <link rel="stylesheet" href="/research/bionlp/static/main/css/footer.css">
  <link rel="stylesheet" href="/research/bionlp/static/main/css/form.css">

  <!-- Labs template styles -->
  <link rel="stylesheet" href="/research/bionlp/static/main/css/atoms.css">
  <link rel="stylesheet" href="/research/bionlp/static/main/css/docsum.css">
  <link rel="stylesheet" href="/research/bionlp/static/main/css/media.css">

  <!-- Additional template styles -->
  <link rel="stylesheet" href="/research/bionlp/static/main/css/journals.molecules.css">
  <link rel="stylesheet" href="/research/bionlp/static/main/css/custom.css">
  <link rel="stylesheet" href="/research/bionlp/static/main/css/journals.journal-page.css">
  <link rel="stylesheet" href="/research/bionlp/static/main/css/iconic-glyphs.css">
  <link rel="stylesheet" href="/research/bionlp/static/main/css/featherlight.min.css">
  <link rel="stylesheet" href="/research/bionlp/static/main/css/styles.css">

  <!-- Legacy glyph sheet, only picked up by Internet Explorer older than 9. -->
  <!--[if lt IE 9]>
  <link rel="stylesheet" href="/research/bionlp/static/main/css/iconic-glyphs-legacy.css">
  <![endif]-->

  <!-- Scripts loaded synchronously from the head, kept in their original order
       (jquery.js first). Do not add defer/async here without checking the scripts
       loaded at the end of the body. -->
  <script src="/research/bionlp/static/main/js/jquery.js"></script>
  <script src="/research/bionlp/static/main/js/modernizr.js"></script>
  <script src="/research/bionlp/static/main/js/featherlight.min.js"></script>
  <script src="/research/bionlp/static/main/js/custom.js"></script>
</head>
<body >
<div>
<!-- Skip link: jumps straight to the element with id="maincontent". -->
<a class="skipnav" href="#maincontent">
  Skip to main page content
</a>
<!-- Site banner: NIH / NLM branding, the division name and the log-in link. -->
<header class="ncbi-page-header" role="banner">
  <div class="prefix">
    <span class="nih" title="National Institutes of Health">
      <a href="https://www.nih.gov/" title="To NIH homepage">
        <img src="/research/bionlp/static/base/images/nih-logo-header.svg" alt="NIH">
      </a>
    </span>
    <span class="nlm">
      <a href="https://www.nlm.nih.gov/" title="To NLM homepage">U.S. National Library of Medicine</a>
    </span>
  </div>
  <div class="ncbi">
    <!-- Disabled NCBI branding (kept for reference):
    <abbr class="abbr">
      <a href="https://www.ncbi.nlm.nih.gov/" title="To NCBI homepage">NCBI</a>
    </abbr>
    <span class="name">
      <a href="https://www.ncbi.nlm.nih.gov/" accesskey="1" title="To NCBI homepage">National Center for Biotechnology Information</a>
    </span> -->
    <!-- Disabled DIR abbreviation (kept for reference):
    <abbr class="abbr">
      <a href="https://www.nlm.nih.gov/research/index.html" title="To DIR homepage">DIR</a>
    </abbr> -->
    <span class="name">
      <a href="https://www.nlm.nih.gov/research/index.html" accesskey="1" title="To DIR homepage">Division of Intramural Research</a>
    </span>
    <!-- Log-in link; after login the user is sent back to /research/bionlp/. -->
    <div class="right">
      <a id="in" href="/research/bionlp/accounts/login/?next=/research/bionlp/">Log in</a>
    </div>
  </div>
</header>
<!-- App-specific header: full-width site navigation bar. -->
<a class="skipnav" href="#maincontent">
  Skip to main page content
</a>
<!-- role/aria-label expose this container as a navigation landmark without
     changing the element type, so existing div/class-based styling keeps applying. -->
<div class="breadcrumbs-container menu" role="navigation" aria-label="Site">
  <div class="usa-grid-full">
    <ul class="topnav" accesskey="4">
      <li class="current">
        <a href="/research/bionlp/" title="Home">
          Home
        </a>
      </li>
      <li class="separator"></li>
      <li>
        <a href="/research/bionlp/Zhiyong-Lu" title="Zhiyong Lu">
          Zhiyong Lu
        </a>
      </li>
      <li class="separator"></li>
      <li>
        <a href="/research/bionlp/News" title="Media">
          Media
        </a>
      </li>
      <li class="separator"></li>
      <li>
        <a href="/research/bionlp/Team" title="Team">
          Team
        </a>
      </li>
      <li class="separator"></li>
      <li>
        <a href="/research/bionlp/Research" title="Research">
          Research
        </a>
      </li>
      <li class="separator"></li>
      <li>
        <a href="/research/bionlp/Publications/" title="Publications">
          Publications
        </a>
      </li>
      <li class="separator"></li>
      <li>
        <a href="/research/bionlp/Tools/" title="Tools">
          Tools
        </a>
      </li>
      <li>
        <!-- title previously read "Tools" (copied from the item above). -->
        <a href="/research/bionlp/APIs/" title="Web APIs">
          Web APIs
        </a>
      </li>
      <li class="separator"></li>
      <li>
        <a href="/research/bionlp/Data/" title="Data">
          AI Datasets
        </a>
      </li>
      <li>
        <a href="/research/bionlp/Visiting-us" title="Visiting us">
          Visiting us
        </a>
      </li>
      <li class="icon">
        <!-- The link content is only the glyph U+2630, so give it an accessible name.
             NOTE(review): its click behaviour is not defined in this file; presumably
             bound by one of the loaded scripts - confirm before changing the element. -->
        <a href="#" aria-label="Menu">&#9776;</a>
      </li>
    </ul>
  </div>
</div>
<!-- Main page content; target of the "Skip to main page content" links.
     The id/class are assigned so an app can alter the styles of this div. -->
<div id="maincontent" class="usa-grid-full ncbi-base-page-container">
  <div class="labs-pagecontent">
    <div class="usa-width-one-whole">
      <!-- The only <main> landmark of the page. The inner container used to be a second,
           nested <main>, which is invalid; it is now a <div> with the same classes.
           NOTE(review): headings start at h3 with no h1/h2; left unchanged because
           changing levels may affect styling - confirm against the stylesheets. -->
      <main class="usa-grid journals-lists">
        <h3>BioC API for PMC Open Access</h3>
        <div class="usa-width-one-whole journal-container">
          <div>

            <!-- Overview: what the service provides, how to cite it, source collections. -->
            <div class="issue labs-docsums labs-content-box wrappall">
              <h4>PubMed Central Open Access in BioC format (<a href="/research/bionlp/APIs/BioC-PubMed/">click here for accessing PubMed articles</a>)</h4>
              <div class="usa-width-one-whole">
                <p>
                  All the PubMed Central (PMC) Open Access articles are available in the <a href="http://bioc.sourceforge.net" target="_blank" rel="noopener noreferrer">BioC</a> format. This provides a large number of full text research articles for text mining and information retrieval research. BioC is a simple format designed for straightforward text processing.
                  These articles are available in BioC XML or BioC JSON, in Unicode or ASCII, and via PubMed ID or PMC ID.
                </p>
                <!-- Lists are siblings of the paragraphs: a <ul> cannot be nested inside <p>. -->
                <p>
                  If you use this resource, please cite:
                </p>
                <ul class="dot-list">
                  <li><a href="https://academic.oup.com/bioinformatics/advance-article-abstract/doi/10.1093/bioinformatics/btz070/5305021">Comeau DC, Wei CH, Islamaj Doğan R, and Lu Z. PMC text mining subset in BioC: about 3 million full text articles and growing, <i>Bioinformatics</i>, btz070, 2019.</a></li>
                </ul>
                <p>
                  Articles available from this service are in the <a href="https://www.ncbi.nlm.nih.gov/pmc/tools/openftlist/">PMC Open Access Subset</a> and the <a href="https://www.ncbi.nlm.nih.gov/pmc/about/mscollection/">PMC Author Manuscript Collection</a>. Information about these collections is available on the
                  following pages.
                </p>
                <ul class="dot-list">
                  <li>PMC Open Access Subset:&emsp;<a href="https://www.ncbi.nlm.nih.gov/pmc/tools/openftlist/">https://www.ncbi.nlm.nih.gov/pmc/tools/openftlist/</a></li>
                  <li>PMC Author Manuscript Collection:&emsp;<a href="https://www.ncbi.nlm.nih.gov/pmc/about/mscollection/">https://www.ncbi.nlm.nih.gov/pmc/about/mscollection/</a></li>
                </ul>
                <!-- The file-list links use the HTTPS front end of the NCBI FTP server (same paths),
                     because current browsers no longer open ftp:// URLs. -->
                <p>
                  Not all PMC articles are available in these collections. Lists of articles in the collections are available via <a href="https://ftp.ncbi.nlm.nih.gov/pub/pmc/">FTP</a>.
                </p>
                <ul class="dot-list">
                  <li>Complete Open Access Subset:&emsp;<a href="https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_file_list.txt">https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_file_list.txt</a></li>
                  <li>Commercial Use Collection:&emsp;<a href="https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_comm_use_file_list.txt">https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_comm_use_file_list.txt</a></li>
                  <li>PMC Author Manuscript Collection:&emsp;<a href="https://ftp.ncbi.nlm.nih.gov/pub/pmc/manuscript/filelist.txt">https://ftp.ncbi.nlm.nih.gov/pub/pmc/manuscript/filelist.txt</a></li>
                </ul>
                <p>
                  These files are also available in the CSV format. A description of the <a href="https://www.ncbi.nlm.nih.gov/pmc/tools/ftp/">FTP Service</a> is available from:&emsp;<a href="https://www.ncbi.nlm.nih.gov/pmc/tools/ftp/">https://www.ncbi.nlm.nih.gov/pmc/tools/ftp/</a>.</p>
                <p>
                  Articles in the BioC API for PMC Open Access are usually updated within
                  24 hours of these files being updated.
                </p>
              </div>
            </div>

            <!-- URL template of the REST endpoint, its parameters and sample requests.
                 Links that open a new tab carry rel="noopener". -->
            <div class="issue labs-docsums labs-content-box wrappall">
              <h4>Instructions</h4>
              <div class="usa-width-one-whole">
<pre><code>https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_[format]/[ID]/[encoding]</code></pre>
                The parameters are:
                <ul class="dot-list">
                  <li><b>format</b>: xml or json</li>
                  <li><b>ID</b>: PubMed ID (such as 17299597) or PMC ID (such as PMC1790863)</li>
                  <li><b>encoding</b>: unicode or ascii</li>
                </ul>
                <p>
                  Sample URL:<br>
                  <a href="https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_xml/17299597/unicode" target="_blank" rel="noopener">
                    https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_xml/17299597/unicode
                  </a>
                </p>
                <p>
                  Same article in ASCII:<br>
                  <a href="https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_xml/17299597/ascii" target="_blank" rel="noopener">
                    https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_xml/17299597/ascii
                  </a><br>
                  Obviously, no Unicode to ASCII translation is perfect. We have found this one useful.
                </p>
                <p>
                  JSON instead of XML:<br>
                  <a href="https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/17299597/unicode" target="_blank" rel="noopener">
                    https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/17299597/unicode
                  </a><br>
                  BioC JSON follows the same structure as BioC XML.
                </p>
                <p>
                  Using PMC ID instead of PubMed ID:<br>
                  <a href="https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_xml/PMC1790863/unicode" target="_blank" rel="noopener">
                    https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_xml/PMC1790863/unicode
                  </a>
                </p>
              </div>
            </div>

            <!-- Bulk download location. -->
            <div class="issue labs-docsums labs-content-box wrappall">
              <h4>Bulk Download</h4>
              <div class="usa-width-one-whole">
                <p>
                  BioC PMC articles can be downloaded in bulk from the FTP site:<br>
                  <a href="https://ftp.ncbi.nlm.nih.gov/pub/wilbur/BioC-PMC" target="_blank" rel="noopener">
                    https://ftp.ncbi.nlm.nih.gov/pub/wilbur/BioC-PMC
                  </a>
                </p>
              </div>
            </div>

            <!-- Format documentation. The DTD and key file are linked over HTTPS, matching
                 the bulk-download link, because current browsers no longer open ftp:// URLs. -->
            <div class="issue labs-docsums labs-content-box wrappall">
              <h4>More information</h4>
              <div class="usa-width-one-whole">
                <p>
                  General information about BioC XML structure:<br>
                  <a href="https://ftp.ncbi.nlm.nih.gov/pub/wilbur/BioC-PMC/BioC.dtd" target="_blank" rel="noopener">
                    https://ftp.ncbi.nlm.nih.gov/pub/wilbur/BioC-PMC/BioC.dtd
                  </a>
                </p>
                <p>
                  Specific information about BioC-PMC:<br>
                  <a href="https://ftp.ncbi.nlm.nih.gov/pub/wilbur/BioC-PMC/pmc.key" target="_blank" rel="noopener">
                    https://ftp.ncbi.nlm.nih.gov/pub/wilbur/BioC-PMC/pmc.key
                  </a>
                </p>
                <p>
                  Main BioC web page:<br>
                  <a href="http://bioc.sourceforge.net" target="_blank" rel="noopener noreferrer">
                    http://bioc.sourceforge.net
                  </a>
                </p>
              </div>
            </div>

            <!-- Contact addresses for problem reports. -->
            <div class="issue labs-docsums labs-content-box wrappall">
              <h4>Caution</h4>
              <div class="usa-width-one-whole">
                <p>
                  If you experience any problems, please share them with us: <a href="mailto:donald.comeau@nih.gov">donald.comeau@nih.gov</a> or <a href="mailto:zhiyong.lu@nih.gov">zhiyong.lu@nih.gov</a>.
                </p>
              </div>
            </div>

          </div>
        </div>
      </main>
    </div>
  </div>
</div>
<!-- Site footer: agency logo links on the left, "about" links on the right. -->
<footer class="usa-footer usa-footer-big ncbi-footer" role="contentinfo">
  <div class="usa-grid">
    <div class="usa-row">
      <div class="usa-width-one-half">
        <div>
          <!-- Each logo link is named by the text span next to the image, so the images
               use empty alt text; otherwise assistive technology announces the name twice.
               NOTE(review): the spans are presumably visually hidden via the usa-sr-only
               class - confirm in the stylesheets. -->
          <div class="org-section">
            <a href="https://www.hhs.gov/"><img class="usa-footer-logo-img hhs-logo"
                src="/research/bionlp/static/base/images/dhhs-logo-white.svg"
                alt="">
              <span class="usa-sr-only">Department of Health and Human Services</span></a>
            <a href="https://www.nih.gov/"><img class="usa-footer-logo-img nih-logo"
                src="/research/bionlp/static/base/images/nih-logo-white.svg"
                alt="">
              <span class="usa-sr-only">National Institutes of Health</span></a>
            <a href="https://www.nlm.nih.gov/"><img class="usa-footer-logo-img nlm-logo"
                src="/research/bionlp/static/base/images/nlm-logo-letters-white.svg"
                alt="">
              <span class="usa-sr-only">National Library of Medicine</span></a>
            <a href="https://www.usa.gov/"><img class="usa-footer-logo-img usagov-logo"
                src="/research/bionlp/static/base/images/usagov-logo-white.svg"
                alt="">
              <span class="usa-sr-only">USA.gov</span></a>
          </div>
        </div>
      </div>
      <div class="usa-width-one-half">
        <div>
          <p class="about-links">
            <a href="https://www.nlm.nih.gov/research/index.html">About DIR</a>
            <a href="https://www.nlm.nih.gov/web_policies.html">Web Policies</a></p>
        </div>
      </div>
    </div>
  </div>
</footer>
</div>
<!-- Scripts loaded at the end of the body, in their original order. -->
<script src="/research/bionlp/static/django_uswds/uswds/js/uswds.js"></script>
<script src="/research/bionlp/static/base/header.js"></script>
</body>
</html>