initial commit

This commit is contained in:
Lisa Williams 2025-02-01 23:08:47 -05:00
commit 4808b983b1
4 changed files with 310 additions and 0 deletions

54
.gitignore vendored Normal file
View file

@ -0,0 +1,54 @@
# Compiled source #
###################
*.com
*.class
*.dll
*.exe
*.o
*.so
# Packages #
############
# it's better to unpack these files and commit the raw source
# git has its own built in compression methods
*.7z
*.dmg
*.gz
*.iso
*.jar
*.rar
*.tar
*.zip
# Logs and databases #
######################
*.log
*.sql
*.sqlite
# OS generated files #
######################
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db
# R files #
###########
# RStudio project files use a capital R (*.Rproj); the lowercase pattern is
# kept for file systems where it was already matching.
*.Rproj
*.rproj
# RStudio per-user state and R command history
.Rproj.user
.Rhistory
#Image files#
*.png
# Files to ignore because they are huge files
RePORTER_PRJ*
*.csv

159
NIH_Grants.R Normal file
View file

@ -0,0 +1,159 @@
# get in we're counting grants
# for science
# data source: https://reporter.nih.gov/exporter/projects
# data dictionary: https://report.nih.gov/exporter-data-dictionary
# let's load way too many packages
pacman::p_load(dplyr, tidyr, janitor, tidyr, ggplot2, rio, here, geofacet, DT, RColorBrewer, ggiraph, readr, forcats)
pacman::p_load(leaflet, glue, sf, tmap, tmaptools, tidycensus, ggmap, htmltools, htmlwidgets)
pacman::p_load_gh(c("walkerke/tigris", "bhaskarvk/leaflet.extras"))
# Point data_dir at the folder that holds the annual CSV files.
# If you don't know the path, navigate to that folder in Terminal and
# type "pwd", then copy and paste the result below -
# the placeholder is just sample code and won't work without your path.
data_dir <- "/path/to/my/data"
# full.names = TRUE makes list.files() return complete paths rather than bare
# file names, so read_csv() finds the files regardless of the current working
# directory and no setwd() call is needed.
NIH_files <- list.files(
  path = data_dir,
  pattern = "\\.csv$",
  full.names = TRUE,
  ignore.case = TRUE
)
# fail early with a clear message instead of a confusing error further down
if (length(NIH_files) == 0) {
  stop("No CSV files found in ", data_dir, call. = FALSE)
}
# read every yearly file and stack them into one data frame
NIH_Data <- NIH_files %>%
  lapply(read_csv) %>%
  bind_rows()
is.data.frame(NIH_Data)
# should return TRUE
head(NIH_Data)
# should print the first rows of the combined data
# Build a data frame containing only the rows for one state.
# Swap MYSTATE_ABBREVIATION for your state's 2 letter abbreviation
# (CO, MN, etc.) here and everywhere else in this script.
MYSTATE_ABBREVIATION_NIH_data <- filter(
  NIH_Data,
  ORG_STATE == "MYSTATE_ABBREVIATION"
)
is.data.frame(MYSTATE_ABBREVIATION_NIH_data)
# returns TRUE
# Bar charts of funding by year are easier to build when the year sits in
# its own column, so break AWARD_NOTICE_DATE into year, month and day columns.
# First for the full data set:
NIH_data_split_year <- separate(
  NIH_Data,
  AWARD_NOTICE_DATE,
  c("AWARD_NOTICE_DATE_YEAR", "AWARD_NOTICE_DATE_MONTH", "AWARD_NOTICE_DATE_DAY")
)
# and then the same split for the single-state data:
MYSTATE_ABBREVIATION_NIH_data_split_year <- separate(
  MYSTATE_ABBREVIATION_NIH_data,
  AWARD_NOTICE_DATE,
  c("AWARD_NOTICE_DATE_YEAR", "AWARD_NOTICE_DATE_MONTH", "AWARD_NOTICE_DATE_DAY")
)
# both data frames now carry the three new date columns
# let's see if we can make a bar chart of the value of grants awarded by year
# first keep only the award years 2019 through 2024, then group by year.
# The year column is produced by separate() without type conversion, so it is
# converted to an integer for the range test rather than relying on R's
# implicit text-vs-number comparison. The column itself is left unchanged.
MYSTATE_ABBREVIATION_NIH_data_split_year <- MYSTATE_ABBREVIATION_NIH_data_split_year %>%
  filter(as.integer(AWARD_NOTICE_DATE_YEAR) %in% 2019:2024) %>%
  group_by(AWARD_NOTICE_DATE_YEAR)
# check to see if that worked
View(MYSTATE_ABBREVIATION_NIH_data_split_year)
# let's plot a graph of funding to your state from NIH by award year.
# geom_col() already plots the y values as given, so it takes no `stat`
# argument (passing one is ignored with a warning).
plot_MYSTATE_ABBREVIATION_NIH_data <- ggplot(
  MYSTATE_ABBREVIATION_NIH_data_split_year,
  aes(x = AWARD_NOTICE_DATE_YEAR, y = TOTAL_COST)
) +
  geom_col(fill = "dodgerblue") +
  # show the y axis in billions of dollars
  scale_y_continuous(labels = scales::dollar_format(scale = .000000001, suffix = "B")) +
  # the x-axis label doubles as the chart caption
  xlab("NIH research funding to MYSTATE institutions") +
  ylab("")
# now let's theme it and display it
plot_MYSTATE_ABBREVIATION_NIH_data + theme_light()
# this should generate a bar chart showing NIH project funding to your state
# Which organizations in your state received the most funding?
# Take the 2023 awards, group them by organization, and add up TOTAL_COST
# for each one (missing costs are skipped).
MYSTATE_ABBREVIATION_NIH_grants_2023_totals_by_org <-
  filter(MYSTATE_ABBREVIATION_NIH_data_split_year, AWARD_NOTICE_DATE_YEAR == 2023) %>%
  group_by(ORG_NAME) %>%
  summarize(TOTAL_COST = sum(TOTAL_COST, na.rm = TRUE))
View(MYSTATE_ABBREVIATION_NIH_grants_2023_totals_by_org)
# The table above has one row per NIH grantee, with TOTAL_COST holding the
# sum of that grantee's grants.
# Order it from largest to smallest total so it is ready for plotting.
MYSTATE_ABBREVIATION_NIH_grants_by_org_2023_desc <- arrange(
  MYSTATE_ABBREVIATION_NIH_grants_2023_totals_by_org,
  desc(TOTAL_COST)
)
View(MYSTATE_ABBREVIATION_NIH_grants_by_org_2023_desc)
# the result is a table of grant funding to your state's orgs, sorted in
# descending order by how much $ each institution got
# When the data frame holds too many institutions to chart comfortably,
# keep only those whose funding reaches a minimum dollar amount
# (here: 30 million).
TOP_MYSTATE_ABBREVIATION_big_grantees_desc <- MYSTATE_ABBREVIATION_NIH_grants_by_org_2023_desc %>%
  filter(TOTAL_COST >= 30000000)
View(TOP_MYSTATE_ABBREVIATION_big_grantees_desc)
# let's plot the top grantees
# Columns are referenced by bare name inside aes(): ggplot2 looks them up in
# the data frame passed to ggplot(), and warns when `df$col` is used instead.
plot_top_grantees <- ggplot(
  TOP_MYSTATE_ABBREVIATION_big_grantees_desc,
  # reorder() sorts the organizations by their funding total
  aes(x = reorder(ORG_NAME, TOTAL_COST), y = TOTAL_COST)
) +
  # geom_col() is the direct equivalent of geom_bar(stat = "identity")
  geom_col(fill = "dodgerblue") +
  # show the value axis in millions of dollars
  scale_y_continuous(labels = scales::dollar_format(scale = .000001, suffix = "M")) +
  # optional value labels, left disabled:
  # geom_text(aes(label = signif(TOTAL_COST)), nudge_y = 3) +
  # flip the axes so the organization names read horizontally
  coord_flip() +
  labs(x = "", y = "Top NIH grantees in [MY STATE NAME] by grant funding, 2023")
plot_top_grantees
# that produces a chart showing institutions in your state receiving NIH funding,
# ordered by how much $ each institution got
# let's export the top grantees table to a CSV file.
# The object written here must be the TOP_..._big_grantees_desc data frame
# created above; replace the placeholder path with a real one.
write.csv(TOP_MYSTATE_ABBREVIATION_big_grantees_desc, "/path/to/data/MYSTATE_ABBREVIATION_BIG_NIH_GRANTS.csv", row.names = FALSE)
# A sortable, searchable table is another option: keep the 2023 awards,
# group them by organization, and hand the result to datatable().
MYSTATE_ABBREVIATION_NIH_data_2023 <- group_by(
  filter(MYSTATE_ABBREVIATION_NIH_data_split_year, AWARD_NOTICE_DATE_YEAR == 2023),
  ORG_NAME
)
datatable(MYSTATE_ABBREVIATION_NIH_data_2023)
# note that the .gitignore file for this repo filters out .csv files that
# you have created above so that they will not be uploaded to Github.
# you can always edit the .gitignore file if you want to change how
# things are handled.

View file

@ -0,0 +1,71 @@
This is an archive of the text here in case it goes away: https://report.nih.gov/exporter-data-dictionary
RePORTER Project Data
ExPORTER Description RePORTER Search Fields RePORTER -
export
Application_ID A unique identifier of the project record in the RePORTER database. Application ID Application ID
Activity A 3-character code identifying the grant, contract, or intramural activity through which a project is supported. Within each funding mechanism, NIH uses 3-character activity codes (e.g., F32, K08, P01, R01, T32, etc.) to differentiate the wide variety of research-related programs NIH supports. A comprehensive list of activity codes for grants and cooperative agreements may be found on the Types of Grant Programs Web page. RePORTER also includes R&D contracts (activity codes beginning with the letter N) and intramural projects (beginning with the letter Z). Activity Code Activity
Administering_IC Administering Institute or Center - A two-character code to designate the agency, NIH Institute, or Center administering the grant. See Institute/Center code definitions. Agency/ Institute/ Center Administering IC
Application_Type A one-digit code to identify the type of application funded: 1 = New application 2 = Competing continuation (also, competing renewal) 3 = Application for additional (supplemental) support. There are two kinds of type 3 competing revisions (which are peer-reviewed and administrative supplements) 4 = Competing extension for an R37 award or first non-competing year of a Fast-Track SBIR/STTR award 5 = Non-competing continuation 7 = Change of grantee institution 9 = Change of NIH awarding Institute or Division (on a competing continuation) Award Type Type
ARRA_Funded “Y” indicates a project supported by funds appropriated through the American Recovery and Reinvestment Act of 2009. NIH ARRA Selection ARRA Indicator
Award_Notice_Date Award notice date or Notice of Grant Award (NGA) is a legally binding document stating the government has obligated funds and which defines the period of support and the terms and conditions of award. Award Notice Date Award Notice Date
Budget_Start The date when a project's funding for a particular fiscal year begins. NA Budget Start Date
Budget_End The date when a project's funding for a particular fiscal year ends. NA Budget End Date
CFDA_Code Federal programs are assigned a number in the Catalog of Federal Domestic Assistance (CFDA), which is referred to as the "CFDA code." The CFDA database helps the Federal government track all programs it has domestically funded NA CFDA Code
Core_Project_Num An identifier for each research project, used to associate the project with publication and patent records. This identifier is not specific to any particular year of the project. It consists of the project activity code, administering IC, and serial number (a concatenation of Activity, Administering_IC, and Serial_Number). Project Number NA
ED_Inst_Type Generic name for the grouping of components across an institution who has applied for or receives NIH funding. The official name as used by NIH is Major Component Combining Name Organization Type Organization Type
FOA_Number The number of the funding opportunity announcement, if any, under which the project application was solicited. Funding opportunity announcements may be categorized as program announcements, requests for applications, notices of funding availability, solicitations, or other names depending on the agency and type of program. Funding opportunity announcements can be found at Grants.gov/FIND and in the NIH Guide to Grants and Contracts. FOA FOA
Full_Project_Num Commonly referred to as a grant number, intramural project, or contract number. For grants, this unique identification number is composed of the type code, activity code, Institute/Center code, serial number, support year, and (optional) a suffix code to designate amended applications and supplements. Project Number Project Number
Funding_IC(s) The NIH Institute or Center(s) providing funding for a project are designated by their acronyms (see Institute/Center acronyms). Each funding IC is followed by a colon (:) and the amount of funding provided for the fiscal year by that IC. Multiple ICs are separated by semicolons (;). Project funding information is available only for NIH, CDC, FDA, and ACF projects. Agency/ Institute/ Center Funding IC(s)
Funding_Mechanism The major mechanism categories used in NIH Budget mechanism tables for the President's budget. Extramural research awards are divided into three main funding mechanisms: grants, cooperative agreements and contracts. A funding mechanism is the type of funded application or transaction used at the NIH. Within each funding mechanism NIH includes programs. Programs can be further refined by specific activity codes. Funding Mechanism Funding Mechanism
FY The fiscal year appropriation from which project funds were obligated. Fiscal Year Fiscal Year
IC_Name Full name of the administering agency, Institute, or Center. Agency/ Institute/ Center IC
NIH_Spending_Cats Congressionally-mandated reporting categories into which NIH projects are categorized. Available for fiscal years 2008 and later. Each project's spending category designations for each fiscal year are made available the following year as part of the next President's Budget request. See the Research, Condition, and Disease Categorization System for more information on the categorization process. NIH Spending Category NIH Spending Categorization
Org_City The city in which the business office of the grantee organization or contractor is located. Note that this may be different from the research performance site. For all NIH intramural projects, Bethesda, MD is used. City Organization City
Org_Country The country in which the business office of the grantee organization or contractor is located. Note that this may be different from the research performance site. Country Organization Country
Org_DEPT The departmental affiliation of the contact principal investigator for a project, using a standardized categorization of departments. Names are available only for medical school departments. Department Type Department
Org_District The congressional district in which the business office of the grantee organization or contractor is located. Note that this may be different from the research performance site. Congressional District Congressional District
Org_DUNS This field may contain multiple DUNS Numbers separated by a semi-colon. The Data Universal Numbering System is a unique nine-digit number assigned by Dun and Bradstreet Information Services, recognized as the universal standard for identifying and keeping track of business worldwide. NA DUNS Number
Org_FIPS The country code of the grantee organization or contractor as defined in the Federal Information Processing Standard. NA FIPS
ORG_IPF_CODE The Institution Profile (IPF) number is an internal NIH identifier that uniquely identifies and associates institutional information within NIH electronic systems. The NIH assigns an IPF number after the institution submits its request for registration. NA Organization ID (IPF)
Org_Name The name of the educational institution, research organization, business, or government agency receiving funding for the grant, contract, cooperative agreement, or intramural project.
Institutional Lookup This file is provided as an analytical resource, encompassing historical data snapshots, and aims to provide look-up values for any IPF code and DUNS number that are present in an ExPORTER file. As expected in such a longitudinal file, a single IPF may be associated with multiple DUNS or vice versa; Also, IPF and DUNS may be associated with multiple institution names as institutions have changed their structure and name over time. The file represents the data at the time the annual ExPORTER files are created. Organization Organization Name
Org_State The state in which the business office of the grantee organization or contractor is located. Note that this may be different from the research performance site. State Organization State
Org_Zipcode The zip code in which the business office of the grantee organization or contractor is located. Note that this may be different from the research performance site. NA Organization Zip
PHR Submitted as part of a grant application, this statement articulates a project's potential to improve public health. NA Public Health Relevance
PI_ID(s) A unique identifier for each of the project Principal Investigators. Each PI in the RePORTER database has a unique identifier that is constant from project to project and year to year, but changes may be observed for investigators that have had multiple accounts in the past, particularly for those associated with contracts or sub-projects. Principal Investigator (PI) Contact PI Person ID
PI_Name(s) The name(s) of the Principal Investigator(s) designated by the organization to direct the research project. Principal Investigator (PI) Contact PI / Project Leader; Other PI or Project Leader(s)
Program_Officer_Name An Institute staff member who coordinates the substantive aspects of a contract from planning the request for proposal to oversight. Program Officer (PO) Program Official Information
Project_Start The start date of a project. For subprojects of a multi-project grant, this is the start date of the parent award. Project Start Date Project Start Date
Project_End The current end date of the project, including any future years for which commitments have been made. For subprojects of a multi-project grant, this is the end date of the parent award. Upon competitive renewal of a grant, the project end date is extended by the length of the renewal award. Project End Date Project End Date
Project_Terms Prior to fiscal year 2008, these were thesaurus terms assigned by NIH CRISP indexers. For projects funded in fiscal year 2008 and later, these are concepts that are mined from the project's title, abstract, and specific aims using an automated text mining tool. Text Search Project Terms
Project_Title Title of the funded grant, contract, or intramural (sub)project. Text Search Project Title
Serial_Number A six-digit number assigned in serial number order within each administering organization. Project Number Serial Number
Study_Section A designator of the legislatively-mandated panel of subject matter experts that reviewed the research grant application for scientific and technical merit. Study Section NA
Study_Section_Name The full name of a regular standing Study Section that reviewed the research grant application for scientific and technical merit. Applications reviewed by panels other than regular standing study sections are designated by “Special Emphasis Panel.” Study Section Study Section
Subproject_ID A unique numeric designation assigned to subprojects of a “parent” multi-project research grant. NA Subproject Number
Suffix A suffix to the grant application number that includes the letter "A" and a serial number to identify an amended version of an original application and/or the letter "S" and serial number indicating a supplement to the project. Project Number Suffix
Support_Year The year of support for a project, as shown in the full project number. For example, a project with number 5R01GM0123456-04 is in its fourth year of support. Project Number Support Year
Direct_Cost_Amt Total direct cost funding for a project from all NIH Institute and Centers for a given fiscal year. Costs are available only for NIH awards funded in FY 2012 onward. Direct cost amounts are not available for SBIR/STTR awards. NA Direct Cost IC
Indirect_Cost_Amt Total indirect cost funding for a project from all NIH Institute and Centers for a given fiscal year. Costs are available only for NIH awards funded in FY 2012 and onward. Indirect cost amounts are not available for SBIR/STTR awards. NA InDirect Cost IC
Total_Cost Total project funding from all NIH Institute and Centers for a given fiscal year. Costs are available only for:
NIH, CDC, FDA, and ACF grant awards (only the parent record of multi-project grants).
NIH intramural projects (activity codes beginning with “Z”) in FY 2007 and later fiscal years.
NIH contracts (activity codes beginning with “N”).
For multi-project grants, Total_Cost includes funding for all of the constituent subprojects. This field will be blank on subproject records; the total cost of each subproject is found in Total_Cost_Sub_Project. Award Size Total Cost
Total_Cost_Sub_Project Applies to subproject records only. Total funding for a subproject from all NIH Institute and Centers for a given fiscal year. Costs are available only for NIH awards. Award Size Total Cost (Sub Projects)
NA The geographic latitude coordinate of an Organization. NA Latitude
NA The geographic longitude coordinate of an Organization. NA Longitude
NA Projects awarded to study COVID-19 and related topics, as funded under:
RegCV - NIH regular appropriations funding
CV - Coronavirus Preparedness and Response Supplemental Appropriations Act, 2020
C3 - CARES Act (Coronavirus Aid, Relief, and Economic Security Act)
C4 - Paycheck Protection Program and Health Care Enhancement Act
C5 - Coronavirus Response and Relief Supplemental Appropriations Act, 2021
C6 - American Rescue Plan Act of 2021
NIH COVID-19 Response NIH COVID-19 Response
NA When there are multiple Principal Investigators/Project Leaders, other Principal Investigators/Project Leaders are assigned to the project besides the main contact Principal Investigator. Principal Investigator (PI) Other PI or Project Leader(s)
NA Total project funding from one NIH Institute or Center for a project in a given fiscal year. Award Size Total Cost IC
NA Primary DUNS number of an organization when there are multiple DUNS assigned to the organization. NA Primary DUNS
NA The Unique Entity Identifier (UEI) assigned to all entities (public and private companies, individuals, institutions, or organizations) who must register to do business with the federal government in SAM. Update: Notification of Upcoming Change in Federal-wide Unique Entity Identifier Requirements. NA UEI
NA Primary UEI of an organization when there are multiple UEI assigned to the organization. NA Primary UEI

26
readme.md Normal file
View file

@ -0,0 +1,26 @@
# NIH Project Data, 2019-2024
The following repository contains links and some sample code written in R to help you parse NIH project data.
## NIH Project Data
The raw data in .CSV format is rather large, so it is stored here: https://drive.google.com/drive/folders/1iE3hYTTO7IXaBadpOJT9wL1VmBLJ3Wpc?usp=sharing
The original data can be found on the NIH website at the following URL: https://reporter.nih.gov/exporter/projects
The data dictionary, defining each column in the CSV, is available here: https://report.nih.gov/exporter-data-dictionary
For your convenience and so you can see it locally, I've replicated the data dictionary in this repo in the text file named NIH_RePORTER_Project_Data_Dictionary.
## R sample code for parsing this data and making simple plots
The sample code contained here will help you do some basic data cleanup, like combining the .CSV files for each year of the RePORTER data into a single dataframe, and separating date columns into year, month, and day columns to make them easier to work with.
Please note that I am an #RStats beginner. I do not do programming for a living. You are welcome, of course, to modify this code or write your own to interact with the NIH data. Please be kind with your feedback, recognizing that I have limited time to devote to projects like this one.