NIH_Data/NIH_Grants.R
2025-02-01 23:25:04 -05:00

186 lines
6.8 KiB
R

# get in we're counting grants
# for science
# data source: https://reporter.nih.gov/exporter/projects
# data dictionary: https://report.nih.gov/exporter-data-dictionary
# let's load way too many packages
pacman::p_load(dplyr, tidyr, janitor, tidyr, ggplot2, rio, here, geofacet, DT, RColorBrewer, ggiraph, readr, forcats)
pacman::p_load(leaflet, glue, sf, tmap, tmaptools, tidycensus, ggmap, htmltools, htmlwidgets)
pacman::p_load_gh(c("walkerke/tigris", "bhaskarvk/leaflet.extras"))
# Set your working directory here. If you don't know the path to it,
# navigate in Terminal to the folder you saved this R script in and
# type "pwd". Copy and paste the path it prints between the quotes below -
# this is just sample code and won't work without your path.
setwd("")
# Read every annual file in the data folder and stack them into one data frame.
# The path below has to be to the folder with the annual CSV files in it.
# full.names = TRUE makes list.files() return complete paths rather than bare
# file names, so read_csv() can find the files no matter which folder is the
# current working directory.
NIH_csv_files <- list.files(path = '/path/to/my/data', full.names = TRUE)
# Fail loudly on a wrong or empty folder instead of building an empty table.
if (length(NIH_csv_files) == 0) {
  stop("No data files found - check the path passed to list.files().",
       call. = FALSE)
}
NIH_Data <- NIH_csv_files %>%
  lapply(read_csv) %>%
  bind_rows()
# quick sanity checks: the first should print TRUE, the second shows the
# first few rows so you can eyeball the columns
is.data.frame(NIH_Data)
head(NIH_Data)
# Build a data frame holding only the rows for your state.
# Everywhere MYSTATE_ABBREVIATION appears in this script, swap in the
# 2 letter abbreviation for your state (CO, MN, etc.).
MYSTATE_ABBREVIATION_NIH_data <- filter(
  NIH_Data,
  ORG_STATE == "MYSTATE_ABBREVIATION"
)
# should print TRUE
is.data.frame(MYSTATE_ABBREVIATION_NIH_data)
# Bar charts of funding by year are easier to build from a column that
# holds just the year, so split AWARD_NOTICE_DATE into three columns
# (year, month, day) - first for the full data set...
NIH_data_split_year <- separate(
  NIH_Data,
  col = AWARD_NOTICE_DATE,
  into = c('AWARD_NOTICE_DATE_YEAR', 'AWARD_NOTICE_DATE_MONTH', 'AWARD_NOTICE_DATE_DAY')
)
# ...and then the same split for the single-state data
MYSTATE_ABBREVIATION_NIH_data_split_year <- separate(
  MYSTATE_ABBREVIATION_NIH_data,
  col = AWARD_NOTICE_DATE,
  into = c('AWARD_NOTICE_DATE_YEAR', 'AWARD_NOTICE_DATE_MONTH', 'AWARD_NOTICE_DATE_DAY')
)
# Next step toward a bar chart of grant value by year: keep only the award
# years we care about (2019 through 2024) and group the rows by year.
MYSTATE_ABBREVIATION_NIH_data_split_year <-
  MYSTATE_ABBREVIATION_NIH_data_split_year %>%
  filter(
    AWARD_NOTICE_DATE_YEAR >= 2019,
    AWARD_NOTICE_DATE_YEAR <= 2024
  ) %>%
  group_by(AWARD_NOTICE_DATE_YEAR)
# open the result in the viewer to confirm the filter did what we wanted
View(MYSTATE_ABBREVIATION_NIH_data_split_year)
# Plot NIH funding to your state for each award year kept by the year
# filter earlier in this script (2019 through 2024).
plot_MYSTATE_ABBREVIATION_NIH_data <- ggplot(
  MYSTATE_ABBREVIATION_NIH_data_split_year,
  aes(x = AWARD_NOTICE_DATE_YEAR, y = TOTAL_COST)
) +
  # geom_col() always plots the values as given, so it takes no `stat`
  # argument; passing one only produces an "unknown parameters" warning.
  geom_col(fill = "dodgerblue") +
  # label the y axis in billions of dollars
  scale_y_continuous(labels = scales::dollar_format(scale = .000000001, suffix = "B")) +
  xlab("NIH research funding to MYSTATE institutions") +
  ylab("")
# now let's theme it and display it
plot_MYSTATE_ABBREVIATION_NIH_data + theme_light()
# this should generate a bar chart showing NIH project funding to your state
# Which organizations in your state get the most funding?
# Keep the 2023 awards, then add up TOTAL_COST for each organization
# (missing costs are ignored in the sum).
MYSTATE_ABBREVIATION_NIH_grants_2023_totals_by_org <-
  MYSTATE_ABBREVIATION_NIH_data_split_year %>%
  filter(AWARD_NOTICE_DATE_YEAR == 2023) %>%
  group_by(ORG_NAME) %>%
  summarise(TOTAL_COST = sum(TOTAL_COST, na.rm = TRUE))
View(MYSTATE_ABBREVIATION_NIH_grants_2023_totals_by_org)
# That gives one row per NIH grantee, with TOTAL_COST holding the sum of
# that grantee's grants.
# Sort from largest to smallest total so the table is ready for plotting.
MYSTATE_ABBREVIATION_NIH_grants_by_org_2023_desc <- arrange(
  MYSTATE_ABBREVIATION_NIH_grants_2023_totals_by_org,
  desc(TOTAL_COST)
)
View(MYSTATE_ABBREVIATION_NIH_grants_by_org_2023_desc)
# The result is a table of grant funding to your state's organizations,
# ordered by how much $ each institution got (biggest first).
# If your data frame has too many institutions to graph practically,
# narrow it down to the institutions whose funding total is at or above
# a dollar threshold (here, $30 million).
TOP_MYSTATE_ABBREVIATION_big_grantees_desc <-
  MYSTATE_ABBREVIATION_NIH_grants_by_org_2023_desc %>%
  filter(TOTAL_COST >= 30000000)
View(TOP_MYSTATE_ABBREVIATION_big_grantees_desc)
# Horizontal bar chart of the top grantees, ordered by funding total.
# Columns are referenced by bare name inside aes(): ggplot2 looks them up in
# the data frame passed to ggplot(), and it warns when `df$column` is used
# inside aes() instead.
plot_top_grantees <- ggplot(
  TOP_MYSTATE_ABBREVIATION_big_grantees_desc,
  aes(x = reorder(ORG_NAME, TOTAL_COST), y = TOTAL_COST)
) +
  geom_bar(stat = "identity", fill = "dodgerblue") +
  # label the value axis in millions of dollars
  scale_y_continuous(labels = scales::dollar_format(scale = .000001, suffix = "M")) +
  #geom_text(aes(label = signif(TOTAL_COST)), nudge_y = 3) +
  # flip so the organization names read horizontally
  coord_flip() +
  labs(x = "", y = "Top NIH grantees in [MY STATE NAME] by grant funding, 2023")
plot_top_grantees
# that produces a chart showing institutions in your state receiving NIH funding,
# in descending order by how much $ each institution got
# Export the top grantees table to CSV (replace the placeholder path with
# your own). The object written is the filtered table of big grantees
# created earlier in this script.
write.csv(TOP_MYSTATE_ABBREVIATION_big_grantees_desc, "/path/to/data/MYSTATE_ABBREVIATION_BIG_NIH_GRANTS.csv", row.names = FALSE)
# Another option: a sortable, searchable data table of your state's
# 2023 awards, grouped by organization.
MYSTATE_ABBREVIATION_NIH_data_2023 <- group_by(
  filter(MYSTATE_ABBREVIATION_NIH_data_split_year, AWARD_NOTICE_DATE_YEAR == 2023),
  ORG_NAME
)
datatable(MYSTATE_ABBREVIATION_NIH_data_2023)
# note that the .gitignore file for this repo filters out .csv files that
# you have created above so that they will not be uploaded to Github.
# you can always edit the .gitignore file if you want to change how
# things are handled.
# Do you want to see how your state compares to other states? Try this:
# sum 2023 funding to US organizations by state, largest total first.
NIH_BY_STATE <- NIH_data_split_year %>%
  filter(AWARD_NOTICE_DATE_YEAR == 2023) %>%
  filter(ORG_COUNTRY == "UNITED STATES") %>%
  # NOTE(review): this filter runs before the per-state sum, so it drops
  # individual rows with TOTAL_COST of $1,000,000 or less rather than
  # states with small totals - confirm that is what is intended.
  filter(TOTAL_COST > 1000000) %>%
  group_by(ORG_STATE) %>%
  summarize(TOTAL_COST = sum(TOTAL_COST, na.rm = TRUE)) %>%
  arrange(desc(TOTAL_COST))
# Export the state totals to CSV. Replace the placeholder path with your own,
# as with the other exports in this script.
write.csv(NIH_BY_STATE, "/path/to/data/NIH_BY_STATE_2023.csv", row.names = FALSE)
# Horizontal bar chart of the state totals, ordered by funding total.
# Columns are referenced by bare name inside aes(): ggplot2 looks them up in
# the data frame passed to ggplot(), and it warns when `df$column` is used
# inside aes() instead.
plot_NIH_by_state <- ggplot(
  NIH_BY_STATE,
  aes(x = reorder(ORG_STATE, TOTAL_COST), y = TOTAL_COST)
) +
  geom_bar(stat = "identity", fill = "dodgerblue") +
  # label the value axis in millions of dollars
  scale_y_continuous(labels = scales::dollar_format(scale = .000001, suffix = "M")) +
  #geom_text(aes(label = signif(TOTAL_COST)), nudge_y = 3) +
  # flip so the state abbreviations run down the side
  coord_flip() +
  labs(x = "", y = "") +
  ggtitle("NIH Funding By US State/Territory, 2023",
          subtitle = "Data from NIH RePORTER")
# theme it and display it
plot_NIH_by_state + theme_light()