# Mirror of https://github.com/lisawilliams/NIH_Data.git
# (synced 2025-07-24 13:00:16 -07:00; R, 186 lines, 6.8 KiB)
# get in we're counting grants
|
|
# for science
|
|
|
|
# data source: https://reporter.nih.gov/exporter/projects
|
|
# data dictionary: https://report.nih.gov/exporter-data-dictionary
|
|
|
|
# let's load way too many packages
|
|
pacman::p_load(dplyr, tidyr, janitor, tidyr, ggplot2, rio, here, geofacet, DT, RColorBrewer, ggiraph, readr, forcats)
|
|
pacman::p_load(leaflet, glue, sf, tmap, tmaptools, tidycensus, ggmap, htmltools, htmlwidgets)
|
|
pacman::p_load_gh(c("walkerke/tigris", "bhaskarvk/leaflet.extras"))
|
|
|
|
|
|
# Read every annual ExPORTER file in one folder and stack them into a single
# data frame.
#
# Set data_dir to the folder that holds the annual CSV files. If you don't
# know the full path, navigate to that folder in Terminal and type "pwd",
# then copy and paste the result below. The value here is only a placeholder
# and must be replaced before the script will run.
data_dir <- "/path/to/my/data"

# full.names = TRUE makes list.files() return complete paths rather than bare
# file names, so read_csv() can find the files no matter what the current
# working directory is, and no setwd() call is needed.
csv_files <- list.files(path = data_dir, full.names = TRUE)

# Stop early with a clear message instead of building an empty data frame
# that only fails later, when a column such as ORG_STATE is first used.
if (length(csv_files) == 0) {
  stop("No files found in data_dir: ", data_dir, call. = FALSE)
}

NIH_Data <- csv_files %>%
  lapply(read_csv) %>%
  bind_rows()
|
|
|
|
# Quick sanity checks on the combined data: the first call should print TRUE,
# the second prints the first six rows so you can eyeball the columns.
inherits(NIH_Data, "data.frame")
head(NIH_Data, n = 6L)
|
|
|
|
|
|
# Make a dataframe showing data from just your state
|
|
# for MYSTATE_ABBREVIATION, put in the 2 letter abbreviation for
|
|
# your state (CO, MN, etc.).
|
|
|
|
# replace MYSTATE_ABBREVIATION with your state's 2 letter abbreviation
|
|
# throughout
|
|
|
|
# Keep only the rows whose ORG_STATE equals your state's two-letter
# abbreviation.
MYSTATE_ABBREVIATION_NIH_data <- filter(
  NIH_Data,
  ORG_STATE == "MYSTATE_ABBREVIATION"
)

# Sanity check: should print TRUE.
inherits(MYSTATE_ABBREVIATION_NIH_data, "data.frame")
|
|
|
|
# in order to make bar charts showing funding by year, it will
|
|
# be more convenient to have a column showing just the year.
|
|
# let's separate 'AWARD NOTICE DATE' into three columns:
|
|
|
|
# Split AWARD_NOTICE_DATE into three new columns (year, month, day), in that
# order, for the full national data set.
NIH_data_split_year <- separate(
  NIH_Data,
  col = AWARD_NOTICE_DATE,
  into = c(
    "AWARD_NOTICE_DATE_YEAR",
    "AWARD_NOTICE_DATE_MONTH",
    "AWARD_NOTICE_DATE_DAY"
  )
)

# Same split for your state's data.
MYSTATE_ABBREVIATION_NIH_data_split_year <- separate(
  MYSTATE_ABBREVIATION_NIH_data,
  col = AWARD_NOTICE_DATE,
  into = c(
    "AWARD_NOTICE_DATE_YEAR",
    "AWARD_NOTICE_DATE_MONTH",
    "AWARD_NOTICE_DATE_DAY"
  )
)
|
|
|
|
# let's see if we can make a bar chart of the value of grants awarded by year
|
|
# first let's filter to get the years we want and then group by those years.
|
|
|
|
# Keep awards with a notice year from 2019 through 2024 (inclusive) and group
# the result by year.
#
# as.integer() makes the comparison numeric whether the year column is stored
# as text or as a number, instead of relying on R silently converting 2019
# and 2024 to strings and comparing them alphabetically. Rows whose year is
# missing or not a number become NA and are dropped by filter().
MYSTATE_ABBREVIATION_NIH_data_split_year <-
  MYSTATE_ABBREVIATION_NIH_data_split_year %>%
  filter(
    as.integer(AWARD_NOTICE_DATE_YEAR) >= 2019L,
    as.integer(AWARD_NOTICE_DATE_YEAR) <= 2024L
  ) %>%
  group_by(AWARD_NOTICE_DATE_YEAR)

# check to see if that worked
View(MYSTATE_ABBREVIATION_NIH_data_split_year)
|
|
|
|
|
|
# let's plot a graph of NIH funding to your state for 2019 through 2024
|
|
|
|
# Bar chart of TOTAL_COST by award notice year for your state.
#
# geom_col() always plots the y values as given, so no stat argument is
# passed (supplying one is not a geom_col() parameter). The data has many rows
# per year; geom_col() stacks them by default, so each bar's height is the
# sum of TOTAL_COST for that year.
plot_MYSTATE_ABBREVIATION_NIH_data <- ggplot(
  MYSTATE_ABBREVIATION_NIH_data_split_year,
  aes(x = AWARD_NOTICE_DATE_YEAR, y = TOTAL_COST)
) +
  geom_col(fill = "dodgerblue") +
  # Label the y axis in billions of dollars.
  scale_y_continuous(
    labels = scales::dollar_format(scale = 1e-9, suffix = "B")
  ) +
  xlab("NIH research funding to MYSTATE institutions") +
  ylab("")

# now let's theme it and display it
plot_MYSTATE_ABBREVIATION_NIH_data + theme_light()
# this should generate a bar chart showing NIH project funding to your state
|
|
|
|
|
|
# let's look at which organizations get the most funding in your state
|
|
# Total 2023 funding per organization: one row per ORG_NAME, with TOTAL_COST
# holding the sum of that organization's grants (missing costs are ignored).
MYSTATE_ABBREVIATION_NIH_grants_2023_totals_by_org <-
  filter(
    MYSTATE_ABBREVIATION_NIH_data_split_year,
    AWARD_NOTICE_DATE_YEAR == 2023
  ) %>%
  group_by(ORG_NAME) %>%
  summarise(TOTAL_COST = sum(TOTAL_COST, na.rm = TRUE))

View(MYSTATE_ABBREVIATION_NIH_grants_2023_totals_by_org)
|
|
|
|
# the above does produce a table of NIH grantees, and the TOTAL COST
|
|
# column appears to be a sum of all the grants.
|
|
|
|
# let's sort it descending so we can plot it
|
|
|
|
# Order the per-organization totals from largest to smallest TOTAL_COST.
MYSTATE_ABBREVIATION_NIH_grants_by_org_2023_desc <- arrange(
  MYSTATE_ABBREVIATION_NIH_grants_2023_totals_by_org,
  desc(TOTAL_COST)
)

View(MYSTATE_ABBREVIATION_NIH_grants_by_org_2023_desc)
|
|
|
|
#this should create a table of grant funding to your state's orgs sorted in descending
|
|
#order by how much $ each institution got
|
|
|
|
|
|
# If your dataframe has so many institutions that graphing them all is
# impractical, you can filter to show only institutions that received at
# least a certain dollar amount of funding
|
|
|
|
# Keep only organizations whose 2023 total is at least $30 million; adjust
# the threshold to suit your state.
TOP_MYSTATE_ABBREVIATION_big_grantees_desc <-
  MYSTATE_ABBREVIATION_NIH_grants_by_org_2023_desc %>%
  filter(TOTAL_COST >= 3e7)

View(TOP_MYSTATE_ABBREVIATION_big_grantees_desc)
|
|
|
|
# let's plot the top grantees
|
|
# Horizontal bar chart of the top grantees, ordered by funding.
#
# Columns are referenced by bare name inside aes() so they are looked up in
# the data frame given to ggplot(); writing df$col there bypasses the layer
# data. reorder() sorts the organizations by TOTAL_COST so the bars appear in
# order of funding once the axes are flipped.
plot_top_grantees <- ggplot(
  TOP_MYSTATE_ABBREVIATION_big_grantees_desc,
  aes(x = reorder(ORG_NAME, TOTAL_COST), y = TOTAL_COST)
) +
  # geom_col() plots the y values as given (same as geom_bar with
  # stat = "identity").
  geom_col(fill = "dodgerblue") +
  # Label the value axis in millions of dollars.
  scale_y_continuous(
    labels = scales::dollar_format(scale = .000001, suffix = "M")
  ) +
  #geom_text(aes(label = signif(TOTAL_COST)), nudge_y = 3) +
  coord_flip() +
  labs(x="", y="Top NIH grantees in [MY STATE NAME] by grant funding, 2023")

plot_top_grantees
|
|
# that produces a chart showing institutions in your state receiving NIH funding,
|
|
# in descending order by how much $ each institution got
|
|
|
|
# let's export the grantees
|
|
# Export the filtered top-grantee table. The object written here is the one
# created above (TOP_MYSTATE_ABBREVIATION_big_grantees_desc); replace the
# placeholder path with a real location before running.
write.csv(
  TOP_MYSTATE_ABBREVIATION_big_grantees_desc,
  "/path/to/data/MYSTATE_ABBREVIATION_BIG_NIH_GRANTS.csv",
  row.names = FALSE
)
|
|
|
|
# you can also create a datatable that is sortable and searchable.
|
|
|
|
# All of your state's 2023 award rows, grouped by organization, shown as an
# interactive table.
MYSTATE_ABBREVIATION_NIH_data_2023 <- group_by(
  filter(
    MYSTATE_ABBREVIATION_NIH_data_split_year,
    AWARD_NOTICE_DATE_YEAR == 2023
  ),
  ORG_NAME
)

datatable(MYSTATE_ABBREVIATION_NIH_data_2023)
|
|
|
|
# note that the .gitignore file for this repo filters out .csv files that
|
|
# you have created above so that they will not be uploaded to Github.
|
|
# you can always edit the .gitignore file if you want to change how
|
|
# things are handled.
|
|
|
|
#do you want to see how your state compares to other states? Try this:
|
|
|
|
|
|
# 2023 funding per state for US organizations, largest total first.
#
# NOTE(review): the TOTAL_COST > 1000000 condition is applied to individual
# grant rows before they are summed, so grants of $1M or less are left out of
# each state's total. Confirm this is intended; if the goal was to keep only
# states whose total exceeds $1M, the condition belongs after summarise().
NIH_BY_STATE <- NIH_data_split_year %>%
  filter(
    AWARD_NOTICE_DATE_YEAR == 2023,
    ORG_COUNTRY == "UNITED STATES",
    TOTAL_COST > 1000000
  ) %>%
  group_by(ORG_STATE) %>%
  summarise(TOTAL_COST = sum(TOTAL_COST, na.rm = TRUE)) %>%
  arrange(desc(TOTAL_COST))
|
|
|
|
# Export the per-state totals. here::here() builds the path from the project
# root, so this works on any machine rather than only where one particular
# user's home directory exists.
write.csv(
  NIH_BY_STATE,
  here::here("NIH_BY_STATE_2023.csv"),
  row.names = FALSE
)
|
|
|
|
|
|
# Horizontal bar chart of 2023 NIH funding by state, ordered by funding.
#
# Columns are referenced by bare name inside aes() so they are looked up in
# the data frame given to ggplot(); writing df$col there bypasses the layer
# data. reorder() sorts the states by TOTAL_COST so the bars appear in order
# of funding once the axes are flipped.
plot_NIH_by_state <- ggplot(
  NIH_BY_STATE,
  aes(x = reorder(ORG_STATE, TOTAL_COST), y = TOTAL_COST)
) +
  # geom_col() plots the y values as given (same as geom_bar with
  # stat = "identity").
  geom_col(fill = "dodgerblue") +
  # Label the value axis in millions of dollars.
  scale_y_continuous(
    labels = scales::dollar_format(scale = .000001, suffix = "M")
  ) +
  #geom_text(aes(label = signif(TOTAL_COST)), nudge_y = 3) +
  coord_flip() +
  labs(x="", y="") +
  ggtitle(
    "NIH Funding By US State/Territory, 2023",
    subtitle = "Data from NIH RePORTER"
  )

plot_NIH_by_state + theme_light()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|