# NIH_Data/NIH_Grants.R

# get in we're counting grants
# for science

# data source: https://reporter.nih.gov/exporter/projects
# data dictionary: https://report.nih.gov/exporter-data-dictionary

# let's load way too many packages
pacman::p_load(dplyr, tidyr, janitor, tidyr, ggplot2, rio, here, geofacet, DT, RColorBrewer, ggiraph, readr, forcats)
pacman::p_load(leaflet, glue, sf, tmap, tmaptools, tidycensus, ggmap, htmltools, htmlwidgets)
pacman::p_load_gh(c("walkerke/tigris", "bhaskarvk/leaflet.extras"))


# Load every annual file and stack them into one data frame.
#
# The path below has to be the folder with the annual CSV files in it.
# If you don't know the full path to that folder, navigate to it in Terminal
# and type "pwd". You can copy and paste that path below -
# this is just sample code and won't work without your path.
#
# full.names = TRUE makes list.files() return complete paths instead of bare
# file names, so read_csv() can find the files no matter what the current
# working directory is. That is why no setwd() call is needed here.

NIH_Data <- list.files(path = '/path/to/my/data', full.names = TRUE) %>%
  lapply(read_csv) %>%
  bind_rows()

# quick sanity checks on the combined data

is.data.frame(NIH_Data)
# should return TRUE

head(NIH_Data)
# should print the first few rows


# Build a data frame holding only the rows whose ORG_STATE matches your state.
# Replace MYSTATE_ABBREVIATION with your state's 2 letter abbreviation
# (CO, MN, etc.) throughout this script.

MYSTATE_ABBREVIATION_NIH_data <- filter(
  NIH_Data,
  ORG_STATE == "MYSTATE_ABBREVIATION"
)

is.data.frame(MYSTATE_ABBREVIATION_NIH_data)
# returns TRUE

# To make bar charts showing funding by year, it is more convenient to have
# a column holding just the year. separate() replaces AWARD_NOTICE_DATE with
# the three columns named below.

NIH_data_split_year <- separate(
  data = NIH_Data,
  col = AWARD_NOTICE_DATE,
  into = c(
    "AWARD_NOTICE_DATE_YEAR",
    "AWARD_NOTICE_DATE_MONTH",
    "AWARD_NOTICE_DATE_DAY"
  )
)

# same split for the state-only data
MYSTATE_ABBREVIATION_NIH_data_split_year <- separate(
  data = MYSTATE_ABBREVIATION_NIH_data,
  col = AWARD_NOTICE_DATE,
  into = c(
    "AWARD_NOTICE_DATE_YEAR",
    "AWARD_NOTICE_DATE_MONTH",
    "AWARD_NOTICE_DATE_DAY"
  )
)

# ok that worked

# Next step towards a bar chart of grant value by year: keep only the award
# years 2019 through 2024, then group the rows by year.

MYSTATE_ABBREVIATION_NIH_data_split_year <-
  MYSTATE_ABBREVIATION_NIH_data_split_year %>%
  filter(
    AWARD_NOTICE_DATE_YEAR >= 2019,
    AWARD_NOTICE_DATE_YEAR <= 2024
  ) %>%
  group_by(AWARD_NOTICE_DATE_YEAR)

# open the result in the data viewer to check that it worked
View(MYSTATE_ABBREVIATION_NIH_data_split_year)


# let's plot a graph of funding to your state from NIH, by award year
# (2019 through 2024).
# geom_col() stacks one bar segment per row, so each bar's height is the sum
# of TOTAL_COST for that year. geom_col() has no stat argument (it always
# uses the data values as bar heights), so none is passed here.

plot_MYSTATE_ABBREVIATION_NIH_data <- ggplot(
  MYSTATE_ABBREVIATION_NIH_data_split_year,
  aes(x = AWARD_NOTICE_DATE_YEAR, y = TOTAL_COST)
) +
  geom_col(fill = "dodgerblue") +
  # label the y axis in billions of dollars
  scale_y_continuous(labels = scales::dollar_format(scale = .000000001, suffix = "B")) +
  xlab("NIH research funding to MYSTATE institutions") +
  ylab("")

# now let's theme it and display it

plot_MYSTATE_ABBREVIATION_NIH_data + theme_light()
# this should generate a bar chart showing NIH project funding to your state


# Which organizations in your state get the most funding?
# Take the 2023 rows and add up TOTAL_COST per organization
# (missing costs are skipped by na.rm = TRUE).
MYSTATE_ABBREVIATION_NIH_grants_2023_totals_by_org <-
  MYSTATE_ABBREVIATION_NIH_data_split_year %>%
  filter(AWARD_NOTICE_DATE_YEAR == 2023) %>%
  group_by(ORG_NAME) %>%
  summarise(TOTAL_COST = sum(TOTAL_COST, na.rm = TRUE))

View(MYSTATE_ABBREVIATION_NIH_grants_2023_totals_by_org)

# the above produces a table of NIH grantees, where the TOTAL_COST
# column is the sum of each grantee's grants.

# now sort it, largest total first, so we can plot it

MYSTATE_ABBREVIATION_NIH_grants_by_org_2023_desc <- arrange(
  MYSTATE_ABBREVIATION_NIH_grants_2023_totals_by_org,
  desc(TOTAL_COST)
)

View(MYSTATE_ABBREVIATION_NIH_grants_by_org_2023_desc)

# this should create a table of grant funding to your state's orgs sorted in
# descending order by how much $ each institution got


# If your data frame has too many institutions to graph practically, you can
# keep only the institutions that received at least a certain dollar amount
# of funding (here, $30 million).

TOP_MYSTATE_ABBREVIATION_big_grantees_desc <-
  MYSTATE_ABBREVIATION_NIH_grants_by_org_2023_desc %>%
  filter(TOTAL_COST >= 3e7)

View(TOP_MYSTATE_ABBREVIATION_big_grantees_desc)

# let's plot the top grantees
# Columns are referred to by bare name inside aes(): ggplot2 looks them up in
# the data frame passed to ggplot(), and it discourages the df$column form.
# reorder() sorts the organizations by TOTAL_COST so the bars appear in order.
plot_top_grantees <- ggplot(
  TOP_MYSTATE_ABBREVIATION_big_grantees_desc,
  aes(x = reorder(ORG_NAME, TOTAL_COST), y = TOTAL_COST)
) +
  geom_bar(stat = "identity", fill = "dodgerblue") +
  # label the value axis in millions of dollars
  scale_y_continuous(labels = scales::dollar_format(scale = .000001, suffix = "M")) +
  #geom_text(aes(label = signif(TOTAL_COST)), nudge_y = 3) +
  # flip the axes so the organization names read horizontally
  coord_flip() +
  labs(x = "", y = "Top NIH grantees in [MY STATE NAME] by grant funding, 2023")

plot_top_grantees
# that produces a chart showing institutions in your state receiving NIH funding,
# in descending order by how much $ each institution got

# let's export the top grantees table (TOP_MYSTATE_ABBREVIATION_big_grantees_desc)
# to a CSV file. Replace the path with the location where you want the file saved.
write.csv(
  TOP_MYSTATE_ABBREVIATION_big_grantees_desc,
  "/path/to/data/MYSTATE_ABBREVIATION_BIG_NIH_GRANTS.csv",
  row.names = FALSE
)

# You can also build a datatable that is sortable and searchable:
# take the 2023 rows for your state, grouped by organization.

MYSTATE_ABBREVIATION_NIH_data_2023 <- group_by(
  filter(
    MYSTATE_ABBREVIATION_NIH_data_split_year,
    AWARD_NOTICE_DATE_YEAR == 2023
  ),
  ORG_NAME
)

MYSTATE_ABBREVIATION_NIH_data_2023 %>%
  datatable()

# note that the .gitignore file for this repo filters out .csv files that
# you have created above so that they will not be uploaded to Github.
# you can always edit the .gitignore file if you want to change how
# things are handled.

# Do you want to see how your state compares to other states? Try this:
# add up 2023 TOTAL_COST per state for organizations in the United States,
# sorted with the largest total first.
# NOTE(review): the TOTAL_COST > 1000000 condition removes every individual
# grant of $1 million or less before the totals are computed, so the result
# is a total of large grants only - confirm that this is intended.

NIH_BY_STATE <- NIH_data_split_year %>%
  filter(
    AWARD_NOTICE_DATE_YEAR == 2023,
    ORG_COUNTRY == "UNITED STATES",
    TOTAL_COST > 1000000
  ) %>%
  group_by(ORG_STATE) %>%
  summarize(TOTAL_COST = sum(TOTAL_COST, na.rm = TRUE)) %>%
  arrange(desc(TOTAL_COST))

# Export the table. Replace the path with the location where you want the
# file saved (a path on one person's computer won't exist on yours).
write.csv(NIH_BY_STATE, "/path/to/data/NIH_BY_STATE_2023.csv", row.names = FALSE)


# Horizontal bar chart of the per-state totals in NIH_BY_STATE.
# Columns are referred to by bare name inside aes(): ggplot2 looks them up in
# the data frame passed to ggplot(), and it discourages the df$column form.
# reorder() sorts the states by TOTAL_COST so the bars appear in order.
plot_NIH_by_state <- ggplot(
  NIH_BY_STATE,
  aes(x = reorder(ORG_STATE, TOTAL_COST), y = TOTAL_COST)
) +
  geom_bar(stat = "identity", fill = "dodgerblue") +
  # label the value axis in millions of dollars
  scale_y_continuous(labels = scales::dollar_format(scale = .000001, suffix = "M")) +
  #geom_text(aes(label = signif(TOTAL_COST)), nudge_y = 3) +
  # flip the axes so the state abbreviations run down the side
  coord_flip() +
  labs(x = "", y = "") +
  ggtitle("NIH Funding By US State/Territory, 2023",
          subtitle = "Data from NIH RePORTER")

# theme it and display it
plot_NIH_by_state + theme_light()