Web Scraping Academia Institute’s Grant Fundings using R

blog

Web Scrapping

How to use R for web scrapping

Author

Jihong Zhang

Published

May 12, 2022

This is an example of how to web scrape grants of active research in college’s official website. Please follow the academia institute’s website robots rules.

⌘+C

library(XML)
library(stringr)
library(rvest)
library(tidyverse)
library(kableExtra)

1 Web Scrapping

⌘+C

topics = rep(NA, 84)
fundings = rep(NA, 84)
iter = 0
for (page in 1:9) {
  ## parent webpage
  scrape_url <- paste0('http://XXXXXXXXXXXXXXXXXXXXXX', page)
  
  html_form_page <- read_html(scrape_url)
  
  ## find the child web page containing projects' name, total award, topic etc.
  child_url = html_form_page |> html_elements("h3 a[href]") |> html_attr("href") 
  
  for (item in 1:length(child_url)) {
    iter = iter + 1
    child_html_text <- child_url[item] |> read_html() |> html_elements("div[class='study_wrapper']") |> html_text() 
    topic = child_html_text |> str_extract(pattern = "Topic\\(s\\)\\: [a-zA-Z]+\\b") |> str_replace(pattern = "Topic\\(s\\)\\: ", "")
    funding = child_html_text |> str_extract(pattern = "\\$\\d+\\,\\d+") |> str_replace_all(pattern = "\\$|\\,", "") |> as.numeric()
    topics[iter] = topic
    fundings[iter] = funding
  }
}

dat <- data.frame(topic = topics, funding_amount = fundings) |> 
  add_row(topic = "Marijuana", funding_amount = 3743) |> 
  mutate(topic = ifelse(topic == "TobaccoMarijuana", "Tobacco", topic))

topic	funding_amount
Other	97666
Other	15000
Tobacco	1271
Cancer	NA
Other	199836
Other	64614
Cancer	200000
Other	75000
Other	72972
Cancer	329234

2 Visualization in ggplot2

⌘+C

## funding per project
dat1 <- dat |> 
  group_by(topic) |> 
  summarise(funding_amount_mean = mean(funding_amount, na.rm = T)) |> 
  mutate(topic = fct_reorder(topic, desc(funding_amount_mean)))

ggplot(dat1) +
  aes(x = topic, y = funding_amount_mean) +
  geom_col(fill = "darkblue") +
  scale_y_continuous(labels = scales::unit_format(unit = "M", scale = 1e-6)) +
  labs(y = "funding per project", title = "Funding for each project during 2019 to 2022") +
  theme(legend.position = "none", text = element_text(size = 12)) # remove lengend

⌘+C

## Total funding amount
dat2 <- dat |> 
  group_by(topic) |> 
  summarise(
    funding_amount_sum = sum(funding_amount, na.rm = T), 
    n = n()) |> 
  mutate(
    topic = fct_reorder(topic, desc(funding_amount_sum)),
    highlight = ifelse(funding_amount_sum == max(funding_amount_sum), 1, 0) |> as.factor())

ggplot(dat2) +
  aes(x = topic, y = funding_amount_sum, fill = highlight) +
  geom_col() +
  geom_text(aes(label = round(funding_amount_sum/ 10^6, 3)), vjust = 0.001, size = 5) +
  geom_label(aes(label = n), vjust = 0.999, size = 5, color = "white") +
  scale_fill_manual(values = c("darkblue", "red2")) +
  labs(x = "", y = "total amount of funding", 
       title = "Total Amount of Funding and Number of Grant Projects during 2019-2022", 
       subtitle = "Active research at Health Promotion Center from 2019 to 2022",
       caption = "source: https://healthpromotionresearch.org/Active-Studies/") +
  scale_y_continuous(labels = scales::unit_format(unit = "M", scale = 1e-6)) +
  theme(legend.position = "none", text = element_text(size = 10)) # remove lengend

--- title: Web Scraping Academia Institute's Grant Fundings using R description: "How to use R for web scrapping" author: 'Jihong Zhang' date: '2022-05-12' slug: web-scrapping-grant-fundings-in-r categories: - blog - Web Scrapping tags: [] subtitle: '' summary: '' lastmod: '2022-05-12T21:38:27-05:00' featured: no projects: [] --- This is an example of how to web scrape grants of active research in college's official website. Please follow the academia institute's website robots rules. ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE, fig.align = "center") ``` ```{r, message=FALSE, warning=FALSE} library(XML) library(stringr) library(rvest) library(tidyverse) library(kableExtra) ``` ## Web Scrapping ```{r, eval=FALSE} topics = rep(NA, 84) fundings = rep(NA, 84) iter = 0 for (page in 1:9) { ## parent webpage scrape_url <- paste0('http://XXXXXXXXXXXXXXXXXXXXXX', page) html_form_page <- read_html(scrape_url) ## find the child web page containing projects' name, total award, topic etc. child_url = html_form_page |> html_elements("h3 a[href]") |> html_attr("href") for (item in 1:length(child_url)) { iter = iter + 1 child_html_text <- child_url[item] |> read_html() |> html_elements("div[class='study_wrapper']") |> html_text() topic = child_html_text |> str_extract(pattern = "Topic\$s\$\\: [a-zA-Z]+\\b") |> str_replace(pattern = "Topic\$s\$\\: ", "") funding = child_html_text |> str_extract(pattern = "\\$\\d+\\,\\d+") |> str_replace_all(pattern = "\\$|\\,", "") |> as.numeric() topics[iter] = topic fundings[iter] = funding } } dat <- data.frame(topic = topics, funding_amount = fundings) |> add_row(topic = "Marijuana", funding_amount = 3743) |> mutate(topic = ifelse(topic == "TobaccoMarijuana", "Tobacco", topic)) ``` ```{r, cache=TRUE, echo=FALSE} topics = rep(NA, 84) fundings = rep(NA, 84) iter = 0 for (page in 1:9) { scrape_url <- paste0('https://healthpromotionresearch.org/Active-Studies/PgrID/92890/PageID/', page) html_form_page <- read_html(scrape_url) ## find the child web page containing projects' name, total award, topic etc. child_url = html_form_page |> html_elements("h3 a[href]") |> html_attr("href") for (item in 1:length(child_url)) { iter = iter + 1 child_html_text <- child_url[item] |> read_html() |> html_elements("div[class='study_wrapper']") |> html_text() topic = child_html_text |> str_extract(pattern = "Topic\$s\$\\: [a-zA-Z]+\\b") |> str_replace(pattern = "Topic\$s\$\\: ", "") funding = child_html_text |> str_extract(pattern = "\\$\\d+\\,\\d+") |> str_replace_all(pattern = "\\$|\\,", "") |> as.numeric() topics[iter] = topic fundings[iter] = funding } } dat <- data.frame(topic = topics, funding_amount = fundings) |> add_row(topic = "Marijuana", funding_amount = 3743) |> mutate(topic = ifelse(topic == "TobaccoMarijuana", "Tobacco", topic)) ``` ```{r, echo=FALSE} kable(dat[1:10,], format = "html") |> kable_material() ``` ## Visualization in ggplot2 ```{r} ## funding per project dat1 <- dat |> group_by(topic) |> summarise(funding_amount_mean = mean(funding_amount, na.rm = T)) |> mutate(topic = fct_reorder(topic, desc(funding_amount_mean))) ggplot(dat1) + aes(x = topic, y = funding_amount_mean) + geom_col(fill = "darkblue") + scale_y_continuous(labels = scales::unit_format(unit = "M", scale = 1e-6)) + labs(y = "funding per project", title = "Funding for each project during 2019 to 2022") + theme(legend.position = "none", text = element_text(size = 12)) # remove lengend ``` ```{r} ## Total funding amount dat2 <- dat |> group_by(topic) |> summarise( funding_amount_sum = sum(funding_amount, na.rm = T), n = n()) |> mutate( topic = fct_reorder(topic, desc(funding_amount_sum)), highlight = ifelse(funding_amount_sum == max(funding_amount_sum), 1, 0) |> as.factor()) ggplot(dat2) + aes(x = topic, y = funding_amount_sum, fill = highlight) + geom_col() + geom_text(aes(label = round(funding_amount_sum/ 10^6, 3)), vjust = 0.001, size = 5) + geom_label(aes(label = n), vjust = 0.999, size = 5, color = "white") + scale_fill_manual(values = c("darkblue", "red2")) + labs(x = "", y = "total amount of funding", title = "Total Amount of Funding and Number of Grant Projects during 2019-2022", subtitle = "Active research at Health Promotion Center from 2019 to 2022", caption = "source: https://healthpromotionresearch.org/Active-Studies/") + scale_y_continuous(labels = scales::unit_format(unit = "M", scale = 1e-6)) + theme(legend.position = "none", text = element_text(size = 10)) # remove lengend ```