Scrape Fun!

Web Scraping with R

CSJP

What’s Web Scraping …

Definition -

Web scraping (also called web harvesting or web data extraction) is a software technique for extracting information from websites; see http://en.wikipedia.org/wiki/Web_scraping.

Scraping Something Now …

Case Studies

Case One - Scraping Craigslist

Code and Results

# Fetch and parse the Craigslist (SF Bay) search results for "data Analyst" jobs
# NOTE(review): html() is the parser in the rvest release this deck was built
# with; newer rvest releases provide read_html() instead -- confirm before
# running against an upgraded rvest.
crag <- html('http://sfbay.craigslist.org/search/jjj?query=data+Analyst')

# Text content of every node carrying the CSS class "row" (the job rows)
all <- html_text(html_nodes(crag, '.row'))

# href attribute of every node carrying the CSS class "hdrlnk" (the job links)
lnks <- html_attr(html_nodes(crag, '.hdrlnk'), 'href')

# Cleansing data for each row: move the posting date (letters, a space, then
# digits -- e.g. "Jan 16") to the front of the row text.
#
# \\b is used instead of the \\< word-start anchor: at these two positions it
# matches the same boundary, and it is understood both by base R regular
# expressions and by the ICU engine behind newer stringr releases, where \\<
# is not a word anchor.
date_pattern <- "\\b[a-zA-Z]+ \\b[0-9]+"

# Row text with the first date occurrence removed (unchanged when no date)
tk <- str_replace(all, date_pattern, "")
# The date itself; NA for rows in which no date was found
tt <- str_extract(all, date_pattern)

# Prefix the date only where one was found, so rows without a date keep their
# text instead of collapsing to NA (str_c() propagates NA).
has_date <- !is.na(tt)
all <- tk
all[has_date] <- str_c(tt[has_date], " ", tk[has_date])

# Merge data together: one record per job row with its text and absolute URL.
# The two vectors come from separate selectors, so fail loudly if they do not
# line up instead of letting a shorter vector be recycled against the other.
stopifnot(length(all) == length(lnks))

# Build the columns directly; going through cbind() first would create an
# intermediate character matrix and recycle mismatched lengths.
jobs <- tbl_df(data.frame(
  Job = all,
  Hlink = str_c("http://sfbay.craigslist.org", lnks),
  stringsAsFactors = FALSE
))

# Read data in structure
glimpse(jobs)
## Variables:
## $ Job   (chr) "Jan 16      Research Data Management Service Design Ana...
## $ Hlink (chr) "http://sfbay.craigslist.org/eby/tch/4850217617.html", "...
# Output the scraped DA jobs into an Excel file (as a plain data frame)
write.xlsx2(as.data.frame(jobs), "./CJobs.xls",
            sheetName = "Today's DA Jobs",
            row.names = FALSE)

Case Two - Scraping CRAN

Code and Results

# Download and parse the CRAN "available packages by date" page
cran <- html('http://cran.r-project.org/web/packages/available_packages_by_date.html')

# Parse the first <table> on the page, using its first row as the header,
# and wrap the result in a tbl_df for more readable printing
pkgs <- tbl_df(html_table(html_node(cran, "table"), header = TRUE))

# Cleansing data for each row:
#   Date  - converted to class Date
#   Hlink - absolute URL built from the anchor href found in each table row
#
# The anchors are taken from the same <table> node that produced `pkgs`, not
# from the whole page, so links elsewhere on the page cannot shift or break
# the row-to-link alignment.
pkgs <- pkgs %>%
  mutate(Date = as.Date(Date),
         Hlink =
           cran %>%
           html_node("table") %>%
           html_nodes("a") %>%
           html_attr("href") %>%
           # strip the relative "../.." prefix before prepending the host
           str_replace_all(., '\\.\\.\\/\\.\\.', '') %>%
           str_c("http://cran.r-project.org", .)
  )

# Inspect the column types and first values
glimpse(pkgs)
## Variables:
## $ Date    (date) 2015-01-16, 2015-01-16, 2015-01-16, 2015-01-16, 2015-...
## $ Package (chr) "AICcmodavg", "anfis", "batchmeans", "bayesTFR", "bcoo...
## $ Title   (chr) "Model Selection and Multimodel Inference Based on (Q)...
## $ Hlink   (chr) "http://cran.r-project.org/web/packages/AICcmodavg/ind...
# How many R packages are available to date? (the printed header shows the row count)
print(pkgs)
## Source: local data frame [6,200 x 4]
## 
##          Date       Package
## 1  2015-01-16    AICcmodavg
## 2  2015-01-16         anfis
## 3  2015-01-16    batchmeans
## 4  2015-01-16      bayesTFR
## 5  2015-01-16         bcool
## 6  2015-01-16 caretEnsemble
## 7  2015-01-16  chemometrics
## 8  2015-01-16    disclapmix
## 9  2015-01-16          frbs
## 10 2015-01-16         fwsim
## ..        ...           ...
## Variables not shown: Title (chr), Hlink (chr)
# Write the package table to an Excel file
write.xlsx2(
  x = data.frame(pkgs),
  file = "./RPkgs.xls",
  sheetName = "UptoDated Packages",
  row.names = FALSE
)

# Number of packages per date, ordered by date
counts <- arrange(
  summarise(group_by(pkgs, Date), pkgs_Daycount = n()),
  Date
)

# Inspect the column types and first values
glimpse(counts)
## Variables:
## $ Date          (date) 2005-10-29, 2006-03-15, 2006-03-30, 2006-05-24,...
## $ pkgs_Daycount (int) 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, ...
# Plot the per-date package counts against the date
qplot(x = Date, y = pkgs_Daycount, data = counts)

# Find the date with the largest number of packages: tally rows per date
# (sorted by count) and keep the top entry
top_n(tally(group_by(pkgs, Date), sort = TRUE), 1)
## Source: local data frame [1 x 2]
## 
##         Date   n
## 1 2012-10-29 209

My Session Info. …

## R version 3.1.2 (2014-10-31)
## Platform: i386-w64-mingw32/i386 (32-bit)
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] xlsx_0.5.7          xlsxjars_0.6.1      rJava_0.9-6        
##  [4] vcd_1.3-2           scales_0.2.4        ROCR_1.0-5         
##  [7] reshape2_1.4.1      RColorBrewer_1.1-2  randomForest_4.6-10
## [10] plyr_1.8.1          nnet_7.3-8          MASS_7.3-35        
## [13] KernSmooth_2.23-13  gtools_3.4.1        gplots_2.15.0      
## [16] gdata_2.13.3        foreach_1.4.2       e1071_1.6-4        
## [19] colorspace_1.2-4    cluster_1.15.3      class_7.3-11       
## [22] caTools_1.17.1      caret_6.0-37        lattice_0.20-29    
## [25] boot_1.3-13         ada_2.0-3           rpart_4.1-8        
## [28] dplyr_0.3.0.2       ggplot2_1.0.0       stringr_0.6.2      
## [31] rvest_0.2.0         knitr_1.8          
## 
## loaded via a namespace (and not attached):
##  [1] assertthat_0.1      bitops_1.0-6        BradleyTerry2_1.0-5
##  [4] brglm_0.5-9         car_2.0-22          codetools_0.2-9    
##  [7] DBI_0.3.1           digest_0.6.7        evaluate_0.5.5     
## [10] formatR_1.0         gtable_0.1.2        htmltools_0.2.6    
## [13] httr_0.6.1          iterators_1.0.7     labeling_0.3       
## [16] lazyeval_0.1.9      lme4_1.1-7          magrittr_1.5       
## [19] Matrix_1.1-4        minqa_1.2.4         munsell_0.4.2      
## [22] nlme_3.1-118        nloptr_1.0.4        parallel_3.1.2     
## [25] proto_0.3-10        Rcpp_0.11.3         RCurl_1.95-4.5     
## [28] rmarkdown_0.3.11    selectr_0.2-3       splines_3.1.2      
## [31] tools_3.1.2         XML_3.98-1.1        yaml_2.1.13