Web scraping (also called web harvesting or web data extraction) is a software technique for extracting information from websites; see http://en.wikipedia.org/wiki/Web_scraping.
Craigslist website
http://sfbay.craigslist.org/search/jjj?query=data+Analyst
CRAN website
http://cran.r-project.org/web/packages/available_packages_by_date.html
# Fetch the Craigslist (SF Bay Area) search-results page for the query
# "data Analyst" and keep the parsed page in `crag`.
# NOTE(review): later rvest releases replace html() with read_html() --
# confirm against the installed rvest version before changing this call.
crag <- html("http://sfbay.craigslist.org/search/jjj?query=data+Analyst")
# Collect the text of every node matching the CSS selector ".row"
# (the job rows of the result page; the original notes expect 100 of them).
all <- html_text(html_nodes(crag, ".row"))
# Collect the "href" attribute of every node matching ".hdrlnk"
# (the hyperlinks of the job rows on the result page).
lnks <- html_attr(html_nodes(crag, ".hdrlnk"), "href")
# Clean each row: move the first "<letters> <digits>" token (e.g. "Jan 16",
# presumably the posting date) to the front of the row text.
#
# `\\b` is used for the word boundaries because it is understood by both the
# base-R and the ICU regex engines; `\\<` is not a word boundary under ICU
# (the engine behind newer stringr releases), where the pattern would no
# longer match. Before a letter or digit, `\\b` is equivalent to `\\<`.
date_pattern <- "\\b[a-zA-Z]+ \\b[0-9]+"
# tt: the extracted token (NA where a row has none)
tt <- str_extract(all, date_pattern)
# tk: the row text with that token removed (unchanged where there is none)
tk <- str_replace(all, date_pattern, "")
# Rows without a token would become NA when pasted with an NA prefix;
# keep their original text instead of losing the listing.
no_date <- is.na(tt)
all <- str_c(tt, " ", tk)
all[no_date] <- tk[no_date]
# Combine the row text and the absolute listing URLs into one table.
# The data frame is built directly from the two vectors instead of going
# through a cbind() matrix, and the length check stops early if the number
# of rows and links disagree (cbind() would recycle the shorter vector with
# at most a warning, misaligning jobs and links).
stopifnot(length(all) == length(lnks))
jobs <- tbl_df(
  data.frame(
    Job = all,
    Hlink = str_c("http://sfbay.craigslist.org", lnks),
    stringsAsFactors = FALSE
  )
)
# Inspect the column types and first values of the result
glimpse(jobs)
## Variables:
## $ Job (chr) "Jan 16 Research Data Management Service Design Ana...
## $ Hlink (chr) "http://sfbay.craigslist.org/eby/tch/4850217617.html", "...
# Export the job table to ./CJobs.xls on a sheet named "Today's DA Jobs",
# without writing the row names.
write.xlsx2(
  x = data.frame(jobs),
  file = "./CJobs.xls",
  sheetName = "Today's DA Jobs",
  row.names = FALSE
)
# Fetch the Comprehensive R Archive Network (CRAN) page that lists the
# available R packages by date and keep the parsed page in `cran`.
# NOTE(review): later rvest releases replace html() with read_html() --
# confirm against the installed rvest version before changing this call.
cran <- html(
  "http://cran.r-project.org/web/packages/available_packages_by_date.html"
)
# Read the first <table> of the CRAN page into a data frame, treating its
# first row as the header, and wrap it with tbl_df() for more readable
# printing. `TRUE` is spelled out because `T` is an ordinary variable that
# can be reassigned.
pkgs <- cran %>%
  html_node("table") %>%
  html_table(header = TRUE) %>%
  tbl_df()
# Clean the package table:
#   * Date  -> converted from text to Date
#   * Hlink -> absolute URL of each package page, built from the links inside
#              the package table ("../.." is stripped from each href and the
#              CRAN host is prepended).
# The anchors are taken from the same table node the data came from, not from
# the whole document, so a link elsewhere on the page cannot shift or break
# the row alignment.
pkgs <- pkgs %>%
  mutate(
    Date = as.Date(Date),
    Hlink = cran %>%
      html_node("table") %>%
      html_nodes("a") %>%
      html_attr("href") %>%
      str_replace_all('\\.\\.\\/\\.\\.', "") %>%
      str_c("http://cran.r-project.org", .)
  )
# Inspect the column types and first values of the result
glimpse(pkgs)
## Variables:
## $ Date (date) 2015-01-16, 2015-01-16, 2015-01-16, 2015-01-16, 2015-...
## $ Package (chr) "AICcmodavg", "anfis", "batchmeans", "bayesTFR", "bcoo...
## $ Title (chr) "Model Selection and Multimodel Inference Based on (Q)...
## $ Hlink (chr) "http://cran.r-project.org/web/packages/AICcmodavg/ind...
# How many R packages are available to date? Print the table: the dimensions
# in its header give the total number of packages listed.
pkgs
## Source: local data frame [6,200 x 4]
##
## Date Package
## 1 2015-01-16 AICcmodavg
## 2 2015-01-16 anfis
## 3 2015-01-16 batchmeans
## 4 2015-01-16 bayesTFR
## 5 2015-01-16 bcool
## 6 2015-01-16 caretEnsemble
## 7 2015-01-16 chemometrics
## 8 2015-01-16 disclapmix
## 9 2015-01-16 frbs
## 10 2015-01-16 fwsim
## .. ... ...
## Variables not shown: Title (chr), Hlink (chr)
# Export the package table to ./RPkgs.xls on a sheet named
# "UptoDated Packages", without writing the row names.
write.xlsx2(
  x = data.frame(pkgs),
  file = "./RPkgs.xls",
  sheetName = "UptoDated Packages",
  row.names = FALSE
)
# Count how many packages are listed for each date, ordered by date
counts <- pkgs %>%
  group_by(Date) %>%
  summarise(pkgs_Daycount = n()) %>%
  arrange(Date)
# Inspect the column types and first values of the daily counts
glimpse(counts)
## Variables:
## $ Date (date) 2005-10-29, 2006-03-15, 2006-03-30, 2006-05-24,...
## $ pkgs_Daycount (int) 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, ...
# Plot the number of packages per day (y) against the date (x)
qplot(x = Date, y = pkgs_Daycount, data = counts)
# Find the date with the largest number of packages: count the rows per
# date (sorted by count) and keep the top entry. `TRUE` is spelled out
# because `T` is an ordinary variable that can be reassigned.
pkgs %>%
  group_by(Date) %>%
  tally(sort = TRUE) %>%
  top_n(1)
## Source: local data frame [1 x 2]
##
## Date n
## 1 2012-10-29 209
## R version 3.1.2 (2014-10-31)
## Platform: i386-w64-mingw32/i386 (32-bit)
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] xlsx_0.5.7 xlsxjars_0.6.1 rJava_0.9-6
## [4] vcd_1.3-2 scales_0.2.4 ROCR_1.0-5
## [7] reshape2_1.4.1 RColorBrewer_1.1-2 randomForest_4.6-10
## [10] plyr_1.8.1 nnet_7.3-8 MASS_7.3-35
## [13] KernSmooth_2.23-13 gtools_3.4.1 gplots_2.15.0
## [16] gdata_2.13.3 foreach_1.4.2 e1071_1.6-4
## [19] colorspace_1.2-4 cluster_1.15.3 class_7.3-11
## [22] caTools_1.17.1 caret_6.0-37 lattice_0.20-29
## [25] boot_1.3-13 ada_2.0-3 rpart_4.1-8
## [28] dplyr_0.3.0.2 ggplot2_1.0.0 stringr_0.6.2
## [31] rvest_0.2.0 knitr_1.8
##
## loaded via a namespace (and not attached):
## [1] assertthat_0.1 bitops_1.0-6 BradleyTerry2_1.0-5
## [4] brglm_0.5-9 car_2.0-22 codetools_0.2-9
## [7] DBI_0.3.1 digest_0.6.7 evaluate_0.5.5
## [10] formatR_1.0 gtable_0.1.2 htmltools_0.2.6
## [13] httr_0.6.1 iterators_1.0.7 labeling_0.3
## [16] lazyeval_0.1.9 lme4_1.1-7 magrittr_1.5
## [19] Matrix_1.1-4 minqa_1.2.4 munsell_0.4.2
## [22] nlme_3.1-118 nloptr_1.0.4 parallel_3.1.2
## [25] proto_0.3-10 Rcpp_0.11.3 RCurl_1.95-4.5
## [28] rmarkdown_0.3.11 selectr_0.2-3 splines_3.1.2
## [31] tools_3.1.2 XML_3.98-1.1 yaml_2.1.13