Batting Fun!
Baseball Data with R
CSJP
Baseball Data Sources
# Loading data
batting_df = tbl_df(Batting)
person_df = tbl_df(Master)
series_df = tbl_df(SeriesPost)
teams_df = tbl_df(Teams)
pitching_df = tbl_df(Pitching)
about Home Runs …
# Top 5 all-time players in MLB history who have hit most home runs
batting_df %>%
count(playerID, wt = HR, sort = TRUE) %>%
head(5) %>%
left_join(
person_df %>%
select(playerID, nameFirst, nameLast),
by='playerID') %>%
select('First Name'=nameFirst, 'Last Name'=nameLast,
'Total HRs'=n) %>%
kable(row.names=FALSE,
padding = 2, align='c')
First Name
|
Last Name
|
Total HRs
|
Barry
|
Bonds
|
762
|
Hank
|
Aaron
|
755
|
Babe
|
Ruth
|
714
|
Willie
|
Mays
|
660
|
Alex
|
Rodriguez
|
654
|
about Home Runs with Seasons …
# Top 5 all-time players in MLB history who have hit most home runs
# add season information
batting_df %>%
select(playerID, HR, stint) %>%
group_by(playerID) %>%
summarise(Total_HR = sum(HR), Seasons = sum(stint==1L)) %>%
arrange(desc(Total_HR)) %>%
head(5) %>%
left_join(
person_df %>%
select(playerID, nameFirst, nameLast),
by='playerID') %>%
select('First Name'=nameFirst, 'Last Name'=nameLast,
'Total HRs'=Total_HR, 'Seasons'=Seasons) %>%
kable(row.names=FALSE,
padding = 2, align='c')
First Name
|
Last Name
|
Total HRs
|
Seasons
|
Barry
|
Bonds
|
762
|
22
|
Hank
|
Aaron
|
755
|
23
|
Babe
|
Ruth
|
714
|
22
|
Willie
|
Mays
|
660
|
22
|
Alex
|
Rodriguez
|
654
|
20
|
about Scored Runs …
# Top 5 all-time players in MLB history who have most scored runs
batting_df %>%
count(playerID, wt = R, sort = TRUE) %>%
head(5) %>%
left_join(
person_df %>%
select(playerID, nameFirst, nameLast),
by='playerID') %>%
select('First Name'=nameFirst, 'Last Name'=nameLast,
'Total Runs'=n) %>%
kable(row.names=FALSE,
padding = 2, align='c')
First Name
|
Last Name
|
Total Runs
|
Rickey
|
Henderson
|
2295
|
Ty
|
Cobb
|
2246
|
Barry
|
Bonds
|
2227
|
Hank
|
Aaron
|
2174
|
Babe
|
Ruth
|
2174
|
about Batting Average …
# Top 5 all-time players in MLB history who have highest batting average
# under the conditions - show up more than 1000 games and 1000 at bats
batting_df %>%
select(playerID, H, AB, G) %>%
group_by(playerID) %>%
summarise(
Games = sum(G, na.rm=TRUE),
AtBats = sum(AB, na.rm=TRUE),
BA = round((sum(H, na.rm=TRUE) / sum(AB, na.rm=TRUE)), 3)
) %>%
filter(Games > 1000, AtBats > 1000) %>%
select(playerID, BA) %>%
arrange(desc(BA)) %>%
head(5) %>%
left_join(
person_df %>%
select(playerID, nameFirst, nameLast),
by='playerID') %>%
select('First Name'=nameFirst, 'Last Name'=nameLast,
'Batting Average'=BA) %>%
kable(row.names=FALSE,
padding = 2, align='c')
First Name
|
Last Name
|
Batting Average
|
Ty
|
Cobb
|
0.366
|
Rogers
|
Hornsby
|
0.358
|
Shoeless Joe
|
Jackson
|
0.356
|
Ed
|
Delahanty
|
0.346
|
Tris
|
Speaker
|
0.345
|
about Consecutive Seasons …
# Top 5 all-time players in MLB history who have played most consecutive seasons
batting_df %>%
count(playerID, wt = (stint==1L), sort = TRUE) %>%
# filter(rank(desc(n),ties.method='first') <= 5) %>% # same as row_number()
filter(row_number(desc(n)) <= 5) %>%
left_join(
person_df %>%
select(playerID, nameFirst, nameLast),
by='playerID') %>%
select('First Name'=nameFirst, 'Last Name'=nameLast,
'Consecutive Seasons'=n) %>%
kable(row.names=FALSE,
padding = 2, align='c')
First Name
|
Last Name
|
Consecutive Seasons
|
Cap
|
Anson
|
27
|
Nolan
|
Ryan
|
27
|
Tommy
|
John
|
26
|
Deacon
|
McGuire
|
26
|
Eddie
|
Collins
|
25
|
about Strikeouts …
# Top 5 all-time players in MLB history who have most strikeouts
pitching_df %>%
count(playerID, wt=SO, sort=T) %>%
head(5) %>%
left_join(
person_df %>%
select(playerID, nameFirst, nameLast),
by='playerID') %>%
select('First Name'=nameFirst, 'Last Name'=nameLast,
'Total Strikeouts'=n) %>%
kable(row.names=FALSE,
padding = 2, align='c')
First Name
|
Last Name
|
Total Strikeouts
|
Nolan
|
Ryan
|
5714
|
Randy
|
Johnson
|
4875
|
Roger
|
Clemens
|
4672
|
Steve
|
Carlton
|
4136
|
Bert
|
Blyleven
|
3701
|
about Strikeouts with Seasons …
# Top 5 all-time players in MLB history who have most strikeouts
# add seasons information
pitching_df %>%
select(playerID, SO, stint) %>%
group_by(playerID) %>%
summarise(Total_Strikeouts = sum(SO), Seasons = sum(stint==1L)) %>%
arrange(desc(Total_Strikeouts)) %>%
head(5) %>%
left_join(
person_df %>%
select(playerID, nameFirst, nameLast),
by='playerID') %>%
select('First Name'=nameFirst, 'Last Name'=nameLast,
'Total Strikeouts'=Total_Strikeouts, Seasons) %>%
kable(row.names=FALSE,
padding = 2, align='c')
First Name
|
Last Name
|
Total Strikeouts
|
Seasons
|
Nolan
|
Ryan
|
5714
|
27
|
Randy
|
Johnson
|
4875
|
22
|
Roger
|
Clemens
|
4672
|
24
|
Steve
|
Carlton
|
4136
|
24
|
Bert
|
Blyleven
|
3701
|
22
|
about Teams in 2014 …
# How many teams are in MLB in 2014?
teams_df %>%
filter(yearID==2014) %>%
select(teamID) %>%
distinct() %>%
summarize('Total Teams'=n()) %>%
kable(row.names=FALSE,
padding = 2, align='c')
about Champions with Leagues …
# How many World Series Titles have been won by each League?
series_df %>%
filter(round=='WS') %>%
group_by(League=lgIDwinner) %>%
summarize('Champions'=n()) %>%
kable(row.names=FALSE,
padding = 2, align='c')
League
|
Champions
|
AA
|
1
|
AL
|
63
|
NL
|
53
|
about Champions with Teams …
# Top 15 Teams in MLB history have won the most World Series Titles
series_df %>%
filter(round=='WS') %>%
group_by(teamIDwinner) %>%
summarize(Champs=n(), FirstYear=min(yearID),
LastYear=max(yearID)) %>%
arrange(desc(Champs)) %>%
head(15) %>%
left_join (
teams_df %>%
select(yearID, teamID, name),
by = c("teamIDwinner"="teamID", "LastYear"="yearID")
) %>%
select('Team Name'=name, 'Champions'=Champs,
'First Year'=FirstYear, 'Last Year'=LastYear) %>%
kable(row.names=FALSE,
padding = 2, align='c')
Team Name
|
Champions
|
First Year
|
Last Year
|
New York Yankees
|
27
|
1923
|
2009
|
St. Louis Cardinals
|
11
|
1926
|
2011
|
Boston Red Sox
|
8
|
1903
|
2013
|
Cincinnati Reds
|
5
|
1919
|
1990
|
Los Angeles Dodgers
|
5
|
1959
|
1988
|
New York Giants
|
5
|
1905
|
1954
|
Philadelphia Athletics
|
5
|
1910
|
1930
|
Pittsburgh Pirates
|
5
|
1909
|
1979
|
Detroit Tigers
|
4
|
1935
|
1984
|
Oakland Athletics
|
4
|
1972
|
1989
|
Baltimore Orioles
|
3
|
1966
|
1983
|
Chicago White Sox
|
3
|
1906
|
2005
|
San Francisco Giants
|
3
|
2010
|
2014
|
Brooklyn Dodgers
|
2
|
1890
|
1955
|
Chicago Cubs
|
2
|
1907
|
1908
|