Batting Fun!

Baseball Data with R

CSJP

Baseball Data Sources

# Loading data
batting_df = tbl_df(Batting)
person_df = tbl_df(Master)
series_df = tbl_df(SeriesPost)
teams_df = tbl_df(Teams)
pitching_df = tbl_df(Pitching)

about Home Runs …

# Top 5 all-time players in MLB history who have hit most home runs
batting_df %>%  
  count(playerID, wt = HR, sort = TRUE) %>%
  head(5) %>%
  left_join(
    person_df %>%
      select(playerID, nameFirst, nameLast), 
    by='playerID') %>%
  select('First Name'=nameFirst, 'Last Name'=nameLast, 
         'Total HRs'=n) %>%
  kable(row.names=FALSE, 
      padding = 2, align='c')
First Name Last Name Total HRs
Barry Bonds 762
Hank Aaron 755
Babe Ruth 714
Willie Mays 660
Alex Rodriguez 654

about Home Runs with Seasons …

# Top 5 all-time players in MLB history who have hit most home runs
# add season information
batting_df %>%
  select(playerID, HR, stint) %>%
  group_by(playerID) %>%
  summarise(Total_HR = sum(HR), Seasons = sum(stint==1L)) %>% 
  arrange(desc(Total_HR)) %>%
  head(5) %>%
  left_join(
    person_df %>%
      select(playerID, nameFirst, nameLast), 
    by='playerID') %>%
  select('First Name'=nameFirst, 'Last Name'=nameLast, 
         'Total HRs'=Total_HR, 'Seasons'=Seasons) %>% 
  kable(row.names=FALSE, 
      padding = 2, align='c')
First Name Last Name Total HRs Seasons
Barry Bonds 762 22
Hank Aaron 755 23
Babe Ruth 714 22
Willie Mays 660 22
Alex Rodriguez 654 20

about Scored Runs …

# Top 5 all-time players in MLB history who have most scored runs
batting_df %>%  
  count(playerID, wt = R, sort = TRUE) %>%
  head(5) %>%
  left_join(
    person_df %>%
      select(playerID, nameFirst, nameLast), 
    by='playerID') %>%
  select('First Name'=nameFirst, 'Last Name'=nameLast, 
         'Total Runs'=n) %>%
  kable(row.names=FALSE, 
      padding = 2, align='c')
First Name Last Name Total Runs
Rickey Henderson 2295
Ty Cobb 2246
Barry Bonds 2227
Hank Aaron 2174
Babe Ruth 2174

about Batting Average …

# Top 5 all-time players in MLB history who have highest batting average
# under the conditions - show up more than 1000 games and 1000 at bats
batting_df %>% 
  select(playerID, H, AB, G) %>%
  group_by(playerID) %>%
  summarise(
    Games = sum(G, na.rm=TRUE),
    AtBats = sum(AB, na.rm=TRUE),
    BA = round((sum(H, na.rm=TRUE) / sum(AB, na.rm=TRUE)), 3)
  ) %>%
  filter(Games > 1000, AtBats > 1000) %>% 
  select(playerID, BA) %>%
  arrange(desc(BA)) %>%
  head(5) %>%
  left_join(
    person_df %>%
      select(playerID, nameFirst, nameLast), 
    by='playerID') %>%
  select('First Name'=nameFirst, 'Last Name'=nameLast, 
         'Batting Average'=BA) %>% 
  kable(row.names=FALSE, 
      padding = 2, align='c')
First Name Last Name Batting Average
Ty Cobb 0.366
Rogers Hornsby 0.358
Shoeless Joe Jackson 0.356
Ed Delahanty 0.346
Tris Speaker 0.345

about Consecutive Seasons …

# Top 5 all-time players in MLB history who have played most consecutive seasons
batting_df %>%  
  count(playerID, wt = (stint==1L), sort = TRUE) %>%
  #  filter(rank(desc(n),ties.method='first') <= 5) %>% # same as row_number()
  filter(row_number(desc(n)) <= 5) %>%
  left_join(
    person_df %>%
      select(playerID, nameFirst, nameLast), 
    by='playerID') %>%
  select('First Name'=nameFirst, 'Last Name'=nameLast, 
         'Consecutive Seasons'=n) %>%
  kable(row.names=FALSE, 
        padding = 2, align='c')
First Name Last Name Consecutive Seasons
Cap Anson 27
Nolan Ryan 27
Tommy John 26
Deacon McGuire 26
Eddie Collins 25

about Strikeouts …

# Top 5 all-time players in MLB history who have most strikeouts
pitching_df %>%
  count(playerID, wt=SO, sort=T) %>%
  head(5) %>%
  left_join(
    person_df %>%
      select(playerID, nameFirst, nameLast), 
    by='playerID') %>%
  select('First Name'=nameFirst, 'Last Name'=nameLast, 
         'Total Strikeouts'=n) %>%
  kable(row.names=FALSE, 
        padding = 2, align='c')  
First Name Last Name Total Strikeouts
Nolan Ryan 5714
Randy Johnson 4875
Roger Clemens 4672
Steve Carlton 4136
Bert Blyleven 3701

about Strikeouts with Seasons …

# Top 5 all-time players in MLB history who have most strikeouts
# add seasons information
pitching_df %>%
  select(playerID, SO, stint) %>%
  group_by(playerID) %>%
  summarise(Total_Strikeouts = sum(SO), Seasons = sum(stint==1L)) %>%
  arrange(desc(Total_Strikeouts)) %>%
  head(5) %>%
  left_join(
    person_df %>%
      select(playerID, nameFirst, nameLast), 
    by='playerID') %>%
  select('First Name'=nameFirst, 'Last Name'=nameLast, 
         'Total Strikeouts'=Total_Strikeouts, Seasons) %>%
  kable(row.names=FALSE, 
        padding = 2, align='c')  
First Name Last Name Total Strikeouts Seasons
Nolan Ryan 5714 27
Randy Johnson 4875 22
Roger Clemens 4672 24
Steve Carlton 4136 24
Bert Blyleven 3701 22

about Teams in 2014 …

# How many teams are in MLB in 2014?
teams_df %>%
  filter(yearID==2014) %>%
  select(teamID) %>%
  distinct() %>%
  summarize('Total Teams'=n()) %>%
  kable(row.names=FALSE, 
        padding = 2, align='c') 
Total Teams
30

about Champions with Leagues …

# How many World Series Titles have been won by each League?
series_df %>%
  filter(round=='WS') %>%
  group_by(League=lgIDwinner) %>%
  summarize('Champions'=n()) %>%
  kable(row.names=FALSE, 
        padding = 2, align='c')  
League Champions
AA 1
AL 63
NL 53

about Champions with Teams …

# Top 15 Teams in MLB history have won the most World Series Titles
series_df %>%
  filter(round=='WS') %>%
  group_by(teamIDwinner) %>%
  summarize(Champs=n(), FirstYear=min(yearID), 
            LastYear=max(yearID)) %>%
  arrange(desc(Champs)) %>%
  head(15) %>%
  left_join (
    teams_df %>%
      select(yearID, teamID, name),
    by = c("teamIDwinner"="teamID", "LastYear"="yearID")
  ) %>%
  select('Team Name'=name, 'Champions'=Champs, 
         'First Year'=FirstYear, 'Last Year'=LastYear) %>%
  kable(row.names=FALSE, 
        padding = 2, align='c')
Team Name Champions First Year Last Year
New York Yankees 27 1923 2009
St. Louis Cardinals 11 1926 2011
Boston Red Sox 8 1903 2013
Cincinnati Reds 5 1919 1990
Los Angeles Dodgers 5 1959 1988
New York Giants 5 1905 1954
Philadelphia Athletics 5 1910 1930
Pittsburgh Pirates 5 1909 1979
Detroit Tigers 4 1935 1984
Oakland Athletics 4 1972 1989
Baltimore Orioles 3 1966 1983
Chicago White Sox 3 1906 2005
San Francisco Giants 3 2010 2014
Brooklyn Dodgers 2 1890 1955
Chicago Cubs 2 1907 1908