data(SeriesPost)
# How many times has each team won the World Series?
# Notes:
# - the SeriesPost table includes an identifier for the
# team (teamID), but not the franchise (e.g. the Brooklyn Dodgers
# [BRO] and Los Angeles Dodgers [LAN] are counted separately)
#
# - the World Series was first played in 1903, but the
# Lahman data tables have the final round of the earlier
# playoffs labelled "WS", so it is necessary to
# filter the SeriesPost table to exclude years prior to 1903.
# using the dplyr data manipulation package
library("dplyr")
library("tidyr")
library("ggplot2")
## WS winners, arranged in descending order of titles won
ws_winner_table <- SeriesPost %>%
filter(yearID > "1902", round == "WS") %>%
group_by(teamIDwinner) %>%
summarise(wincount = n()) %>%
arrange(desc(wincount))
ws_winner_table
## Expanded form of World Series team data in modern era
ws <- SeriesPost %>%
filter(yearID >= 1903 & round == "WS") %>%
select(-ties, -round) %>%
mutate(lgIDloser = droplevels(lgIDloser),
lgIDwinner = droplevels(lgIDwinner))
# Bar chart of length of series (# games played)
# 1903, 1919 and 1921 had eight games
ggplot(ws, aes(x = wins + losses)) +
geom_bar(fill = "dodgerblue") +
labs(x = "Number of games", y = "Frequency")
# Last year the Cubs appeared in the WS
ws %>%
filter(teamIDwinner == "CHN" | teamIDloser == "CHN") %>%
summarise(max(yearID))
# Dot chart of number of WS appearances by teamID
ws %>%
gather(wl, team, teamIDwinner, teamIDloser) %>%
count(team) %>%
arrange(desc(n)) %>%
ggplot(., aes(x = reorder(team, n), y = n)) +
theme_bw() +
geom_point(size = 3, color = "dodgerblue") +
geom_segment(aes(xend = reorder(team, n), yend = 0),
linetype = "dotted", color = "dodgerblue",
size = 1) +
labs(x = NULL, y = "Number of WS appearances") +
scale_y_continuous(expand = c(0, 0), limits = c(0, 42)) +
coord_flip() +
theme(axis.text.y = element_text(size = rel(0.8)),
axis.ticks.y = element_blank())
# Initial year of each round of championship series in modern era
SeriesPost %>%
filter(yearID >= 1903) %>% # modern WS started in 1903
group_by(round) %>%
summarise(first_year = min(yearID)) %>%
arrange(first_year)
# Ditto, but with more information about each series played
SeriesPost %>%
filter(yearID >= 1903) %>%
group_by(round) %>%
arrange(yearID) %>%
do(head(., 1)) %>%
select(-lgIDwinner, -lgIDloser) %>%
arrange(yearID, round)
Run the code above in your browser using DataLab