# Loading packages
# dplyr provides filter(), group_by(), summarize(), mutate() and the joins
# used below; ggplot2 provides ggplot() and the geoms. Without attaching
# them, the unqualified calls in this script fail (filter() would resolve
# to stats::filter).
library(dplyr)
library(ggplot2)

# Load data from CSVs
# File paths are relative to the current working directory.
player_data <- read.csv("Master.csv")
pitching_data <- read.csv("Pitching.csv")
salary_data <- read.csv("Salaries.csv")
inflation_index <- read.csv("inflation.csv")
The goal of the analysis in these graphs is to reveal some conclusions about baseball data. Specifically, I wanted to determine how ERA has changed over time. I also looked at how salaries were affected by place of birth, inside the US or not.
The data was pulled from *http://www.seanlahman.com/baseball-archive/statistics/*. I used Master.csv, Pitching.csv, and Salaries.csv.
# Graph 1: box plot of ERA for every season.
# yearID is turned into a factor so that each season gets its own box.
pitching_data[["yearID"]] <- as.factor(pitching_data[["yearID"]])
# Keep only the rows that actually have an ERA value.
pitching_data <- filter(pitching_data, !is.na(ERA))
ggplot(pitching_data) +
  geom_boxplot(aes(x = yearID, y = ERA))
Graph 1: This graph shows the distribution of ERAs for each year. After filtering out rows with missing (NA) ERA values, I created a simple boxplot with year as the x variable and ERA as the y variable.
# Graph 2: median ERA per season.
# Build a per-season summary of ERA (quartiles, median and extremes).
# The Q1/Q3 columns are also what the ribbon plot of summary_ERA reads.
pitching_data$yearID <- as.factor(pitching_data$yearID)
summary_ERA <- summarize(
  group_by(pitching_data, yearID),
  Q1 = quantile(ERA, .25, na.rm = TRUE),
  median = median(ERA, na.rm = TRUE),
  Q3 = quantile(ERA, .75, na.rm = TRUE),
  min = min(ERA, na.rm = TRUE),
  max = max(ERA, na.rm = TRUE)
)
# Go through character so the factor's labels (the years), not its
# internal integer codes, become the numeric x axis.
summary_ERA$yearID <- as.numeric(as.character(summary_ERA$yearID))
ggplot(summary_ERA) +
  geom_line(aes(x = yearID, y = median))
Graph 2: This line graph shows how the median ERA has changed over time. To create this graph, I had to create a summary dataset containing the first quartile, median, third quartile, minimum, and maximum of the ERA variable for each year.
# Graph 3: interquartile band (Q1 to Q3) of ERA per season drawn as a
# light green ribbon, with the median ERA overlaid as a dark blue line.
ggplot(summary_ERA) +
  geom_ribbon(
    aes(x = yearID, ymin = Q1, ymax = Q3),
    fill = "lightgreen"
  ) +
  geom_line(
    aes(x = yearID, y = median),
    color = "darkblue"
  )
Graph 3: This graph shows the range between the first and third quartiles as depicted by the green ribbon, with the median ERA as the dark blue line.
# variables
# Thresholds for Graph 4. The summary below reads these constants, so the
# cut-offs can be changed in one place. Note that the summary column names
# (era_below_3, ...) and the legend labels still spell out 3 and 6 and
# must be edited by hand if the thresholds change.
min_games_pitched <- 10
low_era <- 3
high_era <- 6
pitching_data$ERA <- as.numeric(pitching_data$ERA)
# Keep only pitcher rows with at least min_games_pitched games (column G)
games_pitched_filtered <- filter(pitching_data, G >= min_games_pitched)
# yearID may be a factor at this point; convert through character so the
# year labels, not the factor codes, become numbers.
games_pitched_filtered$yearID <-
  as.numeric(as.character(games_pitched_filtered$yearID))
# Per-season counts and proportions of low / high ERAs, plus the extremes.
summary_ERA_2 <- summarize(
  group_by(games_pitched_filtered, yearID),
  era_below_3 = sum(ERA <= low_era, na.rm = TRUE),
  era_above_6 = sum(ERA >= high_era, na.rm = TRUE),
  below_3_eras_proportion = mean(ERA <= low_era, na.rm = TRUE),
  above_6_eras_proportion = mean(ERA >= high_era, na.rm = TRUE),
  top_era = max(ERA),
  bottom_era = min(ERA)
)
summary_ERA_2$yearID <- as.numeric(as.character(summary_ERA_2$yearID))
# Graph 4: one line per threshold; the colour aesthetic is mapped to a
# constant label so that scale_color_manual can build the legend.
ggplot(summary_ERA_2) +
  geom_line(aes(x = yearID, y = below_3_eras_proportion, color = "3 or under")) +
  geom_line(aes(x = yearID, y = above_6_eras_proportion, color = "6 or higher")) +
  scale_color_manual(
    values = c("3 or under" = "darkblue", "6 or higher" = "red"),
    name = "ERA"
  ) +
  labs(x = "Year", y = "Proportion", title = "Proportion of Pitchers (pitching at least 10 games)\n With Low and High ERAs by Year")
Graph 4: This graph displays the proportion of pitchers who had an ERA less than or equal to 3 and the proportion who had an ERA greater than or equal to 6.
# Salary preparation for Graph 5.
# Rename the first column of the inflation table to yearID so it can be
# joined to the salary summary (presumably that column holds the year --
# verify against inflation.csv).
names(inflation_index)[1] <- "yearID"
salary_data$playerID <- as.character(salary_data$playerID)
player_data$playerID <- as.character(player_data$playerID)
# Coerce salary to numeric BEFORE joining and summarising; doing it
# afterwards has no effect on the statistics computed below. Going through
# character keeps the values intact even if the column was read as a factor.
salary_data$salary <- as.numeric(as.character(salary_data$salary))
country_data <- inner_join(player_data, salary_data, by = "playerID")
# Label each row by birthplace. Any birthCountry other than "USA"
# (including an empty string) falls into "Born outside USA"; NA stays NA.
country_data <- mutate(
  country_data,
  usa_born = ifelse(birthCountry == "USA", "Born in USA", "Born outside USA")
)
# Salary distribution per season and birthplace group.
salary_sum <- summarize(
  group_by(country_data, yearID, usa_born),
  Q1 = quantile(salary, .25, na.rm = TRUE),
  median = median(salary, na.rm = TRUE),
  Q3 = quantile(salary, .75, na.rm = TRUE),
  min = min(salary, na.rm = TRUE),
  max = max(salary, na.rm = TRUE)
)
salary_sum$yearID <- as.numeric(as.character(salary_sum$yearID))
country_summary_left <- left_join(salary_sum, inflation_index, by = "yearID")
# Force the 2015 multiplier to 1 (the column name suggests 2015 is the
# reference year -- TODO confirm against inflation.csv).
country_summary_left[country_summary_left$yearID == 2015, "inflation2015"] <- 1
# Scale every summary statistic by the inflation multiplier.
country_summary <- mutate(
  country_summary_left,
  median_inflation_adjusted = median * inflation2015,
  Q1_inflation_adjusted = Q1 * inflation2015,
  Q3_inflation_adjusted = Q3 * inflation2015,
  min_inflation_adjusted = min * inflation2015,
  max_inflation_adjusted = max * inflation2015
)
# NOTE(review): the lines below are the tail of the Graph 5 plot call. The
# first line closes an aes(...) and a geom call that are never opened here,
# and the last line ends on a dangling `+`, so this fragment does not parse
# on its own. The opening of the expression (presumably a ggplot() call on
# the salary summary plus a ribbon/line layer -- confirm against the
# original document) must be restored before this chunk can run.
# Visible parts: fill mapped to usa_born at 40% opacity, axis/title labels,
# and legend titles for the colour and fill scales.
fill=usa_born), alpha=.4)+
labs(y="Annual Salary \n (Adjusted for Inflation)", x="Year", title="Salaries of Middle 50% of Earners in Major League Baseball") +
scale_color_discrete(name="Median") +
scale_fill_discrete(name="Middle 50% Earners") +
Graph 5: This graph displays the salaries of the middle 50% of earners, split between players who were born in the United States and those who were born outside the United States.
The line graphs show that ERA has fluctuated over the years, but the range over which it fluctuates has narrowed over time. The proportion of ERAs of 3 or under has decreased considerably over time, leading me to believe that batters have gotten better over the years. Another conclusion can be drawn from the fifth graph: the median salary for players born outside the US appears to be higher than for those born in the US.