MLB-Bayesian-Ridge/Model_Fitting.Rmd at main · dteuscher1/MLB-Bayesian-Ridge · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
---
output: pdf_document
---

```{r, eval=FALSE}
# Load packages
library(baseballr)
# Alternatively, using the devtools package:
#devtools::install_github(repo = "BillPetti/baseballr")
library(tidyverse)
library(mvtnorm)
library(rstan)
library(shinystan)
library(latex2exp)
library(glmnet)
library(patchwork)

# Load in FIP data from Fangraphs; I didn't end up using 2017, but I have the
# data anyway
data_2016 <- fg_pitch_leaders(2016, 2016, qual = 40, ind = 0)
data_2017 <- fg_pitch_leaders(2017, 2017, qual = 40, ind = 0)

# Combine the data frames together with FIP
data <- data_2016 %>%
    bind_rows(data_2017) %>%
    mutate(Season = as.numeric(Season))

# Read in statcast variables with covariates for modeling
savant <- read.csv("Data/stats.csv") %>%
    # Combine first and last name together
    mutate(Name = str_trim(paste(first_name, last_name)))

# Join the two datasets together
combined <- savant %>% inner_join(data %>% select(Name, FIP, Season),
                                  by = c("Name", "year" = "Season"))

# Filter the data to included right handed pitchers who pitched more than 90
# innings in 2016
small <- combined %>% select(-X, -player_id, -xba, -xslg, -xwoba, -xobp, -xiso,
                             -wobacon, -exit_velocity_avg, -barrel_batted_rate,
                             -hard_hit_percent) %>%
    filter(year == 2016) %>%
    filter(p_formatted_ip > 90) %>%
    filter(pitch_hand == "R") %>%
    select(-p_formatted_ip, -pitch_hand) %>%
    # Replace a few NA observations with 0; Some pitchers didn't throw
    # breaking or offspeed pitches, so it makes sense to input 0 for those
    # variables
    replace(is.na(.), 0)

# Create data to use for Stan
N <- nrow(small)
# Standardize the covariates
X <- model.matrix(FIP ~.-1-Name-year-last_name-first_name, data = small) %>%
    scale()
K <- ncol(X)
y <- small$FIP


# Hand draws
# Number of iterations
ndraws <- 10000

# Create matricies or vectors to store draws
beta.draws <- matrix(0, nrow = ndraws, ncol = ncol(X))
alpha.draws <- numeric(length = ndraws)
sigma2 <- numeric(length = ndraws)
sigma2_beta <- numeric(length = ndraws)

# Set initial values
alpha.draws[1] <- 4
sigma2[1] <- .7
sigma2_beta[1] <- 2
a <- 40
b <- 8
X_mat <- as.matrix(X)
# Set counter for number of accepted draws
accept <- 0

# Metropolis Hastings and Gibbs Sampler
# Sample beta, sigma, and alpha from the full conditional using Gibbs sampling
# Sample sigma_beta using Metropolis Hastings
for(i in 2:ndraws){
    # Subtract alpha so r ~ N(Xbeta, sigma^2)
    r <- y - alpha.draws[i-1]
    sigma_mat <- solve((t(X_mat) %*% X_mat)/sigma2[i-1] +
                           1/sigma2_beta[i-1] *diag(49))
    mu_vec <- sigma_mat %*% ((t(X) %*% r)/sigma2[i-1])

    # Draw from full conditional for beta
    beta.draws[i, ] <- rmvnorm(1, mean = mu_vec, sigma = sigma_mat)
    a_star <- 2 + (nrow(X_mat)/2)
    b_star <- 10 + .5*sum((r - (X_mat %*% beta.draws[i, ]))^2)

    # Draw from full conditional for sigma
    sigma2[i] <- 1/rgamma(1, a_star, b_star)

    # Subtract Xbeta so r ~ N(alpha, sigma^2)
    r <- y - (X_mat %*% beta.draws[i, ])
    alpha_mat <- matrix(1, ncol = 1, nrow = nrow(X))
    sigma_mat <- solve((t(alpha_mat) %*% alpha_mat)/sigma2[i] +
                           1/sigma2_beta[i-1] *diag(1))
    mu_vec <- sigma_mat %*% ((t(alpha_mat) %*% r)/sigma2[i])
    # Draw from full conditional for beta
    alpha.draws[i] <- rmvnorm(1, mean = mu_vec, sigma = sigma_mat)

    # Propose value for sigma^2 beta
    proposed_value <- rnorm(1, mean = sigma2_beta[i-1], sd = .4)

    # Metropolis Hastings for sigma^2 beta using Gaussian random walk
    if(proposed_value > 0){
        mh <- sum(dnorm(beta.draws[i, ], 0, proposed_value, log = T)) +
            sum(dgamma(proposed_value, a, b, log = T)) -
            sum(dnorm(beta.draws[i, ], 0, sigma2_beta[i-1], log = T)) -
            sum(dgamma(sigma2_beta[i-1], a, b, log = T))
        if(log(runif(1)) < mh){
            sigma2_beta[i] <- proposed_value
            accept <- accept + 1
        } else{
            sigma2_beta[i] <- sigma2_beta[i-1]
        }
    } else{
        sigma2_beta[i] <- sigma2_beta[i-1]
    }
}

# Trace plots for parameters
plot(sigma2_beta, type = "l")
plot(alpha.draws, type = "l")
plot(beta.draws[,1], type = "l")
plot(sigma2, type = "l")

# Effective Sample Size for hand draws
coda::effectiveSize(sigma2_beta)
coda::effectiveSize(alpha.draws)
coda::effectiveSize(sigma2)
coda::effectiveSize(beta.draws)


# Fit first Stan model
data <- list(N = N, K = K, X = X, y = y, a = 40, b = 8)

### Run the model and examine results
nCores <- parallel::detectCores()
options(mc.cores = nCores)          # Use all available cores
rstan_options(auto_write = TRUE)    # Cache compiled code.

# Fit model
fit3 <- stan(model_code = readLines("MLR_Model.stan"),
            data = data, iter = 10000, warmup = 1000, thin = 2, chains = 2)

# Extract samples
samples3 <- rstan::extract(fit3)

# Explore results and diagnostics using Shinystan; This just requires putting
# the fitted model into this function and then you can explore the results
# With so many parameters, it was easier to look at things using this
launch_shinystan(fit3)

# Sensitivity analysis
## Fit model 2
data <- list(N = N, K = K, X = X, y = y, a = 36, b = 12)

fit <- stan(model_code = readLines("MLR_Model.stan"),
            data = data, iter = 10000, warmup = 1000, thin = 2, chains = 2)


samples <- rstan::extract(fit)
launch_shinystan(fit)

## Fit model 3
data <- list(N = N, K = K, X = X, y = y, a = 1, b = 1)

fit2 <- stan(model_code = readLines("MLR_Model.stan"),
            data = data, iter = 10000, warmup = 1000, thin = 2, chains = 2)


samples2 <- rstan::extract(fit2)
launch_shinystan(fit2)

# Combine the samples together
chains <- cbind(samples3[[1]],samples3[[2]],samples3[[3]], samples3[[4]])
sims <- as.mcmc(chains)

# Calculate Raftery Lewis diagnostic
r_l <- raftery.diag(sims)

# Create data frame with the chains
chains_frame <- data.frame(chains, iter = 1:nrow(chains))

# Plot some of the trace plots
p1 <- ggplot(chains_frame, aes(x = iter, y = X5)) +
    geom_line() +
    theme_minimal() +
    labs(x = "Iteration", y =  TeX(r'($\beta_5$)'))


p2 <- ggplot(chains_frame, aes(x = iter, y = X50)) +
    geom_line() +
    theme_minimal() +
    labs(x = "Iteration", y = TeX(r'($\sigma^2$)'))

p3 <- ggplot(chains_frame, aes(x = iter, y = X51)) +
    geom_line() +
    theme_minimal() +
    labs(x = "Iteration", y = TeX(r'($\sigma_b^2$)'))

p4 <- ggplot(chains_frame, aes(x = iter, y = X52)) +
    geom_line() +
    theme_minimal() +
    labs(x = "Iteration", y = TeX(r'($\alpha$)'))

p1 + p2 + p3 + p4

# Frequentist
cv.out <- cv.glmnet(as.matrix(X),y,alpha=0, standardize = TRUE)
# Plot the cross validation for the best lambda value
plot(cv.out)
# Extract the best lambda value
bestlam <- cv.out$lambda.min

# Fit model with the best lambda
mod <- glmnet(X, y , alpha = 0, lambda = bestlam, standardize = TRUE)

# Bootstrap the data and get coefficients with each bootstrap dataset
bootstrap_coef <- matrix(0, nrow = 500, ncol = 50)
for(i in 1:500){
    n <- nrow(small)
    obs <- sample(1:n, n, replace = TRUE)
    boot_sample <- small[obs, ]
    x <- model.matrix(FIP ~.-1-Name-year-last_name-first_name, data = boot_sample)
    y <- boot_sample$FIP
    model <- glmnet(x, y, alpha = 0, lambda = bestlam)
    coef <- as.vector(predict(model,type="coefficients",s=bestlam))
    bootstrap_coef[i, ] <- coef
}

# Calculate confidence intervals and estimates
cis <- apply(bootstrap_coef, 2, quantile, c(.025, .975))
boot_est <- apply(bootstrap_coef, 2, mean)
vars <- rownames(predict(model,type="coefficients",s=bestlam))
ridge_freq <- data.frame(Variable = vars, Lower = cis[1, ],
                         Estimate = boot_est, Upper = cis[2,]) %>%
    arrange(desc(Estimate))

# Examine the coefficents that don't have 0 in the interval
ridge_freq %>% filter((Lower > 0) | Upper < 0) %>% arrange(desc(Estimate))
```