DifferentiableProgrammingForStatisticalModeling/06_SensitivityAnalysis_RecoveringCensoredValues.jl at main · maren-ha/DifferentiableProgrammingForStatisticalModeling · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
"""
Sensitivity Analysis II: Recovering randomly censored values from all hospitals with no missing reports
"""

#
# Setup
#

# cd("path/to/Github/repo")
path_to_repo = pwd()
include("src/setup.jl")
include("src/lov.jl")

#
# First, identify densely sampled hospitals: find all hospitals where no observations are missing
#

nhospitals = length(hospital_list)
allreported_inds = findall(i -> sum(hospital_list[i][!,:notreported])== 0, 1:nhospitals)

#
# Scenario 1: for those, randomly remove 10, 25 and 50 percent of values
#

# initialize array to save prediction errors
prederrs_df_all = []

# fit prediction models (all local coefficients)
for frac_missing in [0.1, 0.25, 0.5, 0.75]

    @info "Starting scenario with $(frac_missing) percent random censoring"

    betas = collect([0.0; 0.0; 0.0] for ind in 1:length(allreported_inds));
    lrbetas = collect([0.0; 0.0; 0.0] for ind in 1:length(allreported_inds));

    prederrs_mean = []
    prederrs_modmean = []
    prederrs_zero = []
    prederrs_lr = []
    prederrs_incr = []

    Random.seed!(42);
    nreps = 100

    @showprogress for ind in 1:length(allreported_inds)
        i = allreported_inds[ind]
        true_y, z, r = hospital_list[i][:,:currentcases], hospital_list[i][:,:incidences], hospital_list[i][:,:reported]

        prederr_incr = 0.0
        prederr_lr = 0.0
        prederr_zero = 0.0
        prederr_mean = 0.0
        prederr_modmean = 0.0

        mod_r = modify_r_wrapper(frac_missing)

        prederr_mean, prederr_modmean, prederr_zero, prederr_lr, prederr_incr, betas_reps, lrbetas_reps = fitmodels_lov(hospital_list, i, nreps, mod_r)

        betas[ind] = [mean(collect(betas_reps[rep][1] for rep in 1:nreps)); mean(collect(betas_reps[rep][2] for rep in 1:nreps)); mean(collect(betas_reps[rep][3] for rep in 1:nreps))]
        lrbetas[ind] = [mean(collect(lrbetas_reps[rep][1] for rep in 1:nreps)); mean(collect(lrbetas_reps[rep][2] for rep in 1:nreps)); mean(collect(lrbetas_reps[rep][3] for rep in 1:nreps))]

        push!(prederrs_mean, prederr_mean/nreps)
        push!(prederrs_modmean, prederr_modmean/nreps)
        push!(prederrs_zero, prederr_zero/nreps)
        push!(prederrs_lr, prederr_lr/nreps)
        push!(prederrs_incr, prederr_incr/nreps)
    end

    prederrs_df = create_prederrs_df(prederrs_mean, prederrs_modmean, prederrs_zero, prederrs_lr, prederrs_incr)
    push!(prederrs_df_all, prederrs_df)
end

# save results
path_to_results = string(path_to_repo, "/results/")
save(string(path_to_results, "randomcensoring_prederrs.jld2"), Dict("all_prederrs_dfs" => prederrs_df_all))

# if you have done the analyses before, load and create boxplots and summary statistics
prederrs_dict = load(string(path_to_results, "randomcensoring_prederrs.jld2"))
prederrs_df_all = prederrs_dict["all_prederrs_dfs"]

# create dataframe of summary statistics
nmodels = 5
nscenarios= 4
mses = []
medians = []
firstquartiles = []
thirdquartiles = []
maximums = []
for i in 1:nscenarios
    cur_pred_df = prederrs_df_all[i]
    push!(mses, round.(sum.(eachcol(cur_pred_df[!,2:end]))/nrow(cur_pred_df), digits = 3))
    push!(medians,round.(median.(eachcol(cur_pred_df[!,2:end])), digits = 3))
    push!(firstquartiles, round.(quantile.(eachcol(cur_pred_df[!,2:end]), 0.25), digits = 3))
    push!(thirdquartiles, round.(quantile.(eachcol(cur_pred_df[!,2:end]), 0.75), digits = 3))
    push!(maximums, round.(maximum.(eachcol(cur_pred_df[!,2:end])), digits=3))
end

summary_df = DataFrame(Scenario = repeat(["$(i)" for i in 1:nscenarios], inner=nmodels),
                    Model = repeat(["Mean model", "Modified mean model", "Zero model", "Linear regression", "Increment model"], nscenarios),
                    MSE = cat(mses..., dims=1),
                    FirstQuartile = cat(firstquartiles..., dims=1),
                    Median = cat(medians..., dims=1),
                    ThirdQuartile = cat(thirdquartiles..., dims=1),
                    Maximum = cat(maximums..., dims=1)
)
CSV.write(string(path_to_results, "randomcensoring_summary.csv"), summary_df)