-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbaseline_table_generator.py
More file actions
102 lines (88 loc) · 4.96 KB
/
baseline_table_generator.py
File metadata and controls
102 lines (88 loc) · 4.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# baseline_table_generator.py
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency, kruskal, shapiro, f_oneway, levene, ttest_ind, mannwhitneyu
def bs_count(sample):
    """Build a descriptive summary table with one row per variable/level.

    Columns of *sample* with at most three distinct non-null values are
    treated as categorical and produce one row per level, formatted as
    ``"count (percent)"`` with the percentage computed over non-null values
    and shown to one decimal. Every other column is treated as numeric and
    produces a single row formatted as ``"median (Q1-Q3)"``.

    Args:
        sample: DataFrame to summarise.
            NOTE(review): columns with more than three distinct values are
            passed to ``median``/``quantile`` and are therefore assumed to
            be numeric -- confirm against callers.

    Returns:
        DataFrame with columns ``Variable`` (``"<column>_<level>"`` for
        categorical rows, the column name for numeric rows), ``Count`` (the
        formatted summary string) and ``Base_Col`` (the originating column
        name). Categorical rows come first, then numeric rows, each in the
        column order of *sample*.
    """
    categorical = [col for col in sample.columns if sample[col].nunique() <= 3]
    categorical_lookup = set(categorical)

    # Rows are collected in a plain list and turned into a frame once;
    # growing a DataFrame with pd.concat inside the loop is quadratic.
    rows = []

    for col in categorical:
        counts = sample[col].value_counts().sort_index()
        # Same quantity as value_counts(normalize=True), without recounting.
        percentages = (counts / counts.sum() * 100).round(1)
        for level, n, pct in zip(counts.index, counts.to_numpy(), percentages.to_numpy()):
            rows.append({
                'Variable': f"{col}_{level}",
                'Count': f"{n:,.0f} ({pct:.1f})",
                # Record the source column directly. Recovering it from the
                # Variable string by stripping the last "_suffix" breaks for
                # names or levels that themselves contain underscores.
                'Base_Col': col,
            })

    for col in sample.columns:
        if col in categorical_lookup:
            continue
        values = sample[col]
        med = values.median()
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        rows.append({
            'Variable': col,
            'Count': f"{med:.1f} ({q1:.1f}-{q3:.1f})",
            'Base_Col': col,
        })

    return pd.DataFrame(rows, columns=['Variable', 'Count', 'Base_Col'])
def _format_p_value(p):
    """Render a p-value as ``'<0.001'`` or rounded to three decimals."""
    return '<0.001' if p < 0.001 else round(p, 3)


def _categorical_rows(data, response_col, col):
    """Return one result row per level of categorical column *col*."""
    subset = data.dropna(subset=[col])
    # observed=False keeps every response category as a row, independent of
    # the pandas version's default for categorical groupers.
    counts = (
        subset.groupby([response_col, col], observed=False)
        .size()
        .unstack(fill_value=0)
    )
    # Row percentages: share of each level within a response group.
    percentages = counts.div(counts.sum(axis=1), axis=0) * 100

    rows = []
    for level in counts.columns:
        row = {'Variable': f"{col}_{level}", 'p-value': None, 'Test': 'Chi-square'}
        for group in counts.index:
            row[f'classification_{group}'] = (
                f"{counts.loc[group, level]:,} ({percentages.loc[group, level]:.0f})"
            )

        # Each level is tested on its own: "this level vs. all others"
        # against the response groups.
        is_level = (subset[col] == level).astype(int)
        table = pd.crosstab(is_level, subset[response_col])
        if table.shape[0] > 1 and table.shape[1] > 1:
            chi2_p = chi2_contingency(table, correction=False)[1]
        else:
            # Degenerate table: no test is possible, reported as p = 1.0.
            chi2_p = 1.0

        row['p-value'] = _format_p_value(chi2_p)
        rows.append(row)
    return rows


def _numeric_row(data, response_col, col):
    """Return the result row comparing numeric column *col* across groups."""
    subset = data.dropna(subset=[col])
    categories = subset[response_col].cat.categories
    groups = [subset.loc[subset[response_col] == cat, col] for cat in categories]

    # Shapiro-Wilk per group; groups with fewer than 3 values get p = 0 and
    # are therefore treated as non-normal.
    normality = [shapiro(g)[1] if len(g) >= 3 else 0 for g in groups]
    is_normal = all(p > 0.05 for p in normality)

    row = {'Variable': col, 'p-value': None}
    for cat, values in zip(categories, groups):
        row[f'classification_{cat}'] = (
            f"{values.median():.1f} "
            f"({values.quantile(0.25):.1f}-{values.quantile(0.75):.1f})"
        )

    if len(groups) == 2:
        if is_normal:
            # Levene's test decides between Student's and Welch's t-test.
            _, p_levene = levene(*groups)
            equal_var = p_levene > 0.05
            _, p = ttest_ind(*groups, equal_var=equal_var)
            test_used = "t-test" if equal_var else "Welch's t-test"
        else:
            _, p = mannwhitneyu(*groups)
            test_used = "Mann-Whitney U"
    elif is_normal:
        _, p = f_oneway(*groups)
        test_used = "ANOVA"
    else:
        _, p = kruskal(*groups)
        test_used = "Kruskal-Wallis"

    row['p-value'] = _format_p_value(p)
    row['Test'] = test_used
    return row


def bs_res_count(sample, response_col):
    """Build a baseline table stratified by *response_col* with p-values.

    Rows with a missing response are dropped. Columns with at most three
    distinct non-null values are treated as categorical: each level gets a
    row with ``"count (row-percent)"`` per response group and a chi-square
    p-value for that level versus all other levels. Every remaining column
    is treated as numeric: it gets ``"median (Q1-Q3)"`` per response group
    and a p-value from a test chosen by group count and Shapiro-Wilk
    normality (t-test / Welch's t-test / Mann-Whitney U for two groups,
    ANOVA / Kruskal-Wallis otherwise).

    The input DataFrame is not modified.

    Args:
        sample: DataFrame holding the response and the variables.
            NOTE(review): columns with more than three distinct values are
            assumed to be numeric -- confirm against callers.
        response_col: Name of the grouping column.

    Returns:
        DataFrame with columns ``Variable``, one ``classification_<group>``
        column per response group, ``p-value`` (``'<0.001'`` or a float
        rounded to 3 decimals) and ``Test``. If there is nothing to analyse,
        an empty frame with ``Variable``, ``p-value`` and ``Test`` columns.
    """
    # Work on a copy so the caller's frame keeps its original dtype.
    data = sample.dropna(subset=[response_col]).copy()
    data[response_col] = data[response_col].astype('category')

    cat_cols = [
        col for col in data.columns
        if col != response_col and data[col].nunique() <= 3
    ]
    excluded = set(cat_cols) | {response_col}
    num_cols = [col for col in data.columns if col not in excluded]

    rows = []
    for col in cat_cols:
        rows.extend(_categorical_rows(data, response_col, col))
    for col in num_cols:
        rows.append(_numeric_row(data, response_col, col))

    if not rows:
        return pd.DataFrame(columns=['Variable', 'p-value', 'Test'])

    final_results = pd.DataFrame(rows)
    classification_cols = [
        col for col in final_results.columns if col.startswith('classification')
    ]
    return final_results[['Variable'] + classification_cols + ['p-value', 'Test']]
def export_to_excel(df, filename='baseline_table.xlsx'):
    """Write *df* to an Excel file and print a confirmation line.

    Args:
        df: Object exposing ``to_excel`` (e.g. a pandas DataFrame).
        filename: Destination path; defaults to ``baseline_table.xlsx``.
            The index is not written.
    """
    df.to_excel(filename, index=False)
    confirmation = f"✅ Excel file saved: {filename}"
    print(confirmation)