Tables/baseline_table_generator.py at main · HUI135/Tables · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# baseline_table_generator.py

import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency, kruskal, shapiro, f_oneway, levene, ttest_ind, mannwhitneyu

def bs_count(sample: pd.DataFrame) -> pd.DataFrame:
    """Build an overall (ungrouped) baseline summary table.

    Columns with at most three distinct non-null values are treated as
    categorical and produce one row per level, formatted as ``"n (pct)"``
    where ``pct`` is the share of non-null observations with one decimal.
    Every other column is summarised as ``"median (Q1-Q3)"`` with one
    decimal; those columns are expected to be numeric.

    Args:
        sample: Input data, one column per variable.

    Returns:
        DataFrame with columns ``Variable`` (``"<col>_<level>"`` for
        categorical rows, the column name otherwise), ``Count`` (the
        formatted summary) and ``Base_Col`` (the source column of the row).
        Categorical rows come first, then numeric rows, each in column order.
    """
    cat_col = [col for col in sample.columns if sample[col].nunique() <= 3]
    rows = []

    for col in cat_col:
        counts = sample[col].value_counts().sort_index()
        percentages = (sample[col].value_counts(normalize=True) * 100).round(1).sort_index()
        # Both series are sorted on the same index, so they can be walked in
        # lockstep; this avoids ambiguous label/position lookups for integer
        # or boolean levels.
        for level, n, pct in zip(counts.index, counts.to_numpy(), percentages.to_numpy()):
            rows.append({
                'Variable': f"{col}_{level}",
                'Count': f"{n:,.0f} ({pct:.1f})",
                # Track the source column directly: parsing it back out of
                # "<col>_<level>" breaks when either part contains "_".
                'Base_Col': col,
            })

    for col in sample.columns:
        if col in cat_col:
            continue
        med = sample[col].median()
        q1 = sample[col].quantile(0.25)
        q3 = sample[col].quantile(0.75)
        rows.append({
            'Variable': col,
            'Count': f"{med:.1f} ({q1:.1f}-{q3:.1f})",
            'Base_Col': col,
        })

    # Explicit columns keep the schema stable even when there are no rows.
    return pd.DataFrame(rows, columns=['Variable', 'Count', 'Base_Col'])

def _format_p_value(p):
    """Render a p-value as '<0.001' or rounded to three decimals; None passes through."""
    if p is None:
        return None
    return '<0.001' if p < 0.001 else round(p, 3)


def _compare_numeric_groups(groups):
    """Choose and run a group-comparison test; return ``(p_value, test_name)``.

    Groups are treated as normal only if every group has at least three
    observations and a Shapiro-Wilk p-value above 0.05. Two groups use a
    t-test (Welch's when Levene's test rejects equal variances) or
    Mann-Whitney U; three or more use ANOVA or Kruskal-Wallis.
    """
    if len(groups) < 2:
        # Nothing to compare against.
        return None, None

    is_normal = all(len(g) >= 3 and shapiro(g)[1] > 0.05 for g in groups)

    if len(groups) == 2:
        if is_normal:
            _, p_levene = levene(*groups)
            equal_var = p_levene > 0.05
            _, p = ttest_ind(*groups, equal_var=equal_var)
            return p, ("t-test" if equal_var else "Welch's t-test")
        _, p = mannwhitneyu(*groups)
        return p, "Mann-Whitney U"

    if is_normal:
        _, p = f_oneway(*groups)
        return p, "ANOVA"
    _, p = kruskal(*groups)
    return p, "Kruskal-Wallis"


def bs_res_count(sample: pd.DataFrame, response_col) -> pd.DataFrame:
    """Build a baseline table stratified by ``response_col`` with p-values.

    Rows with a missing response are dropped. Columns (other than the
    response) with at most three distinct non-null values are treated as
    categorical: each level gets a row with ``"n (pct)"`` per response group
    (``pct`` is the within-group percentage, no decimals) and the p-value of
    a one-vs-rest chi-square test without continuity correction (1.0 when the
    contingency table is degenerate). All remaining columns are summarised as
    ``"median (Q1-Q3)"`` per group and compared with the test selected by
    ``_compare_numeric_groups``; they are expected to be numeric. Missing
    values are dropped per column before summarising.

    The input frame is not modified.

    Args:
        sample: Input data including the response column.
        response_col: Label of the grouping (response) column.

    Returns:
        DataFrame with columns ``Variable``, one ``classification_<group>``
        column per response category, ``p-value`` (``'<0.001'`` or a float
        rounded to three decimals; None when fewer than two non-empty groups
        exist for a numeric variable) and ``Test``. Empty, with the same
        columns, when there are no variables to summarise.
    """
    # Work on a copy so the caller's frame keeps its original dtype.
    data = sample.copy()
    data[response_col] = data[response_col].astype('category')
    data = data.dropna(subset=[response_col])
    categories = data[response_col].cat.categories

    cat_col = [
        col for col in data.columns
        if col != response_col and data[col].nunique() <= 3
    ]
    rows = []

    for col in cat_col:
        cat_sample = data.dropna(subset=[col])
        # observed=False pins the "all response categories" layout regardless
        # of the pandas version's default.
        counts = (
            cat_sample.groupby([response_col, col], observed=False)
            .size()
            .unstack(fill_value=0)
        )
        percentages = counts.div(counts.sum(axis=1), axis=0) * 100

        for level in counts.columns:
            row = {'Variable': f"{col}_{level}"}
            for group in counts.index:
                row[f'classification_{group}'] = (
                    f"{counts.loc[group, level]:,} ({percentages.loc[group, level]:.0f})"
                )

            # One-vs-rest table: "is this level" against the response groups.
            table = pd.crosstab(cat_sample[col] == level, cat_sample[response_col])
            if table.shape[0] > 1 and table.shape[1] > 1:
                chi2_p = chi2_contingency(table, correction=False)[1]
            else:
                chi2_p = 1.0

            row['p-value'] = _format_p_value(chi2_p)
            row['Test'] = 'Chi-square'
            rows.append(row)

    for col in data.columns:
        if col == response_col or col in cat_col:
            continue
        num_sample = data.dropna(subset=[col])
        row = {'Variable': col}
        groups = []
        for cat in categories:
            values = num_sample.loc[num_sample[response_col] == cat, col]
            row[f'classification_{cat}'] = (
                f"{values.median():.1f} "
                f"({values.quantile(0.25):.1f}-{values.quantile(0.75):.1f})"
            )
            # Empty groups cannot take part in a statistical test.
            if len(values) > 0:
                groups.append(values)

        p, test_used = _compare_numeric_groups(groups)
        row['p-value'] = _format_p_value(p)
        row['Test'] = test_used
        rows.append(row)

    if not rows:
        empty_cols = (
            ['Variable']
            + [f'classification_{cat}' for cat in categories]
            + ['p-value', 'Test']
        )
        return pd.DataFrame(columns=empty_cols)

    final_results = pd.DataFrame(rows)
    classification_cols = [
        col for col in final_results.columns if col.startswith('classification')
    ]
    return final_results[['Variable'] + classification_cols + ['p-value', 'Test']]

def export_to_excel(df, filename='baseline_table.xlsx'):
    """Save ``df`` as an Excel file at ``filename`` and print a confirmation.

    The frame's index is not written to the sheet.
    """
    df.to_excel(filename, index=False)
    confirmation = f"✅ Excel file saved: {filename}"
    print(confirmation)