-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathnormalizing_data.py
More file actions
92 lines (73 loc) · 3.84 KB
/
Copy pathnormalizing_data.py
File metadata and controls
92 lines (73 loc) · 3.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
"""
@author: Tobias Carryer
"""
import numpy as np
from sklearn.preprocessing import MinMaxScaler
def create_min_max_scalers(subject_data):
    """
    Build and fit one MinMaxScaler per feature column.

    A separate scaler is kept for each feature (rather than one scaler for all of them) so that an individual
    scaler can be reused outside of this project.

    :param subject_data: DataFrame containing the columns "BtO2", "HR", "SpO2" and "artMAP".
    :return: Dictionary. Key: String, name of the feature. Value: MinMaxScaler fitted on that feature's column.
    """
    feature_names = ("BtO2", "HR", "SpO2", "artMAP")

    fitted_scalers = {}
    for feature_name in feature_names:
        # The scaler is fitted on a 2D array, so the column is reshaped into a single-column matrix.
        column_values = np.array(subject_data[feature_name].tolist()).reshape(-1, 1)
        feature_scaler = MinMaxScaler()
        feature_scaler.fit(column_values)
        fitted_scalers[feature_name] = feature_scaler
    return fitted_scalers
def identify_extreme_subjects(all_data):
    """
    Find the subjects that own the largest and smallest value of each feature.

    For every feature ("BtO2", "HR", "SpO2", "artMAP") the first row holding the column maximum and the first row
    holding the column minimum are looked up. The original author's intent is that these observations go into the
    training set, so that the normalized CV and test sets stay inside the range defined by the extremes.

    :param all_data: DataFrame with the four feature columns plus "subject_id" and "time" columns.
    :return: Dictionary. Key: Subject number (int) of a subject owning at least one extreme value. Value: The latest
    "time" among that subject's extreme rows.
    """
    features = ("BtO2", "HR", "SpO2", "artMAP")

    def first_extreme_row(column, want_max):
        # Several rows may share the extreme value; only the first one is used.
        series = all_data[column]
        target = series.max() if want_max else series.min()
        return all_data.loc[series == target].iloc[0]

    latest_time_by_subject = {}
    # All maxima are handled before all minima so that the keys are inserted in the same order as before.
    for want_max in (True, False):
        for feature in features:
            row = first_extreme_row(feature, want_max)
            # str.lstrip removes any leading characters found in the given set (it is not a prefix removal).
            # Digits are not part of that set, so the numeric part of an id such as "confocal_12" is kept.
            subject_number = int(row["subject_id"].lstrip("confocal_"))
            if subject_number not in latest_time_by_subject:
                latest_time_by_subject[subject_number] = row["time"]
                continue
            # The subject already owns another extreme: keep the later of the two times.
            latest_time_by_subject[subject_number] = max(row["time"], latest_time_by_subject[subject_number])
    return latest_time_by_subject
def normalize(data, min_max_scalers):
    """
    Normalizes the given feature columns of data in place using already-fitted scalers.

    Some subjects' data span different ranges than others' so all the data is normalized together, independent of
    what subject contributed the data point. This way, the model can learn when a subject's data is larger than the
    others'.

    NaN entries are left untouched. A feature column with no non-NaN value (or an empty DataFrame) is skipped
    instead of being handed to the scaler as a zero-row array.

    :param data: DataFrame. Modified in place; must contain a column for every key of min_max_scalers.
    :param min_max_scalers: Dictionary. Key: String, feature to scale. Value: fitted scaler exposing
    transform(array of shape (n, 1)), e.g. a MinMaxScaler.
    :return: The same DataFrame object that was passed in, with the feature columns normalized.
    """
    for feature, scaler in min_max_scalers.items():
        has_value = ~data[feature].isnull()
        if not has_value.any():
            # Nothing to scale in this column; scalers reject zero-row input, so leave the column as it is.
            continue

        to_transform = np.array(data[feature].loc[has_value].tolist()).reshape(-1, 1)
        # transform() returns one column per input column, i.e. an (n, 1) array here. Flatten it so that a plain
        # 1D array is assigned to the single column selected below.
        normalized = np.asarray(scaler.transform(to_transform)).ravel()
        data.loc[has_value, feature] = normalized
    return data