-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanalyse.py
More file actions
120 lines (91 loc) · 4.6 KB
/
analyse.py
File metadata and controls
120 lines (91 loc) · 4.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import os
import numpy as np
np.random.seed(1337)
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten, Dropout, Activation
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.models import load_model
from keras.layers.recurrent import LSTM
from sklearn.metrics import classification_report
import helpers
from constants import GLOVE_DIR, WORD2VEC_TXT, MAX_NB_WORDS, \
MAX_SEQUENCE_LENGTH, VALIDATION_SPLIT, EMBEDDING_DIM, EXCEL_NORMALIZED_DATASET_NAME, RESULTS_DIR
print('Processing text dataset')

# Location of the normalized dataset file; its presence decides whether the
# dataset is rebuilt or simply read back.
normalized_dataset_path = os.path.join(RESULTS_DIR, EXCEL_NORMALIZED_DATASET_NAME)

if not os.path.exists(normalized_dataset_path):
    print('Create normalized dataset with forms...')
    texts_normalized, text_ids, labels, labels_names, labels_index = helpers.make_normalized_dataset()
    # NOTE(review): the file is written under "full_dataset_with_forms" while
    # the existence check above uses EXCEL_NORMALIZED_DATASET_NAME -- confirm
    # that write_dataset_to_file produces the file that is checked for.
    helpers.write_dataset_to_file(labels_names=labels_names,
                                  labels=labels,
                                  text_ids=text_ids,
                                  texts=texts_normalized,
                                  fileName="full_dataset_with_forms")
else:
    file_name = normalized_dataset_path
    print('Read normalized dataset from file {}...'.format(file_name))
    texts_normalized, text_ids, labels, labels_names, labels_index = helpers.read_normalized_dataset(
        file_name=file_name)

# Derive the letter/digit label views once, for both paths above.  The
# three-value unpacking is required: `labels_letters_index` is read later when
# ONLY_LETTERS mode is selected, and the previous two-value unpacking in the
# "create" branch did not match the three-value one used for the "read" path.
labels_letter, labels_digit_names, labels_letters_index = helpers.make_letters_class_map(labels_names)
# Select which label set the classifier is trained on.  Each branch binds
# `labels_for_training` (the per-text class ids) and `labels_to_id_dic`
# (label name -> class id, used later to name the rows of the report).
CURRENT_MODE = helpers.ANALYSIS_MODE.GENERALIZE_LETTERS

if CURRENT_MODE == helpers.ANALYSIS_MODE.GENERALIZE_LETTERS:
    group_labels_ids, group_labels, group_labels_index = helpers.generalize_labels(labels_names)
    labels_for_training = group_labels_ids
    labels_to_id_dic = group_labels_index
elif CURRENT_MODE == helpers.ANALYSIS_MODE.MIXED:
    labels_for_training = labels
    labels_to_id_dic = labels_index
elif CURRENT_MODE == helpers.ANALYSIS_MODE.ONLY_LETTERS:
    labels_for_training = labels_letter
    labels_to_id_dic = labels_letters_index
elif CURRENT_MODE == helpers.ANALYSIS_MODE.ONLY_DIGITS:
    labels_for_training = labels_digit_names
    # Digit labels act as their own class ids; the name is just the string form.
    labels_to_id_dic = {str(i): i for i in set(labels_digit_names)}
else:
    # Fail here rather than with a NameError on `labels_for_training` later.
    raise ValueError('Unsupported analysis mode: {}'.format(CURRENT_MODE))
print('Found %s texts.' % len(texts_normalized))

# The text-format word2vec file is generated only when it is not already
# present under GLOVE_DIR.
word2vec_txt_path = os.path.join(GLOVE_DIR, WORD2VEC_TXT)
word2vec_txt_missing = not os.path.exists(word2vec_txt_path)
if word2vec_txt_missing:
    print('Create txt file from Russian binary word2vec file...')
    helpers.make_txt_word2vec_from_bin()

print('Indexing word vectors.')
# Vectorize the text samples into a 2D integer tensor; the tokenizer's word
# index is kept for building the embedding matrix afterwards.
data, word_index = helpers.tokenize_texts(texts_normalized=texts_normalized)

# One-hot encode the class ids selected for training.
label_ids = np.asarray(labels_for_training)
labels = to_categorical(label_ids)

# The second axis of the one-hot matrix is the number of distinct classes.
class_num = labels.shape[1]
print("Number of distinct classes: {}".format(class_num))

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
# Split the data into a training set and a validation set.
# Samples and labels are shuffled with the same permutation so that rows stay
# paired; the last `nb_validation_samples` rows become the validation set.
sample_count = data.shape[0]
indices = np.arange(sample_count)
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

nb_validation_samples = int(VALIDATION_SPLIT * sample_count)

# Slice at an explicit index instead of with negative offsets: when
# nb_validation_samples is 0, `data[:-0]` is empty and `data[-0:]` is the
# whole array, which would silently swap the two sets.  The clamp keeps the
# index non-negative if the split fraction exceeds 1.
split_at = max(sample_count - nb_validation_samples, 0)
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]
print('Preparing embedding matrix.')
nb_words, embedding_matrix = helpers.create_embedding_matrix(
    word_index=word_index,
    texts_normalized=texts_normalized,
)

print('Training model.')
# The network is an LSTM; the CNN variant takes the same arguments and can be
# swapped in by using the commented call instead:
# sequence_input, preds = helpers.construct_cnn(nb_words=nb_words, class_num=class_num, embedding_matrix=embedding_matrix)
sequence_input, preds = helpers.construct_lstm(
    nb_words=nb_words,
    class_num=class_num,
    embedding_matrix=embedding_matrix,
)

# Wire the input and output tensors into a trainable model.
model = Model(sequence_input, preds)
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)
NUMBER_OF_EPOCHS = 5

# The epoch count is part of the file name, so runs with different settings
# do not overwrite each other.
model_path = 'results/russian_{}_ep.h5'.format(NUMBER_OF_EPOCHS)

# happy learning!
# NOTE(review): `nb_epoch` is the Keras 1 spelling of this argument; Keras 2
# renamed it to `epochs`.  Kept as-is to match the Keras version this file's
# imports target.
model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    nb_epoch=NUMBER_OF_EPOCHS,
    batch_size=200,
)

model.save(model_path)
print('Saved model.')

# Read the model back from disk so the code that follows runs on the
# persisted copy.
model = load_model(model_path)
print('Loaded model.')
# Invert the name -> id mapping into a list indexed by class id, so that
# position i holds the display name of class i.  Slots that no id maps to keep
# the placeholder 0.
label_list = [0] * len(labels_to_id_dic)
for label_name, label_id in labels_to_id_dic.items():
    try:
        label_list[label_id] = label_name
    except IndexError:
        # The id does not fit in the table; report it and carry on.
        print(label_id, label_name)

# Predicted class probabilities for the validation samples.
y_est = model.predict(x_val)

# Pass the class ids explicitly: without `labels`, scikit-learn matches
# `target_names` positionally to the classes that actually occur in the
# validation data, so a class missing from the fold would shift every name
# after it (or raise on the length mismatch).  Names are stringified because
# unfilled slots still hold the integer placeholder.
print(classification_report(np.argmax(y_val, axis=1),
                            np.argmax(y_est, axis=1),
                            labels=list(range(len(label_list))),
                            target_names=[str(name) for name in label_list]))