forked from aclew/Yunitator
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathyunified.py
More file actions
234 lines (198 loc) · 9.3 KB
/
yunified.py
File metadata and controls
234 lines (198 loc) · 9.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
# Unified script for Noisemes, Yunitator, and TALNet
#
#
# Usage: yunified.py SCRIPT INPUT_DIR HTK_CHUNKSIZE [MODE]
#
# SCRIPT: which script to run [yunitator, noisemes]
# INPUT_DIR: the input directory (eg. /vagrant/data)
# HTK_CHUNKSIZE: the number of frames to use for each chunk (10 frames per second)
# MODE: which Yunitator model to use [english, universal, old]; defaults to old (yunitator only)
# ---------------------------------------------------------------------
# -------------------------- PATH VARIABLES ---------------------------
# ---------------------------------------------------------------------
# Location of the RNN implementation used by the noisemes predictor (added to sys.path below).
RNNPATH='OpenSAT/SSSF/code/predict/RNN'
# Location of the coconut toolkit, which provides the fileutils/HTK readers imported below.
TOOLSPATH="G/coconut"
# Pickled per-feature scaling factors (w, b) applied after the noisemes PCA projection.
SCALINGFACTORS='OpenSAT/SSSF/code/predict/model/noiseme.old/scale.pkl'
# Text file listing one noiseme class name per line.
NOISEMES_CLASSES='OpenSAT/noisemeclasses_sum.txt'
# ---------------------------------------------------------------------
# ------------------------------ PACKAGES -----------------------------
# ---------------------------------------------------------------------
import os
import sys
import torch
from torch.nn.utils.rnn import pack_padded_sequence
from torch.autograd import Variable
import _pickle as cPickle
import pickle
import numpy
from scipy.io import savemat
# Make the coconut toolkit importable (provides smart_open and the HTK reader).
sys.path.append(os.path.expanduser(TOOLSPATH))
from fileutils import smart_open
from fileutils.htk import readHtk
# Make the noisemes RNN implementation importable.
sys.path.append(os.path.expanduser(RNNPATH))
from RNN import RNN
from Yunitator.Net import Net
import warnings # Not safe at all, but it's to discard
# Silences all warnings, notably:
# ComplexWarning: Casting complex values to real discards the imaginary part
# that occurs when applying the LDA (some coefficients of the matrix can have
# an imaginary part). NOTE(review): this is a blanket filter and will also
# hide unrelated warnings.
warnings.filterwarnings('ignore')
# ---------------------------------------------------------------------
# --------------------------- BEGIN SCRIPT ----------------------------
# ---------------------------------------------------------------------
# Script options
YUNITATOR = 'yunitator'
NOISEMES = 'noisemes'
ENGLISH = 'english'
UNIVERSAL = 'universal'
OLD = 'old'

# Command-line arguments: SCRIPT INPUT_DIR HTK_CHUNKSIZE [MODE]
try:
    SCRIPT = sys.argv[1]                 # which predictor to run [yunitator, noisemes]
    INPUT_DIR = sys.argv[2].rstrip('/')  # data directory (eg. /vagrant/data)
    HTK_CHUNKSIZE = int(sys.argv[3])     # frames per chunk (10 frames per second)
    # Optional fourth argument: Yunitator model mode; only used when SCRIPT == YUNITATOR.
    MODE = sys.argv[4] if len(sys.argv) > 4 else OLD
except (IndexError, ValueError):
    # Too few arguments, or a non-integer chunk size. Report on stderr and
    # exit with a non-zero status so calling shell scripts can detect failure
    # (the original used exit() which returns status 0 even on error).
    print("Usage: yunified.py SCRIPT INPUT_DIR HTK_CHUNKSIZE [MODE]", file=sys.stderr)
    sys.exit(1)
# Pick the dimensionality-reduction matrix and the trained model weights
# matching the requested predictor (and, for Yunitator, the requested mode).
if SCRIPT == YUNITATOR:
    # MODE -> (reduction matrix, model checkpoint)
    _yuni_assets = {
        OLD: ('Yunitator/reductions/pca-old.pkl',           # PCA of size 50
              'Yunitator/models/model-old.pt'),
        ENGLISH: ('Yunitator/reductions/pca-english.pkl',   # PCA of size 150
                  'Yunitator/models/model-english.pt'),
        UNIVERSAL: ('Yunitator/reductions/lda-universal.pkl',  # LDA of size 150
                    'Yunitator/models/model-universal.pt'),
    }
    # An unrecognised MODE leaves TRANSFMATRIX/NNET unset, as before.
    if MODE in _yuni_assets:
        TRANSFMATRIX, NNET = _yuni_assets[MODE]
elif SCRIPT == NOISEMES:
    TRANSFMATRIX = 'OpenSAT/SSSF/code/predict/model/noiseme.old/pca.pkl'
    NNET = 'OpenSAT/SSSF/code/predict/model/noiseme.old/net.pkl.gz'
# Prepare input/output directories for the chosen predictor.
if SCRIPT == YUNITATOR:
    # Yunitator reads its features from, and writes its output to, the same
    # temporary sub-directory.
    INPUT_DIR += "/Yunitemp"
    OUTPUT_DIR = INPUT_DIR
elif SCRIPT == NOISEMES:
    OUTPUT_DIR = INPUT_DIR + "/hyp_sum"
    INPUT_DIR += "/feature"
# exist_ok avoids the check-then-create race of the original
# os.path.exists() + os.makedirs() pair.
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Build the network for the chosen predictor and load its trained weights.
if SCRIPT == YUNITATOR:
    # Per-mode constructor arguments for Net; the final 4 presumably matches
    # the four Yunitator class names defined below.
    _net_shapes = {
        OLD: (50, 200, 1, 4),
        UNIVERSAL: (150, 300, 2, 4),
        ENGLISH: (150, 200, 1, 4),
    }
    # An unrecognised MODE leaves `net` unset, as before.
    if MODE in _net_shapes:
        net = Net(*_net_shapes[MODE])  # CPU only (.cuda() deliberately not called)
    # map_location keeps the checkpoint tensors on CPU regardless of where
    # they were saved.
    net.load_state_dict(torch.load(NNET, map_location = lambda storage, loc: storage))
elif SCRIPT == NOISEMES:
    net = RNN(filename=os.path.expanduser(NNET))
# Get class names for the chosen predictor.
if SCRIPT == YUNITATOR:
    class_names = ['SIL', 'CHI', 'MAL', 'FEM']
elif SCRIPT == NOISEMES:
    # noisemeclasses_sum.txt contains class name strings, one per line.
    # Use a context manager so the file handle is closed (the original
    # opened it without ever closing).
    with open(NOISEMES_CLASSES, encoding='utf-8') as classfile:
        class_names = [line.rstrip('\n') for line in classfile]
# Load the dimensionality-reduction matrix and build `reductor`, a function
# mapping a raw HTK feature matrix (frames x dims) to the reduced feature
# space the network was trained on.
if SCRIPT == YUNITATOR:
    with open(TRANSFMATRIX, 'rb') as f:
        # The old matrix needs latin1 decoding (the usual fix for pickles
        # written by Python 2 containing numpy arrays); english/universal load
        # with the default ASCII encoding.
        if MODE == ENGLISH or MODE == UNIVERSAL:
            data = pickle.load(f, encoding="ASCII")
        elif MODE == OLD:
            data = cPickle.load(f, encoding="latin1")
    if MODE == OLD or MODE == ENGLISH:
        # PCA: select columns (mask), standardise (mu, sigma), project (V),
        # then rescale (w, b).
        mask, mu, sigma, V, w, b = data['mask'], data['mu'], data['sigma'], data['V'], data['w'], data['b']
        reductor = lambda feat: ((feat[:, mask] - mu) / sigma).dot(V) * w + b
    elif MODE == UNIVERSAL:
        # LDA: project first, then select columns and standardise.
        mask, V, mu, sigma = data['mask'], data['V'], data['mu'], data['sigma']
        reductor = lambda feat: (feat.dot(V)[:, mask] - mu) / sigma
elif SCRIPT == NOISEMES:
    with open(os.path.expanduser(TRANSFMATRIX), 'rb') as f:
        # HACK: injects mask/mu/sigma/V (and whatever else the pickle holds)
        # straight into module globals — locals() is globals() at module
        # level, so this works here, but it is fragile and opaque.
        locals().update(cPickle.load(f, encoding="latin1"))
    with open(os.path.expanduser(SCALINGFACTORS), 'rb') as f:
        w, b = cPickle.load(f, encoding="latin1")
    reductor = lambda feat: ((feat[:, mask] - mu) / sigma).dot(V) * w + b
# These are chunking parameters.
# ex: HTK_chunksize = 2000
# For noisemes, 30 extra frames (preSamples) are passed to readHtk —
# presumably context frames shared between consecutive chunks; the main loop
# below skips the trailing preSamples frames when writing output.
# NOTE(review): indentation of this loop was reconstructed from the logic;
# in particular the placement of `new_t = time_frame[t]` and `last_t = new_t`
# should be confirmed against the original file.
if SCRIPT == YUNITATOR:
    preSamples = 0
elif SCRIPT == NOISEMES:
    preSamples = 30

# Main loop: run the selected predictor over every .htk feature file in
# INPUT_DIR, chunk by chunk, appending results to the per-file output.
# NOTE(review): `file` shadows a builtin name.
for file in os.listdir(INPUT_DIR):
    # Load input feature and predict
    filename, extension = os.path.splitext(os.path.split(file)[1])
    if extension != ".htk":
        print("Ignoring "+os.path.join(INPUT_DIR, file))
        continue
    conf = {}  # noisemes: filename -> per-frame class confidence matrix
    # noisemes needs a variable to remember the last timestep when chunking
    last_t = 0
    chunks = 0  # number of chunks already processed for this file
    for feat in readHtk(os.path.join(INPUT_DIR,file), HTK_CHUNKSIZE, preSamples):
        if SCRIPT == YUNITATOR:
            feature = reductor(feat)
            # Batch of one sequence. Variable is a legacy autograd wrapper;
            # NOTE(review): `input` shadows the builtin of the same name.
            input = Variable(torch.from_numpy(numpy.expand_dims(feature, 0).astype('float32')))
            input = pack_padded_sequence(input, [len(feature)], batch_first = True)
            output = net(input).data.data.cpu().numpy()
            # One-hot encode the per-frame argmax class.
            output = output == output.max(axis = 1, keepdims = True)
            # Pad a False column on each side so every active run of a class
            # has a detectable rising and falling edge, then find the edges.
            z = numpy.zeros((len(class_names), 1), dtype = 'bool')
            output = numpy.hstack([z, output.T, z])
            cls_ids, starts = (~output[:, :-1] & output[:, 1:]).nonzero()
            _, ends = (output[:, :-1] & ~output[:, 1:]).nonzero()
            with open(OUTPUT_DIR+"/"+filename+".rttm.sorted", 'a') as f:
                #print("Feature shape "+str(feat.shape))
                for cls, start, end in zip(cls_ids, starts, ends):
                    # NOTE(review): leftover debug print to stdout.
                    print("%s\t%s\t%s" % (cls,start,end))
                    # Frames are 0.1 s each; start is offset by the frames of
                    # the chunks already written for this file.
                    f.write("SPEAKER {} 1 {:.1f} {:.1f} <NA> <NA> {} <NA> <NA>\n".format(
                        filename+".rttm", (start+(chunks*HTK_CHUNKSIZE)) * 0.1, (end - start) * 0.1, class_names[cls]))
            chunks += 1
        elif SCRIPT == NOISEMES:
            feature = reductor(feat).astype('float32')
            x = feature.reshape((1,) + feature.shape)  # add batch dimension
            m = numpy.ones(x.shape[:-1], dtype='int32')  # mask: all frames valid
            conf[filename] = net.predict(x, m)[0]
            # Save predictions (re-written on every chunk with the latest values)
            with smart_open(os.path.join(OUTPUT_DIR, filename + '.confidence.pkl.gz'), 'wb') as f:
                cPickle.dump(conf, f)
            savemat(os.path.join(OUTPUT_DIR, filename + '.confidence.mat'), conf)
            result_ = conf[filename]
            # Add classes 1 and 2 (speech english and speech non english)
            # to create a class " Speech "
            result = numpy.zeros((result_.shape[0], result_.shape[1] - 1))
            result[:, 0] = result_[:, 0]
            result[:, 1] = result_[:, 1] + result_[:, 2]
            result[:, 2:] = result_[:, 3:]
            # Output RTTM
            most_likely = result.argmax(axis=1)
            # NOTE(review): `confidence` is computed but never used below.
            confidence = result.max(axis=1)
            length_sample = len(most_likely)
            time_frame = numpy.arange(0, length_sample) * 0.1  # 10 frames per second
            with open(os.path.join(OUTPUT_DIR, filename + ".lab"), "a") as lab:
                t_start = time_frame[0]+last_t
                new_t = 0
                # Skip the trailing preSamples frames — presumably the overlap
                # belonging to the next chunk (TODO confirm).
                for t in range(length_sample-preSamples):
                    # Add the last timestep if this is the first of the chunk.
                    time_frame[t] += last_t
                    # If current frame is from a different class then previous frame,
                    # then write class that ended at last frame (t-1 !).
                    # Also avoid problem: for t=0, most_likely[t-1] = most_likely[-1]
                    # which is the last item of the vector !
                    if t == 0:
                        continue
                    # write class only if it correspond to "speech" !
                    if most_likely[t] != most_likely[t-1]:
                        # get duration of class
                        t_end = time_frame[t]
                        # Choose rttm type label for speech and everything else. This may need to change to fit convention
                        type_label = "SPEAKER" if class_names[most_likely[t-1]] == 'speech' else "NON-SPEECH"
                        # NOTE(review): the 0.1*chunks offset shifts times by
                        # only 0.1 s per chunk, unlike the yunitator branch
                        # which offsets by a full chunk length — confirm this
                        # is intentional.
                        lab.write(u"{:.1f} {:.1f} {} {}\n".format(t_start+(0.1*chunks), t_end+(0.1*chunks), class_names[most_likely[t-1]], type_label))
                        t_start = time_frame[t]
                    # Track the last timestamp seen in this chunk so the next
                    # chunk's times continue from it.
                    new_t = time_frame[t]
            last_t = new_t
            chunks += 1