# GenreFeatureData.py
# forked from ruohoruotsi/LSTM-Music-Genre-Classification
import librosa
import math
import os
import re
import numpy as np
class GenreFeatureData:
    """Load GTZAN audio, extract per-frame features, and cache them as .npy files.

    Each track is represented as a (timeseries_length, 33) matrix:
    13 MFCCs + 1 spectral centroid + 12 chroma bins + 7 spectral-contrast bands.
    Targets are one-hot vectors whose column order follows ``genre_list``.
    """

    # STFT hop size in samples; assigned in __init__ (512 @ sr 22050).
    hop_length = None

    # Closed label set; one-hot encoding indexes into this list.
    genre_list = [
        "classical",
        "country",
        "disco",
        "hiphop",
        "jazz",
        "metal",
        "pop",
        "reggae",
    ]

    dir_trainfolder = "./gtzan/_train"
    dir_devfolder = "./gtzan/_validation"
    dir_testfolder = "./gtzan/_test"
    dir_all_files = "./gtzan"

    # On-disk cache locations for the preprocessed feature/target arrays.
    train_X_preprocessed_data = "./gtzan/data_train_input.npy"
    train_Y_preprocessed_data = "./gtzan/data_train_target.npy"
    dev_X_preprocessed_data = "./gtzan/data_validation_input.npy"
    dev_Y_preprocessed_data = "./gtzan/data_validation_target.npy"
    test_X_preprocessed_data = "./gtzan/data_test_input.npy"
    test_Y_preprocessed_data = "./gtzan/data_test_target.npy"

    train_X = train_Y = None
    dev_X = dev_Y = None
    test_X = test_Y = None

    def __init__(self):
        self.hop_length = 512
        self.timeseries_length_list = []

        self.trainfiles_list = self.path_to_audiofiles(self.dir_trainfolder)
        self.devfiles_list = self.path_to_audiofiles(self.dir_devfolder)
        self.testfiles_list = self.path_to_audiofiles(self.dir_testfolder)

        self.all_files_list = []
        self.all_files_list.extend(self.trainfiles_list)
        self.all_files_list.extend(self.devfiles_list)
        self.all_files_list.extend(self.testfiles_list)

        # Computing the true minimum sequence length over the corpus is slow;
        # the cached, pre-computed value was 1290 frames.  Uncomment to recompute:
        # self.precompute_min_timeseries_len()
        # print("min(self.timeseries_length_list) ==" + str(min(self.timeseries_length_list)))
        # self.timeseries_length = min(self.timeseries_length_list)

        # sequence length == 128; default fftsize == 2048 & hop == 512 @ SR of 22050
        # equals 128 overlapped windows covering approx ~3.065 s of audio -- a bit
        # small, but kept for compatibility with previously cached .npy files.
        self.timeseries_length = 128

    def load_preprocess_data(self):
        """Extract features for all three splits and save them as .npy caches."""
        # BUGFIX: previously printed len(self.timeseries_length_list), which is
        # empty unless precompute_min_timeseries_len() ran, so it always showed 0.
        print("[DEBUG] total number of files: " + str(len(self.all_files_list)))

        # Training set
        self.train_X, self.train_Y = self.extract_audio_features(self.trainfiles_list)
        with open(self.train_X_preprocessed_data, "wb") as f:
            np.save(f, self.train_X)
        with open(self.train_Y_preprocessed_data, "wb") as f:
            self.train_Y = self.one_hot(self.train_Y)
            np.save(f, self.train_Y)

        # Validation set
        self.dev_X, self.dev_Y = self.extract_audio_features(self.devfiles_list)
        with open(self.dev_X_preprocessed_data, "wb") as f:
            np.save(f, self.dev_X)
        with open(self.dev_Y_preprocessed_data, "wb") as f:
            self.dev_Y = self.one_hot(self.dev_Y)
            np.save(f, self.dev_Y)

        # Test set
        self.test_X, self.test_Y = self.extract_audio_features(self.testfiles_list)
        with open(self.test_X_preprocessed_data, "wb") as f:
            np.save(f, self.test_X)
        with open(self.test_Y_preprocessed_data, "wb") as f:
            self.test_Y = self.one_hot(self.test_Y)
            np.save(f, self.test_Y)

    def load_deserialize_data(self):
        """Load previously cached feature/target arrays produced by load_preprocess_data."""
        self.train_X = np.load(self.train_X_preprocessed_data)
        self.train_Y = np.load(self.train_Y_preprocessed_data)
        self.dev_X = np.load(self.dev_X_preprocessed_data)
        self.dev_Y = np.load(self.dev_Y_preprocessed_data)
        self.test_X = np.load(self.test_X_preprocessed_data)
        self.test_Y = np.load(self.test_Y_preprocessed_data)

    def precompute_min_timeseries_len(self):
        """Record each track's frame count so the minimum sequence length can be found."""
        for file in self.all_files_list:
            print("Loading " + str(file))
            y, sr = librosa.load(file)
            self.timeseries_length_list.append(math.ceil(len(y) / self.hop_length))

    def extract_audio_features(self, list_of_audiofiles):
        """Compute the 33-dim per-frame feature tensor for each audio file.

        Returns:
            data:   (N, timeseries_length, 33) float64 feature tensor
            target: (N, 1) array of genre strings parsed from the filenames
        """
        data = np.zeros(
            (len(list_of_audiofiles), self.timeseries_length, 33), dtype=np.float64
        )
        target = []
        for i, file in enumerate(list_of_audiofiles):
            y, sr = librosa.load(file)
            mfcc = librosa.feature.mfcc(
                y=y, sr=sr, hop_length=self.hop_length, n_mfcc=13
            )
            spectral_center = librosa.feature.spectral_centroid(
                y=y, sr=sr, hop_length=self.hop_length
            )
            chroma = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=self.hop_length)
            spectral_contrast = librosa.feature.spectral_contrast(
                y=y, sr=sr, hop_length=self.hop_length
            )

            # GTZAN files are named "<genre>.<nnnnn>.au"; take the genre from the
            # basename so parsing no longer depends on the exact directory depth
            # (the old re.split chain assumed precisely "./gtzan/<subdir>/...").
            genre = os.path.basename(file).split(".")[0]
            target.append(genre)

            # Truncate every track to its first timeseries_length frames.
            data[i, :, 0:13] = mfcc.T[0:self.timeseries_length, :]
            data[i, :, 13:14] = spectral_center.T[0:self.timeseries_length, :]
            data[i, :, 14:26] = chroma.T[0:self.timeseries_length, :]
            data[i, :, 26:33] = spectral_contrast.T[0:self.timeseries_length, :]

            print(
                "Extracted features audio track %i of %i."
                % (i + 1, len(list_of_audiofiles))
            )
        return data, np.expand_dims(np.asarray(target), axis=1)

    def one_hot(self, Y_genre_strings):
        """Convert an (N, 1) array of genre strings to an (N, len(genre_list)) one-hot matrix."""
        y_one_hot = np.zeros((Y_genre_strings.shape[0], len(self.genre_list)))
        for i, genre_string in enumerate(Y_genre_strings):
            # Rows arrive as shape-(1,) string arrays; flatten to a plain string
            # before list.index, instead of relying on numpy's single-element
            # array truthiness inside the comparison (deprecated behavior).
            label = str(np.asarray(genre_string).reshape(-1)[0])
            index = self.genre_list.index(label)
            y_one_hot[i, index] = 1
        return y_one_hot

    @staticmethod
    def path_to_audiofiles(dir_folder):
        """Return sorted paths of all .au files directly inside dir_folder.

        Sorting makes the dataset ordering deterministic; os.listdir alone
        returns files in arbitrary, filesystem-dependent order.
        """
        list_of_audio = []
        for file in sorted(os.listdir(dir_folder)):
            if file.endswith(".au"):
                list_of_audio.append(os.path.join(dir_folder, file))
        return list_of_audio