forked from AI4Bharat/IndicWav2Vec
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_report_from_manifest.py
executable file
·64 lines (51 loc) · 2 KB
/
generate_report_from_manifest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# taken from https://github.com/Open-Speech-EkStep/vakyansh-wav2vec2-experimentation/blob/main/utils/analysis/generate_wav_report_from_tsv.py
import pandas as pd
import numpy as np
import argparse
def generate_report(tsv_file, sample_rate=16000):
    """Print duration statistics for the audio files listed in a TSV manifest.

    The manifest is read as tab-separated text. Its first line is consumed as
    the header row and discarded (presumably the root-directory line of a
    fairseq-style manifest -- confirm against whatever produces the file).
    Every following line supplies a file name and that file's length in
    frames; the two columns are named ``file`` and ``frames``.

    The report is written to stdout and contains: the number of files, the
    total duration in hours, the min / max / mean duration in seconds, and the
    count and total duration of files no longer than 1 second and no longer
    than 5 seconds.

    Args:
        tsv_file: Path of the manifest to analyse.
        sample_rate: Frames per second, used to convert frame counts into
            seconds. Defaults to 16000, the value that used to be hard-coded.

    Raises:
        ValueError: If ``sample_rate`` is not a positive number.
    """
    if sample_rate <= 0:
        raise ValueError("sample_rate must be positive, got {}".format(sample_rate))

    df = pd.read_csv(tsv_file, sep="\t", header=0, names=["file", "frames"])

    print("=" * 100)
    print("Total number of files in ", tsv_file, ":", len(df))

    if df.empty:
        # min()/max() are undefined without rows; report a zero total and stop
        # instead of crashing halfway through the report.
        print("Total duration in hours: ", 0.0)
        print("=" * 100)
        return

    # Column-wise division converts every frame count to seconds in one step.
    df["duration_in_sec"] = df["frames"] / sample_rate
    durations = df["duration_in_sec"]

    print("Total duration in hours: ", float(durations.sum()) / 3600)
    print("Min duration of a file in seconds:", float(durations.min()))
    print("Max duration of a file in seconds:", float(durations.max()))
    print("Mean duration across files in seconds:", float(durations.mean()))

    # The same two-line summary is printed for each "short file" threshold,
    # with a row of asterisks separating consecutive thresholds.
    for position, limit_in_sec in enumerate((1, 5)):
        if position:
            print("*" * 100)
        short_files = durations[durations <= limit_in_sec]
        print(
            "Number of files with duration less than/equal to {} seconds:".format(
                limit_in_sec
            ),
            len(short_files),
        )
        print(
            "Total duration of files less than/equal to {} seconds:".format(
                limit_in_sec
            ),
            float(short_files.sum()) / 3600,
            " hours",
        )
    print("=" * 100)
if __name__ == "__main__":
    # Command-line entry point: print the duration report for the manifest
    # given via --tsv.
    parser = argparse.ArgumentParser(description="Run")
    parser.add_argument(
        "--tsv",
        type=str,
        # Without a path there is nothing to analyse; let argparse print a
        # usage error rather than handing None to the report function.
        required=True,
        help="tsv file to analyse",
    )
    args = parser.parse_args()
    generate_report(args.tsv)