-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
60 lines (45 loc) · 2.3 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# The clock is started before the remaining imports are executed, so the
# execution time printed at the end of the script also includes the time
# spent importing them. Keep this ordering if that measurement matters.
import time
start_time = time.time()
print('Start program.')
# Remaining dependencies: os/shutil.which are used for the ffmpeg lookup,
# whisper for loading the model and transcribing, and prettify_txt for
# post-processing the written transcript file.
import os
import whisper
from shutil import which
# NOTE(review): prettify_txt is presumably a module that lives next to this
# script; it is not shown here.
from prettify_txt import prettify_txt
# print(whisper.available_models())
# Fallback location of the ffmpeg executable. It is only consulted when no
# `ffmpeg` command can be found on PATH; adjust it to wherever ffmpeg was
# extracted on this machine.
ffmpeg_path = r"C:\Users\hamza\Downloads\ffmpeg-7.0.1-essentials_build\ffmpeg-7.0.1-essentials_build\bin\ffmpeg.exe"

# Make sure an `ffmpeg` executable is discoverable before any transcription
# work starts. If it is not already on PATH, fall back to ffmpeg_path; if that
# file does not exist either, report the problem and stop with exit status 1.
if not which("ffmpeg"):
    if not os.path.isfile(ffmpeg_path):
        print("FFmpeg not found. Please ensure it is installed and the path is correct.")
        # Raise SystemExit directly instead of calling the bare `exit()`
        # helper: `exit` is injected by the `site` module for interactive
        # sessions and is not guaranteed to exist (e.g. `python -S`).
        raise SystemExit(1)
    # Append the directory that contains the fallback executable so that
    # child processes started later can resolve `ffmpeg` by name.
    os.environ["PATH"] += os.pathsep + os.path.dirname(ffmpeg_path)
# Choose and load the Whisper model.
# No `device` argument is passed to load_model, so device selection is left to
# Whisper's default. NOTE(review): the original comment said "with GPU
# support" - confirm a GPU is actually being used on the target machine.
#
# Model names previously printed by whisper.available_models():
# ['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', 'large-v1', 'large-v2', 'large-v3', 'large']
# NOTE: the `.en` suffix marks the variants fine-tuned on English data; the
# transcribe call in this script requests language="ur", so pick a name
# without that suffix.
model_size = "large"
# Informal observations from earlier runs (presumably total run time on the
# author's test clip - the exact measurement setup is not recorded here):
#   base   - output quality judged too poor to use
#   small  - 22.46 seconds
#   medium - 70.64 seconds
#   large  - 161.56 seconds
model = whisper.load_model(model_size)
# Audio clip to transcribe. An earlier experiment used
# r"C:\Users\hamza\Downloads\test1.mp3" instead.
audio_file = r"C:\Users\hamza\OneDrive - University of Manitoba\Documents\HISHAM\Computer Science\Transcription Project\Whisper Testing\test.mp3"

# Run the transcription with the language fixed to "ur", fp16 switched off and
# word-level timestamps requested.
transcription = model.transcribe(
    audio_file,
    fp16=False,
    language="ur",
    word_timestamps=True,
)

# The output file name records which model size produced the transcript.
output_file_name = f'transcription-{model_size}.txt'

# Save the full transcript text as UTF-8.
with open(output_file_name, mode="w", encoding="utf-8") as out_file:
    out_file.write(transcription["text"])
    # Alternative output format tried earlier: iterate over
    # transcription["segments"] and write one line per segment as
    # "[{start:.2f} - {end:.2f}] {text}" using each segment's "start", "end"
    # and "text" entries.
# Hand the freshly written transcript file to prettify_txt for post-processing
# (the helper is imported from the prettify_txt module; its behaviour is not
# shown in this file).
prettify_txt(output_file_name)

# Report the wall-clock time that has passed since start_time was recorded at
# the top of the script.
elapsed_seconds = time.time() - start_time
print(f"Execution time: {elapsed_seconds:.2f} seconds")