Skip to content

Commit

Permalink
Adding UI improvements to the subtitle merging (#1033)
Browse files Browse the repository at this point in the history
  • Loading branch information
raivisdejus authored Dec 29, 2024
1 parent 6037287 commit 9d5ec9c
Show file tree
Hide file tree
Showing 7 changed files with 120 additions and 61 deletions.
2 changes: 1 addition & 1 deletion buzz/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
VERSION = "1.2.1"
VERSION = "1.3.0"
162 changes: 111 additions & 51 deletions buzz/widgets/transcription_viewer/transcription_resizer_widget.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import re
import os
import logging
import stable_whisper

import srt
from srt_equalizer import srt_equalizer
from typing import Optional
from PyQt6.QtCore import Qt, pyqtSignal
from PyQt6.QtCore import Qt, QThread, QObject, pyqtSignal
from PyQt6.QtGui import QFont
from PyQt6.QtWidgets import (
QWidget,
Expand Down Expand Up @@ -34,6 +35,69 @@

SENTENCE_END = re.compile(r'.*[.!?。!?]')

class TranscriptionWorker(QObject):
    """Background worker that regroups a transcription's word-level segments
    into new subtitle segments via stable-whisper.

    Designed for the QThread worker-object pattern: move the instance to a
    QThread, connect the thread's ``started`` signal to :meth:`run`.  Emits
    ``result_ready`` with the regrouped list of ``Segment`` objects, then
    ``finished`` so the owning thread can quit.
    """

    finished = pyqtSignal()
    result_ready = pyqtSignal(list)

    def __init__(self, transcription, transcription_service, regroup_string: str):
        """
        :param transcription: transcription record; provides ``id_as_uuid``,
            ``language`` and ``file`` (original audio path).
        :param transcription_service: service used to load stored segments.
        :param regroup_string: stable-whisper regroup algorithm string
            (e.g. ``"mg=0.2_sp=..."``) controlling how words are merged/split.
        """
        super().__init__()
        self.transcription = transcription
        self.transcription_service = transcription_service
        self.regroup_string = regroup_string

    def get_transcript(self, audio, **kwargs) -> dict:
        """Build a stable-whisper-compatible transcript dict from the stored
        Buzz segments.

        Stored times are in centiseconds and are converted to seconds.
        The ``audio``/``kwargs`` parameters are required by the
        ``stable_whisper.transcribe_any`` callback signature and are unused.
        """
        buzz_segments = self.transcription_service.get_transcription_segments(
            transcription_id=self.transcription.id_as_uuid
        )

        segments = []
        words = []
        text = ""
        for buzz_segment in buzz_segments:
            words.append({
                'word': buzz_segment.text + " ",
                'start': buzz_segment.start_time / 100,  # centiseconds -> seconds
                'end': buzz_segment.end_time / 100,
            })
            text += buzz_segment.text + " "

            # Close out a segment whenever the accumulated text ends a sentence.
            if SENTENCE_END.match(buzz_segment.text):
                segments.append({
                    'text': text,
                    'words': words
                })
                words = []
                text = ""

        # Flush any trailing words that did not end with sentence punctuation,
        # so the tail of the transcript is not silently dropped.
        if words:
            segments.append({
                'text': text,
                'words': words
            })

        return {
            'language': self.transcription.language,
            'segments': segments
        }

    def run(self):
        """Regroup the transcript and emit the resulting segments.

        Silence analysis (``vad`` / ``suppress_silence``) is enabled only when
        the original audio file still exists on disk.
        """
        result = stable_whisper.transcribe_any(
            self.get_transcript,
            self.transcription.file,
            vad=os.path.exists(self.transcription.file),
            suppress_silence=os.path.exists(self.transcription.file),
            regroup=self.regroup_string,
            check_sorted=False,
        )

        # Convert stable-whisper results (seconds) back to Buzz Segment
        # objects (centiseconds).
        segments = []
        for segment in result.segments:
            segments.append(
                Segment(
                    start=int(segment.start * 100),
                    end=int(segment.end * 100),
                    text=segment.text
                )
            )

        self.result_ready.emit(segments)
        self.finished.emit()


class TranscriptionResizerWidget(QWidget):
resize_button_clicked = pyqtSignal()
transcription: Transcription
Expand All @@ -52,6 +116,10 @@ def __init__(
self.transcription_service = transcription_service
self.transcriptions_updated_signal = transcriptions_updated_signal

self.new_transcript_id = None
self.thread = None
self.worker = None

self.setMinimumWidth(600)
self.setMinimumHeight(300)

Expand Down Expand Up @@ -119,7 +187,7 @@ def __init__(
self.merge_by_gap = QCheckBox(_("Merge by gap"))
self.merge_by_gap.setChecked(True)
self.merge_by_gap.setMinimumWidth(250)
self.merge_by_gap_input = LineEdit("0.1", self)
self.merge_by_gap_input = LineEdit("0.2", self)
merge_by_gap_layout = QHBoxLayout()
merge_by_gap_layout.addWidget(self.merge_by_gap)
merge_by_gap_layout.addWidget(self.merge_by_gap_input)
Expand Down Expand Up @@ -207,41 +275,22 @@ def on_resize_button_clicked(self):
if self.transcriptions_updated_signal:
self.transcriptions_updated_signal.emit(new_transcript_id)

def get_transcript(self, audio, **kwargs) -> dict:
buzz_segments = self.transcription_service.get_transcription_segments(
transcription_id=self.transcription.id_as_uuid
def on_merge_button_clicked(self):
self.new_transcript_id = self.transcription_service.copy_transcription(
self.transcription.id_as_uuid
)
self.transcription_service.update_transcription_progress(self.new_transcript_id, 0.0)

segments = []
words = []
text = ""
for buzz_segment in buzz_segments:
words.append({
'word': buzz_segment.text + " ",
'start': buzz_segment.start_time / 100,
'end': buzz_segment.end_time / 100,
})
text += buzz_segment.text + " "

if SENTENCE_END.match(buzz_segment.text):
segments.append({
'text': text,
'words': words
})
words = []
text = ""

return {
'language': self.transcription.language,
'segments': segments
}

if self.transcriptions_updated_signal:
self.transcriptions_updated_signal.emit(self.new_transcript_id)

def on_merge_button_clicked(self):
regroup_string = ''
if self.merge_by_gap.isChecked():
regroup_string += f'mg={self.merge_by_gap_input.text()}'

if self.split_by_max_length.isChecked():
regroup_string += f'++{self.split_by_max_length_input.text()}+1'

if self.split_by_punctuation.isChecked():
if regroup_string:
regroup_string += '_'
Expand All @@ -252,32 +301,43 @@ def on_merge_button_clicked(self):
regroup_string += '_'
regroup_string += f'sl={self.split_by_max_length_input.text()}'

result = stable_whisper.transcribe_any(
self.get_transcript,
self.transcription.file,
vad=True,
suppress_silence=True,
regroup=regroup_string,
)
if self.merge_by_gap.isChecked():
if regroup_string:
regroup_string += '_'
regroup_string += f'mg={self.merge_by_gap_input.text()}'

segments = []
for segment in result.segments:
segments.append(
Segment(
start=int(segment.start * 100),
end=int(segment.end * 100),
text=segment.text
)
)
if self.split_by_max_length.isChecked():
regroup_string += f'++{self.split_by_max_length_input.text()}+1'

new_transcript_id = self.transcription_service.copy_transcription(
self.transcription.id_as_uuid
regroup_string = os.getenv("BUZZ_MERGE_REGROUP_RULE", regroup_string)

self.hide()

self.thread = QThread()
self.worker = TranscriptionWorker(
self.transcription,
self.transcription_service,
regroup_string
)
self.transcription_service.update_transcription_as_completed(new_transcript_id, segments)
self.worker.moveToThread(self.thread)
self.thread.started.connect(self.worker.run)
self.worker.finished.connect(self.thread.quit)
self.worker.finished.connect(self.worker.deleteLater)
self.thread.finished.connect(self.thread.deleteLater)
self.worker.result_ready.connect(self.on_transcription_completed)

if self.transcriptions_updated_signal:
self.transcriptions_updated_signal.emit(new_transcript_id)
self.thread.start()

    def on_transcription_completed(self, segments):
        """Handle regrouped segments produced by the background worker.

        Persists the segments to the copied transcription (created before the
        worker started), notifies listeners, and closes this widget.

        :param segments: list of Segment objects emitted by the worker's
            ``result_ready`` signal.
        """
        # new_transcript_id is set when the merge is started; guard against a
        # completion callback arriving without a target transcription.
        if self.new_transcript_id is not None:
            self.transcription_service.update_transcription_as_completed(self.new_transcript_id, segments)

        # Optional signal — may be None when no listener was wired up.
        if self.transcriptions_updated_signal:
            self.transcriptions_updated_signal.emit(self.new_transcript_id)

        self.close()

    def closeEvent(self, event):
        """Hide the widget before delegating to the default close handling."""
        self.hide()

        super().closeEvent(event)
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,8 @@ def on_resize_button_clicked(self):
transcriptions_updated_signal=self.transcriptions_updated_signal,
)

self.transcriptions_updated_signal.connect(self.close)

self.transcription_resizer_dialog.show()

def closeEvent(self, event):
Expand Down
2 changes: 2 additions & 0 deletions docs/docs/preferences.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,5 @@ Defaults to [user_cache_dir](https://pypi.org/project/platformdirs/).
**BUZZ_DOWNLOAD_COOKIEFILE** - Location of a [cookiefile](https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp) to use for downloading private videos or as workaround for anti-bot protection.

**BUZZ_FORCE_CPU** - Will force Buzz to use the CPU instead of the GPU, useful for setups with an older GPU that is slower than the CPU or that has issues. Example usage `BUZZ_FORCE_CPU=true`. Available since `1.2.1`

**BUZZ_MERGE_REGROUP_RULE** - Custom regroup merge rule to use when combining transcripts with word-level timings. More information on available options [in stable-ts repo](https://github.com/jianfch/stable-ts?tab=readme-ov-file#regrouping-methods). Available since `1.3.0`
2 changes: 1 addition & 1 deletion docs/docs/usage/4_edit_and_resize.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ When transcript of some audio or video file is generated you can edit it and exp

The transcription view screen has an option to resize the transcripts. Click on the "Resize" button to see the available options. Transcripts that have been generated **with word-level timings** enabled can be combined into subtitles by specifying different options, like the maximum length of a subtitle and whether subtitles should be split on punctuation. Transcripts that have been generated **without word-level timings** enabled can only be recombined by specifying the desired maximum length of a subtitle.

Subtitle generation from transcripts with word-level timings is available since version 1.2.1.
If the audio file is still present on the system, the word-level timing merge will also analyze the audio for silences to improve subtitle accuracy. Subtitle generation from transcripts with word-level timings is available since version 1.3.0.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "buzz-captions"
version = "1.2.1"
version = "1.3.0"
description = ""
authors = ["Chidi Williams <[email protected]>"]
license = "MIT"
Expand Down
9 changes: 2 additions & 7 deletions tests/widgets/transcription_viewer_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,17 +85,12 @@ def test_should_resize_segment_text(self, qtbot, transcription, transcription_se

widget = TranscriptionResizerWidget(transcription, transcription_service)
widget.target_chars_spin_box.setValue(5)

qtbot.add_widget(widget)

widget.on_resize_button_clicked()

transcription_service.update_transcription_as_completed.assert_called()
assert transcription_service.update_transcription_as_completed.call_count == 1

widget.on_merge_button_clicked()

transcription_service.update_transcription_as_completed.assert_called()
assert transcription_service.update_transcription_as_completed.call_count == 2
transcription_service.update_transcription_as_completed.assert_called_once()

widget.close()

Expand Down

0 comments on commit 9d5ec9c

Please sign in to comment.