Skip to content

Commit

Permalink
Adding UI improvements to the subtitle merging (#1033)
Browse files Browse the repository at this point in the history
  • Loading branch information
raivisdejus authored Dec 29, 2024
1 parent 6037287 commit 9d5ec9c
Show file tree
Hide file tree
Showing 7 changed files with 120 additions and 61 deletions.
2 changes: 1 addition & 1 deletion buzz/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
VERSION = "1.2.1"
VERSION = "1.3.0"
162 changes: 111 additions & 51 deletions buzz/widgets/transcription_viewer/transcription_resizer_widget.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import re
import os
import logging
import stable_whisper

import srt
from srt_equalizer import srt_equalizer
from typing import Optional
from PyQt6.QtCore import Qt, pyqtSignal
from PyQt6.QtCore import Qt, QThread, QObject, pyqtSignal
from PyQt6.QtGui import QFont
from PyQt6.QtWidgets import (
QWidget,
Expand Down Expand Up @@ -34,6 +35,69 @@

SENTENCE_END = re.compile(r'.*[.!?。!?]')

class TranscriptionWorker(QObject):
    """Background worker that regroups a transcription's word-level segments
    into new subtitle segments via stable-whisper.

    Designed for the QThread worker-object pattern: move the instance to a
    QThread, connect the thread's ``started`` signal to :meth:`run`.  Emits
    ``result_ready`` with the regrouped list of ``Segment`` objects, then
    ``finished`` so the owning thread can quit.
    """

    finished = pyqtSignal()
    result_ready = pyqtSignal(list)

    def __init__(self, transcription, transcription_service, regroup_string: str):
        """
        :param transcription: transcription record; provides ``id_as_uuid``,
            ``language`` and ``file`` (original audio path).
        :param transcription_service: service used to load stored segments.
        :param regroup_string: stable-whisper regroup algorithm string
            (e.g. ``"mg=0.2_sp=..."``) controlling how words are merged/split.
        """
        super().__init__()
        self.transcription = transcription
        self.transcription_service = transcription_service
        self.regroup_string = regroup_string

    def get_transcript(self, audio, **kwargs) -> dict:
        """Build a stable-whisper-compatible transcript dict from the stored
        Buzz segments.

        Stored times are in centiseconds and are converted to seconds.
        The ``audio``/``kwargs`` parameters are required by the
        ``stable_whisper.transcribe_any`` callback signature and are unused.
        """
        buzz_segments = self.transcription_service.get_transcription_segments(
            transcription_id=self.transcription.id_as_uuid
        )

        segments = []
        words = []
        text = ""
        for buzz_segment in buzz_segments:
            words.append({
                'word': buzz_segment.text + " ",
                'start': buzz_segment.start_time / 100,  # centiseconds -> seconds
                'end': buzz_segment.end_time / 100,
            })
            text += buzz_segment.text + " "

            # Close out a segment whenever the accumulated text ends a sentence.
            if SENTENCE_END.match(buzz_segment.text):
                segments.append({
                    'text': text,
                    'words': words
                })
                words = []
                text = ""

        # Flush any trailing words that did not end with sentence punctuation,
        # so the tail of the transcript is not silently dropped.
        if words:
            segments.append({
                'text': text,
                'words': words
            })

        return {
            'language': self.transcription.language,
            'segments': segments
        }

    def run(self):
        """Regroup the transcript and emit the resulting segments.

        Silence analysis (``vad`` / ``suppress_silence``) is enabled only when
        the original audio file still exists on disk.
        """
        result = stable_whisper.transcribe_any(
            self.get_transcript,
            self.transcription.file,
            vad=os.path.exists(self.transcription.file),
            suppress_silence=os.path.exists(self.transcription.file),
            regroup=self.regroup_string,
            check_sorted=False,
        )

        # Convert stable-whisper results (seconds) back to Buzz Segment
        # objects (centiseconds).
        segments = []
        for segment in result.segments:
            segments.append(
                Segment(
                    start=int(segment.start * 100),
                    end=int(segment.end * 100),
                    text=segment.text
                )
            )

        self.result_ready.emit(segments)
        self.finished.emit()


class TranscriptionResizerWidget(QWidget):
resize_button_clicked = pyqtSignal()
transcription: Transcription
Expand All @@ -52,6 +116,10 @@ def __init__(
self.transcription_service = transcription_service
self.transcriptions_updated_signal = transcriptions_updated_signal

self.new_transcript_id = None
self.thread = None
self.worker = None

self.setMinimumWidth(600)
self.setMinimumHeight(300)

Expand Down Expand Up @@ -119,7 +187,7 @@ def __init__(
self.merge_by_gap = QCheckBox(_("Merge by gap"))
self.merge_by_gap.setChecked(True)
self.merge_by_gap.setMinimumWidth(250)
self.merge_by_gap_input = LineEdit("0.1", self)
self.merge_by_gap_input = LineEdit("0.2", self)
merge_by_gap_layout = QHBoxLayout()
merge_by_gap_layout.addWidget(self.merge_by_gap)
merge_by_gap_layout.addWidget(self.merge_by_gap_input)
Expand Down Expand Up @@ -207,41 +275,22 @@ def on_resize_button_clicked(self):
if self.transcriptions_updated_signal:
self.transcriptions_updated_signal.emit(new_transcript_id)

def get_transcript(self, audio, **kwargs) -> dict:
buzz_segments = self.transcription_service.get_transcription_segments(
transcription_id=self.transcription.id_as_uuid
def on_merge_button_clicked(self):
self.new_transcript_id = self.transcription_service.copy_transcription(
self.transcription.id_as_uuid
)
self.transcription_service.update_transcription_progress(self.new_transcript_id, 0.0)

segments = []
words = []
text = ""
for buzz_segment in buzz_segments:
words.append({
'word': buzz_segment.text + " ",
'start': buzz_segment.start_time / 100,
'end': buzz_segment.end_time / 100,
})
text += buzz_segment.text + " "

if SENTENCE_END.match(buzz_segment.text):
segments.append({
'text': text,
'words': words
})
words = []
text = ""

return {
'language': self.transcription.language,
'segments': segments
}

if self.transcriptions_updated_signal:
self.transcriptions_updated_signal.emit(self.new_transcript_id)

def on_merge_button_clicked(self):
regroup_string = ''
if self.merge_by_gap.isChecked():
regroup_string += f'mg={self.merge_by_gap_input.text()}'

if self.split_by_max_length.isChecked():
regroup_string += f'++{self.split_by_max_length_input.text()}+1'

if self.split_by_punctuation.isChecked():
if regroup_string:
regroup_string += '_'
Expand All @@ -252,32 +301,43 @@ def on_merge_button_clicked(self):
regroup_string += '_'
regroup_string += f'sl={self.split_by_max_length_input.text()}'

result = stable_whisper.transcribe_any(
self.get_transcript,
self.transcription.file,
vad=True,
suppress_silence=True,
regroup=regroup_string,
)
if self.merge_by_gap.isChecked():
if regroup_string:
regroup_string += '_'
regroup_string += f'mg={self.merge_by_gap_input.text()}'

segments = []
for segment in result.segments:
segments.append(
Segment(
start=int(segment.start * 100),
end=int(segment.end * 100),
text=segment.text
)
)
if self.split_by_max_length.isChecked():
regroup_string += f'++{self.split_by_max_length_input.text()}+1'

new_transcript_id = self.transcription_service.copy_transcription(
self.transcription.id_as_uuid
regroup_string = os.getenv("BUZZ_MERGE_REGROUP_RULE", regroup_string)

self.hide()

self.thread = QThread()
self.worker = TranscriptionWorker(
self.transcription,
self.transcription_service,
regroup_string
)
self.transcription_service.update_transcription_as_completed(new_transcript_id, segments)
self.worker.moveToThread(self.thread)
self.thread.started.connect(self.worker.run)
self.worker.finished.connect(self.thread.quit)
self.worker.finished.connect(self.worker.deleteLater)
self.thread.finished.connect(self.thread.deleteLater)
self.worker.result_ready.connect(self.on_transcription_completed)

if self.transcriptions_updated_signal:
self.transcriptions_updated_signal.emit(new_transcript_id)
self.thread.start()

    def on_transcription_completed(self, segments):
        """Handle regrouped segments produced by the background worker.

        Persists the segments to the copied transcription (created before the
        worker started), notifies listeners, and closes this widget.

        :param segments: list of Segment objects emitted by the worker's
            ``result_ready`` signal.
        """
        # new_transcript_id is set when the merge is started; guard against a
        # completion callback arriving without a target transcription.
        if self.new_transcript_id is not None:
            self.transcription_service.update_transcription_as_completed(self.new_transcript_id, segments)

        # Optional signal — may be None when no listener was wired up.
        if self.transcriptions_updated_signal:
            self.transcriptions_updated_signal.emit(self.new_transcript_id)

        self.close()

    def closeEvent(self, event):
        """Hide the widget before delegating to the default close handling."""
        self.hide()

        super().closeEvent(event)
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,8 @@ def on_resize_button_clicked(self):
transcriptions_updated_signal=self.transcriptions_updated_signal,
)

self.transcriptions_updated_signal.connect(self.close)

self.transcription_resizer_dialog.show()

def closeEvent(self, event):
Expand Down
2 changes: 2 additions & 0 deletions docs/docs/preferences.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,5 @@ Defaults to [user_cache_dir](https://pypi.org/project/platformdirs/).
**BUZZ_DOWNLOAD_COOKIEFILE** - Location of a [cookiefile](https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp) to use for downloading private videos or as workaround for anti-bot protection.

**BUZZ_FORCE_CPU** - Will force Buzz to use the CPU instead of the GPU, useful for setups with an older GPU that is slower than the CPU or that has issues. Example usage `BUZZ_FORCE_CPU=true`. Available since `1.2.1`

**BUZZ_MERGE_REGROUP_RULE** - Custom regroup merge rule to use when combining transcripts with word-level timings. More information on available options [in stable-ts repo](https://github.com/jianfch/stable-ts?tab=readme-ov-file#regrouping-methods). Available since `1.3.0`
2 changes: 1 addition & 1 deletion docs/docs/usage/4_edit_and_resize.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ When transcript of some audio or video file is generated you can edit it and exp

The transcription view screen has an option to resize the transcripts. Click on the "Resize" button to see the available options. Transcripts that have been generated **with word-level timings** enabled can be combined into subtitles by specifying different options, like the maximum length of a subtitle and whether subtitles should be split on punctuation. Transcripts that have been generated **without word-level timings** enabled can only be recombined by specifying the desired maximum length of a subtitle.

Subtitle generation from transcripts with word-level timings is available since version 1.2.1.
If the audio file is still present on the system, the word-level timing merge will also analyze the audio for silences to improve subtitle accuracy. Subtitle generation from transcripts with word-level timings is available since version 1.3.0.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "buzz-captions"
version = "1.2.1"
version = "1.3.0"
description = ""
authors = ["Chidi Williams <[email protected]>"]
license = "MIT"
Expand Down
9 changes: 2 additions & 7 deletions tests/widgets/transcription_viewer_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,17 +85,12 @@ def test_should_resize_segment_text(self, qtbot, transcription, transcription_se

widget = TranscriptionResizerWidget(transcription, transcription_service)
widget.target_chars_spin_box.setValue(5)

qtbot.add_widget(widget)

widget.on_resize_button_clicked()

transcription_service.update_transcription_as_completed.assert_called()
assert transcription_service.update_transcription_as_completed.call_count == 1

widget.on_merge_button_clicked()

transcription_service.update_transcription_as_completed.assert_called()
assert transcription_service.update_transcription_as_completed.call_count == 2
transcription_service.update_transcription_as_completed.assert_called_once()

widget.close()

Expand Down

0 comments on commit 9d5ec9c

Please sign in to comment.