Initial dependencies

2026-03-15 07:05:48 +01:00 · 2025-04-05 08:16:41 +03:00 · 2025-04-05 08:16:41 +03:00 · 840fca9d4f
commit 840fca9d4f
parent 806546282d
6 changed files with 4610 additions and 241 deletions
--- a/buzz/assets/speaker-identification.svg
+++ b/buzz/assets/speaker-identification.svg
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="iso-8859-1"?>
+<svg height="800px" width="800px" version="1.1" id="Capa_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"
+	 viewBox="0 0 493.347 493.347" xml:space="preserve">
+<g>
+	<path style="fill:#010002;" d="M191.936,385.946c-14.452,0-29.029-1.36-43.319-4.04l-5.299-0.996l-66.745,37.15v-63.207
+		l-6.629-4.427C25.496,320.716,0,277.045,0,230.617c0-85.648,86.102-155.33,191.936-155.33c17.077,0,33.623,1.838,49.394,5.239
+		c-50.486,27.298-84.008,74.801-84.008,128.765c0,72.969,61.25,134.147,142.942,149.464
+		C269.41,375.892,232.099,385.946,191.936,385.946z"/>
+	<path style="fill:#010002;" d="M437.777,304.278l-6.629,4.427v48.075l-50.933-28.343l-0.125,0.024l-5.167,0.967
+		c-11.444,2.142-23.104,3.228-34.673,3.228c-1.241,0-2.47-0.054-3.705-0.078c-82.707-1.599-149.387-56.268-149.387-123.287
+		c0-52.109,40.324-96.741,97.129-114.791c14.47-4.594,30.001-7.471,46.219-8.3c3.228-0.167,6.468-0.274,9.75-0.274
+		c84.413,0,153.092,55.343,153.092,123.365C493.347,246.053,473.089,280.679,437.777,304.278z"/>
+</g>
+</svg>
--- a/buzz/widgets/icon.py
+++ b/buzz/widgets/icon.py
@@ -82,6 +82,10 @@ class ResizeIcon(Icon):
    def __init__(self, parent: QWidget):
        super().__init__(get_path("assets/resize_black.svg"), parent)

+class SpeakerIdentificationIcon(Icon):
+    """Icon backed by the ``assets/speaker-identification.svg`` asset."""
+
+    def __init__(self, parent: QWidget):
+        icon_path = get_path("assets/speaker-identification.svg")
+        super().__init__(icon_path, parent)
+
 class VisibilityIcon(Icon):
    def __init__(self, parent: QWidget):
        super().__init__(
--- a/buzz/widgets/transcription_viewer/speaker_identification_widget.py
+++ b/buzz/widgets/transcription_viewer/speaker_identification_widget.py
@@ -0,0 +1,329 @@
+import re
+import os
+import logging
+import stable_whisper
+import srt
+from pathlib import Path
+from typing import Optional
+from PyQt6.QtCore import Qt, QThread, QObject, pyqtSignal
+from PyQt6.QtGui import QFont
+from PyQt6.QtWidgets import (
+    QWidget,
+    QFormLayout,
+    QVBoxLayout,
+    QHBoxLayout,
+    QLabel,
+    QSpinBox,
+    QPushButton,
+    QCheckBox,
+    QGroupBox,
+    QSpacerItem,
+    QSizePolicy,
+)
+from buzz.locale import _, languages
+from buzz.db.entity.transcription import Transcription
+from buzz.db.service.transcription_service import TranscriptionService
+from buzz.paths import file_path_as_title
+from buzz.settings.settings import Settings
+from buzz.widgets.line_edit import LineEdit
+from buzz.transcriber.transcriber import Segment
+from buzz.widgets.preferences_dialog.models.file_transcription_preferences import (
+    FileTranscriptionPreferences,
+)
+
+
+SENTENCE_END = re.compile(r'.*[.!?。！？]')
+
+class TranscriptionWorker(QObject):
+    """Worker that regroups an already stored transcription with stable-ts.
+
+    The widget in this module moves the worker to a ``QThread`` and connects
+    the thread's ``started`` signal to :meth:`run`.
+
+    Signals:
+        finished: Emitted exactly once at the end of :meth:`run`, whether
+            regrouping succeeded or raised.
+        result_ready: Emitted with the list of regrouped ``Segment`` objects,
+            only when regrouping succeeded.
+    """
+
+    finished = pyqtSignal()
+    result_ready = pyqtSignal(list)
+
+    def __init__(self, transcription, transcription_options, transcription_service, regroup_string: str):
+        """Keep references to everything :meth:`run` needs.
+
+        Args:
+            transcription: Transcription whose stored segments are regrouped.
+            transcription_options: Options object; only ``extract_speech``
+                is read by this class.
+            transcription_service: Service used to load the stored segments.
+            regroup_string: Regroup rule passed to stable-ts as ``regroup``.
+        """
+        super().__init__()
+        self.transcription = transcription
+        self.transcription_options = transcription_options
+        self.transcription_service = transcription_service
+        self.regroup_string = regroup_string
+
+    def get_transcript(self, audio, **kwargs) -> dict:
+        """Build a stable-ts style result from the stored segments.
+
+        Used as the inference callback of ``stable_whisper.transcribe_any``.
+        No model is run; ``audio`` and ``kwargs`` are accepted only to satisfy
+        the callback signature and are ignored.
+
+        Every stored segment becomes one word. Words are collected into a
+        sentence until a stored segment matches ``SENTENCE_END``.
+
+        Returns:
+            A dict with ``language`` and ``segments``; each segment has
+            ``text`` and a ``words`` list of ``word``/``start``/``end`` dicts.
+        """
+        buzz_segments = self.transcription_service.get_transcription_segments(
+            transcription_id=self.transcription.id_as_uuid
+        )
+
+        segments = []
+        words = []
+        text = ""
+        for buzz_segment in buzz_segments:
+            # NOTE(review): the /100 here mirrors the *100 in run(); confirm
+            # the stored time unit against the transcription service.
+            words.append({
+                'word': buzz_segment.text + " ",
+                'start': buzz_segment.start_time / 100,
+                'end': buzz_segment.end_time / 100,
+            })
+            text += buzz_segment.text + " "
+
+            # NOTE(review): SENTENCE_END is applied with match() and starts
+            # with ".*", so punctuation anywhere in the text ends the
+            # sentence, not only trailing punctuation - confirm intended.
+            if SENTENCE_END.match(buzz_segment.text):
+                segments.append({
+                    'text': text,
+                    'words': words
+                })
+                words = []
+                text = ""
+
+        # Flush words collected after the last sentence end; without this a
+        # transcription that does not finish with punctuation loses its tail.
+        if words:
+            segments.append({
+                'text': text,
+                'words': words
+            })
+
+        return {
+            'language': self.transcription.language,
+            'segments': segments
+        }
+
+    def run(self):
+        """Regroup the transcription and publish the resulting segments.
+
+        Uses the ``<stem>_speech.mp3`` file next to the transcription file
+        when ``extract_speech`` is enabled and that file exists. ``vad`` and
+        ``suppress_silence`` are enabled only when the chosen audio file is
+        present on disk.
+
+        Any exception is logged instead of propagating, and ``finished`` is
+        always emitted so the hosting thread can be asked to quit.
+        """
+        try:
+            transcription_file = self.transcription.file
+            transcription_file_exists = os.path.exists(transcription_file)
+
+            transcription_file_path = Path(transcription_file)
+            speech_path = transcription_file_path.with_name(f"{transcription_file_path.stem}_speech.mp3")
+            if self.transcription_options.extract_speech and os.path.exists(speech_path):
+                transcription_file = str(speech_path)
+                transcription_file_exists = True
+
+            result = stable_whisper.transcribe_any(
+                self.get_transcript,
+                transcription_file,
+                vad=transcription_file_exists,
+                suppress_silence=transcription_file_exists,
+                regroup=self.regroup_string,
+                check_sorted=False,
+            )
+
+            segments = [
+                Segment(
+                    start=int(segment.start * 100),
+                    end=int(segment.end * 100),
+                    text=segment.text
+                )
+                for segment in result.segments
+            ]
+
+            self.result_ready.emit(segments)
+        except Exception:
+            # Thread boundary: nothing upstream can handle the error, so log
+            # it with the traceback and fall through to the cleanup signal.
+            logging.exception("Failed to regroup transcription")
+        finally:
+            self.finished.emit()
+
+
+class SpeakerIdentificationWidget(QWidget):
+    """Window offering resize and merge actions for one transcription.
+
+    The "Resize Options" group is enabled when the transcription's
+    ``word_level_timings`` is not 1, the "Merge Options" group when it is 1.
+    Merging copies the transcription, regroups the copy on a worker thread
+    and stores the result through the transcription service.
+    """
+
+    resize_button_clicked = pyqtSignal()
+    transcription: Transcription
+    settings = Settings()
+
+    def __init__(
+        self,
+        transcription: Transcription,
+        transcription_service: TranscriptionService,
+        parent: Optional["QWidget"] = None,
+        flags: Qt.WindowType = Qt.WindowType.Widget,
+        transcriptions_updated_signal: Optional[pyqtSignal] = None,
+    ) -> None:
+        """Build the form.
+
+        Args:
+            transcription: Transcription the actions operate on.
+            transcription_service: Service used to read, copy and update
+                transcriptions.
+            parent: Optional parent widget.
+            flags: Window flags forwarded to ``QWidget``.
+            transcriptions_updated_signal: Optional signal emitted with the
+                id of the copied transcription when it is created and again
+                when it is completed.
+        """
+        super().__init__(parent, flags)
+        self.transcription = transcription
+        self.transcription_service = transcription_service
+        self.transcriptions_updated_signal = transcriptions_updated_signal
+
+        self.new_transcript_id = None
+        # NOTE(review): this attribute shadows the inherited QObject.thread()
+        # method on the instance - consider renaming.
+        self.thread = None
+        self.worker = None
+
+        self.setMinimumWidth(600)
+        self.setMinimumHeight(300)
+
+        self.setWindowTitle(file_path_as_title(transcription.file))
+
+        preferences = self.load_preferences()
+
+        (
+            self.transcription_options,
+            self.file_transcription_options,
+        ) = preferences.to_transcription_options(
+            openai_access_token=''
+        )
+
+        layout = QFormLayout(self)
+
+        # Resize longer subtitles
+        resize_label = QLabel(_("Resize Options"), self)
+        font = resize_label.font()
+        font.setWeight(QFont.Weight.Bold)
+        resize_label.setFont(font)
+        layout.addRow(resize_label)
+
+        resize_group_box = QGroupBox(self)
+        resize_layout = QVBoxLayout(resize_group_box)
+
+        self.resize_row = QHBoxLayout()
+
+        self.desired_subtitle_length_label = QLabel(_("Desired subtitle length"), self)
+
+        self.target_chars_spin_box = QSpinBox(self)
+        self.target_chars_spin_box.setMinimum(1)
+        self.target_chars_spin_box.setMaximum(100)
+        self.target_chars_spin_box.setValue(42)
+
+        self.resize_button = QPushButton(_("Resize"))
+        self.resize_button.clicked.connect(self.on_resize_button_clicked)
+
+        self.resize_row.addWidget(self.desired_subtitle_length_label)
+        self.resize_row.addWidget(self.target_chars_spin_box)
+        self.resize_row.addWidget(self.resize_button)
+
+        resize_layout.addLayout(self.resize_row)
+
+        resize_group_box.setEnabled(self.transcription.word_level_timings != 1)
+
+        layout.addRow(resize_group_box)
+
+        # Spacer
+        spacer = QSpacerItem(0, 10, QSizePolicy.Policy.Minimum, QSizePolicy.Policy.Fixed)
+        layout.addItem(spacer)
+
+        # Merge words into subtitles
+        merge_options_label = QLabel(_("Merge Options"), self)
+        font = merge_options_label.font()
+        font.setWeight(QFont.Weight.Bold)
+        merge_options_label.setFont(font)
+        layout.addRow(merge_options_label)
+
+        merge_options_group_box = QGroupBox(self)
+        merge_options_layout = QVBoxLayout(merge_options_group_box)
+
+        self.merge_options_row = QVBoxLayout()
+
+        self.merge_by_gap = QCheckBox(_("Merge by gap"))
+        self.merge_by_gap.setChecked(True)
+        self.merge_by_gap.setMinimumWidth(250)
+        self.merge_by_gap_input = LineEdit("0.2", self)
+        merge_by_gap_layout = QHBoxLayout()
+        merge_by_gap_layout.addWidget(self.merge_by_gap)
+        merge_by_gap_layout.addWidget(self.merge_by_gap_input)
+
+        self.split_by_punctuation = QCheckBox(_("Split by punctuation"))
+        self.split_by_punctuation.setChecked(True)
+        self.split_by_punctuation.setMinimumWidth(250)
+        self.split_by_punctuation_input = LineEdit(".* /./. /。/?/? /？/!/! /！/,/, ", self)
+        split_by_punctuation_layout = QHBoxLayout()
+        split_by_punctuation_layout.addWidget(self.split_by_punctuation)
+        split_by_punctuation_layout.addWidget(self.split_by_punctuation_input)
+
+        self.split_by_max_length = QCheckBox(_("Split by max length"))
+        self.split_by_max_length.setChecked(True)
+        self.split_by_max_length.setMinimumWidth(250)
+        self.split_by_max_length_input = LineEdit("42", self)
+        split_by_max_length_layout = QHBoxLayout()
+        split_by_max_length_layout.addWidget(self.split_by_max_length)
+        split_by_max_length_layout.addWidget(self.split_by_max_length_input)
+
+        self.merge_options_row.addLayout(merge_by_gap_layout)
+        self.merge_options_row.addLayout(split_by_punctuation_layout)
+        self.merge_options_row.addLayout(split_by_max_length_layout)
+
+        self.merge_button = QPushButton(_("Merge"))
+        self.merge_button.clicked.connect(self.on_merge_button_clicked)
+
+        self.merge_options_row.addWidget(self.merge_button)
+
+        merge_options_layout.addLayout(self.merge_options_row)
+
+        merge_options_group_box.setEnabled(self.transcription.word_level_timings == 1)
+
+        layout.addRow(merge_options_group_box)
+
+        self.setLayout(layout)
+
+    def load_preferences(self):
+        """Load file transcription preferences from the ``file_transcriber`` settings group."""
+        self.settings.settings.beginGroup("file_transcriber")
+        preferences = FileTranscriptionPreferences.load(settings=self.settings.settings)
+        self.settings.settings.endGroup()
+        return preferences
+
+    # TODO rename function
+    def on_resize_button_clicked(self):
+        """Collect the stored segments as subtitles; identification itself is not implemented yet."""
+        #  Step 1 - Get the segments
+        segments = self.transcription_service.get_transcription_segments(
+            transcription_id=self.transcription.id_as_uuid
+        )
+
+        subs = []
+        for segment in segments:
+            subtitle = srt.Subtitle(
+                index=segment.id,
+                start=segment.start_time,
+                end=segment.end_time,
+                content=segment.text
+            )
+            subs.append(subtitle)
+
+        # Step 2 - ...
+        logging.debug("=== Will identify speakers ===")
+
+    def _build_regroup_string(self) -> str:
+        """Assemble the regroup rule from the merge option controls.
+
+        Rules are joined with ``_`` in the order: merge by gap, split by
+        punctuation, split by max length, merge by gap again. When both
+        "merge by gap" and "split by max length" are checked, the merge rule
+        additionally carries ``++<max length>+1``.
+        """
+        merge_rule = ''
+        if self.merge_by_gap.isChecked():
+            merge_rule = f'mg={self.merge_by_gap_input.text()}'
+            if self.split_by_max_length.isChecked():
+                merge_rule += f'++{self.split_by_max_length_input.text()}+1'
+
+        rules = []
+        if merge_rule:
+            rules.append(merge_rule)
+        if self.split_by_punctuation.isChecked():
+            rules.append(f'sp={self.split_by_punctuation_input.text()}')
+        if self.split_by_max_length.isChecked():
+            rules.append(f'sl={self.split_by_max_length_input.text()}')
+        if merge_rule:
+            # The merge rule is applied a second time after the splits.
+            rules.append(merge_rule)
+
+        return '_'.join(rules)
+
+    def on_merge_button_clicked(self):
+        """Copy the transcription and regroup the copy on a worker thread.
+
+        The copy is reset to progress 0.0 and announced through
+        ``transcriptions_updated_signal`` when one was supplied. The widget
+        hides itself while the worker runs.
+        """
+        self.new_transcript_id = self.transcription_service.copy_transcription(
+            self.transcription.id_as_uuid
+        )
+        self.transcription_service.update_transcription_progress(self.new_transcript_id, 0.0)
+
+        if self.transcriptions_updated_signal:
+            self.transcriptions_updated_signal.emit(self.new_transcript_id)
+
+        # The environment variable, when set, replaces the rule built from the form.
+        regroup_string = os.getenv("BUZZ_MERGE_REGROUP_RULE", self._build_regroup_string())
+
+        self.hide()
+
+        self.thread = QThread()
+        self.worker = TranscriptionWorker(
+            self.transcription,
+            self.transcription_options,
+            self.transcription_service,
+            regroup_string
+        )
+        self.worker.moveToThread(self.thread)
+        self.thread.started.connect(self.worker.run)
+        self.worker.finished.connect(self.thread.quit)
+        self.worker.finished.connect(self.worker.deleteLater)
+        self.thread.finished.connect(self.thread.deleteLater)
+        self.worker.result_ready.connect(self.on_transcription_completed)
+
+        self.thread.start()
+
+    def on_transcription_completed(self, segments):
+        """Store the regrouped segments on the copied transcription and close the widget."""
+        if self.new_transcript_id is not None:
+            self.transcription_service.update_transcription_as_completed(self.new_transcript_id, segments)
+
+            if self.transcriptions_updated_signal:
+                self.transcriptions_updated_signal.emit(self.new_transcript_id)
+
+        self.close()
+
+    def closeEvent(self, event):
+        """Hide the widget before delegating to the default close handling."""
+        self.hide()
+
+        super().closeEvent(event)
--- a/buzz/widgets/transcription_viewer/transcription_viewer_widget.py
+++ b/buzz/widgets/transcription_viewer/transcription_viewer_widget.py
@@ -26,6 +26,7 @@ from buzz.widgets.icon import (
    FileDownloadIcon,
    TranslateIcon,
    ResizeIcon,
+    SpeakerIdentificationIcon,
 )
 from buzz.translator import Translator
 from buzz.widgets.text_display_box import TextDisplayBox
@@ -46,6 +47,7 @@ from buzz.widgets.transcription_viewer.transcription_view_mode_tool_button impor
    ViewMode
 )
 from buzz.widgets.transcription_viewer.transcription_resizer_widget import TranscriptionResizerWidget
+from buzz.widgets.transcription_viewer.speaker_identification_widget import SpeakerIdentificationWidget


 class TranscriptionViewerWidget(QWidget):
@@ -72,6 +74,7 @@ class TranscriptionViewerWidget(QWidget):
        self.setWindowTitle(file_path_as_title(transcription.file))

        self.transcription_resizer_dialog = None
+        self.speaker_identification_dialog = None
        self.transcriptions_updated_signal = transcriptions_updated_signal

        self.translation_thread = None
@@ -191,6 +194,17 @@ class TranscriptionViewerWidget(QWidget):

        toolbar.addWidget(resize_button)

+        speaker_identification_button = QToolButton()
+        speaker_identification_button.setText(_("Identify Speakers"))
+        speaker_identification_button.setObjectName("speaker_identification_button")
+        speaker_identification_button.setIcon(SpeakerIdentificationIcon(self))
+        speaker_identification_button.setToolButtonStyle(
+            Qt.ToolButtonStyle.ToolButtonTextBesideIcon
+        )
+        speaker_identification_button.clicked.connect(self.on_speaker_identification_button_clicked)
+
+        toolbar.addWidget(speaker_identification_button)
+
        layout.setMenuBar(toolbar)

        layout.addWidget(self.table_widget)
@@ -314,12 +328,26 @@ class TranscriptionViewerWidget(QWidget):

        self.transcription_resizer_dialog.show()

+    def on_speaker_identification_button_clicked(self):
+        """Open a speaker identification dialog for the current transcription.
+
+        A new ``SpeakerIdentificationWidget`` is created on every click and
+        kept on ``self.speaker_identification_dialog``. When a
+        transcriptions-updated signal is available, this viewer is closed
+        whenever that signal fires.
+        """
+        self.speaker_identification_dialog = SpeakerIdentificationWidget(
+            transcription=self.transcription,
+            transcription_service=self.transcription_service,
+            transcriptions_updated_signal=self.transcriptions_updated_signal,
+        )
+
+        # SpeakerIdentificationWidget accepts this signal as optional, so do
+        # not assume it is set before connecting to it.
+        if self.transcriptions_updated_signal is not None:
+            self.transcriptions_updated_signal.connect(self.close)
+
+        self.speaker_identification_dialog.show()
+
    def closeEvent(self, event):
        self.hide()

        if self.transcription_resizer_dialog:
            self.transcription_resizer_dialog.close()

+        if self.speaker_identification_dialog:
+            self.speaker_identification_dialog.close()
+
        self.translator.stop()
        self.translation_thread.quit()
        self.translation_thread.wait()
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,7 @@ readme = "README.md"
 license = { text = "MIT" }
 repository = "https://github.com/chidiwilliams/buzz"
 documentation = "https://chidiwilliams.github.io/buzz/docs"
-requires-python = ">=3.9,<3.13"
+requires-python = ">=3.10,<3.13"
 dynamic = [ "dependencies" ]

 [project.scripts]
@@ -33,7 +33,7 @@ name = "PyPI"
 priority = "primary"

 [tool.poetry.dependencies]
-python = ">=3.9,<3.13"
+python = ">=3.10,<3.13"
 sounddevice = "^0.4.5"
 humanize = "^4.4.0"
 PyQt6 = "6.8.1"
@@ -49,8 +49,7 @@ yt-dlp = "^2025.2.19"
 stable-ts = "^2.18.3"
 faster-whisper = "^1.1.1"
 openai-whisper = "^20240930"
-# transformers 4.50.0 has some bug
-transformers = "4.49.0"
+transformers = "4.48.3"
 accelerate = "^1.0.1"
 polib = "^1.2.0"
 srt-equalizer = "^0.1.10"
@@ -80,6 +79,9 @@ soundfile = "^0.13.1"
 urllib3 = "^2.3.0"
 demucs = {url = "https://github.com/raivisdejus/demucs/releases/download/4.1.0a3/demucs-4.1.0a3-py3-none-any.whl"}
 posthog = "^3.23.0"
+deepmultilingualpunctuation = {git = "https://github.com/oliverguhr/deepmultilingualpunctuation.git"}
+ctc-forced-aligner = {git = "https://github.com/MahmoudAshraf97/ctc-forced-aligner.git"}
+nemo-toolkit = {extras = ["asr"], version = "^2.2.1"}

 [tool.poetry.group.dev.dependencies]
 autopep8 = "^1.7.0"