Initial dependencies

This commit is contained in:
Raivis Dejus 2025-04-05 08:16:41 +03:00
commit 840fca9d4f
6 changed files with 4610 additions and 241 deletions

View file

@ -0,0 +1,14 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<svg height="800px" width="800px" version="1.1" id="Capa_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"
viewBox="0 0 493.347 493.347" xml:space="preserve">
<g>
<path style="fill:#010002;" d="M191.936,385.946c-14.452,0-29.029-1.36-43.319-4.04l-5.299-0.996l-66.745,37.15v-63.207
l-6.629-4.427C25.496,320.716,0,277.045,0,230.617c0-85.648,86.102-155.33,191.936-155.33c17.077,0,33.623,1.838,49.394,5.239
c-50.486,27.298-84.008,74.801-84.008,128.765c0,72.969,61.25,134.147,142.942,149.464
C269.41,375.892,232.099,385.946,191.936,385.946z"/>
<path style="fill:#010002;" d="M437.777,304.278l-6.629,4.427v48.075l-50.933-28.343l-0.125,0.024l-5.167,0.967
c-11.444,2.142-23.104,3.228-34.673,3.228c-1.241,0-2.47-0.054-3.705-0.078c-82.707-1.599-149.387-56.268-149.387-123.287
c0-52.109,40.324-96.741,97.129-114.791c14.47-4.594,30.001-7.471,46.219-8.3c3.228-0.167,6.468-0.274,9.75-0.274
c84.413,0,153.092,55.343,153.092,123.365C493.347,246.053,473.089,280.679,437.777,304.278z"/>
</g>
</svg>

After

Width:  |  Height:  |  Size: 1 KiB

View file

@ -82,6 +82,10 @@ class ResizeIcon(Icon):
def __init__(self, parent: QWidget):
super().__init__(get_path("assets/resize_black.svg"), parent)
class SpeakerIdentificationIcon(Icon):
    """Icon loaded from the ``assets/speaker-identification.svg`` asset."""

    def __init__(self, parent: QWidget):
        # Resolve the asset location first, then hand it to the Icon base class.
        icon_path = get_path("assets/speaker-identification.svg")
        super().__init__(icon_path, parent)
class VisibilityIcon(Icon):
def __init__(self, parent: QWidget):
super().__init__(

View file

@ -0,0 +1,329 @@
import re
import os
import logging
import stable_whisper
import srt
from pathlib import Path
from typing import Optional
from PyQt6.QtCore import Qt, QThread, QObject, pyqtSignal
from PyQt6.QtGui import QFont
from PyQt6.QtWidgets import (
QWidget,
QFormLayout,
QVBoxLayout,
QHBoxLayout,
QLabel,
QSpinBox,
QPushButton,
QCheckBox,
QGroupBox,
QSpacerItem,
QSizePolicy,
)
from buzz.locale import _, languages
from buzz.db.entity.transcription import Transcription
from buzz.db.service.transcription_service import TranscriptionService
from buzz.paths import file_path_as_title
from buzz.settings.settings import Settings
from buzz.widgets.line_edit import LineEdit
from buzz.transcriber.transcriber import Segment
from buzz.widgets.preferences_dialog.models.file_transcription_preferences import (
FileTranscriptionPreferences,
)
# Pattern for text containing a sentence-terminating punctuation mark.
# It is applied with ``.match`` (anchored at the start of the string), and the
# leading ``.*`` lets the punctuation mark appear at any position in the text,
# not only as its final character.
SENTENCE_END = re.compile(r'.*[.!?。!?]')
class TranscriptionWorker(QObject):
    """Worker object that regroups a stored transcription via stable-ts.

    The worker does not run a speech model. ``get_transcript`` feeds the
    segments already stored for ``transcription`` to
    ``stable_whisper.transcribe_any``, which applies ``regroup_string``.

    Signals:
        finished: emitted at the end of ``run``.
        result_ready: emitted from ``run`` with the list of regrouped
            ``Segment`` objects.
    """

    finished = pyqtSignal()
    result_ready = pyqtSignal(list)

    def __init__(self, transcription, transcription_options, transcription_service, regroup_string: str):
        super().__init__()
        self.transcription = transcription
        self.transcription_options = transcription_options
        self.transcription_service = transcription_service
        self.regroup_string = regroup_string

    def get_transcript(self, audio, **kwargs) -> dict:
        """Build the result dict that ``transcribe_any`` uses as its inference output.

        Every stored segment becomes one word entry; word entries are grouped
        into a segment each time a stored segment's text matches
        ``SENTENCE_END``.

        Args:
            audio: passed in by ``transcribe_any``; not used here.
            **kwargs: accepted for call compatibility; not used here.

        Returns:
            A dict with the transcription ``language`` and the grouped
            ``segments`` (each holding ``text`` and ``words``).
        """
        buzz_segments = self.transcription_service.get_transcription_segments(
            transcription_id=self.transcription.id_as_uuid
        )

        segments = []
        words = []
        text = ""
        for buzz_segment in buzz_segments:
            words.append({
                'word': buzz_segment.text + " ",
                # Stored times are divided by 100 here; run() multiplies the
                # regrouped times by 100 again on the way back.
                'start': buzz_segment.start_time / 100,
                'end': buzz_segment.end_time / 100,
            })
            text += buzz_segment.text + " "

            if SENTENCE_END.match(buzz_segment.text):
                segments.append({
                    'text': text,
                    'words': words
                })
                words = []
                text = ""

        # Keep the words that follow the last sentence-ending segment; without
        # this flush they would be missing from the regrouped result.
        if words:
            segments.append({
                'text': text,
                'words': words
            })

        return {
            'language': self.transcription.language,
            'segments': segments
        }

    def run(self):
        """Regroup the transcription and emit ``result_ready`` then ``finished``."""
        transcription_file = self.transcription.file
        transcription_file_exists = os.path.exists(transcription_file)

        # Use the sibling "<stem>_speech.mp3" file when speech extraction is
        # enabled in the options and that file is present on disk.
        transcription_file_path = Path(transcription_file)
        speech_path = transcription_file_path.with_name(f"{transcription_file_path.stem}_speech.mp3")
        if self.transcription_options.extract_speech and os.path.exists(speech_path):
            transcription_file = str(speech_path)
            transcription_file_exists = True

        # VAD and silence suppression are only requested when an audio file
        # was found on disk.
        result = stable_whisper.transcribe_any(
            self.get_transcript,
            transcription_file,
            vad=transcription_file_exists,
            suppress_silence=transcription_file_exists,
            regroup=self.regroup_string,
            check_sorted=False,
        )

        segments = [
            Segment(
                start=int(segment.start * 100),
                end=int(segment.end * 100),
                text=segment.text
            )
            for segment in result.segments
        ]

        self.result_ready.emit(segments)
        self.finished.emit()
class SpeakerIdentificationWidget(QWidget):
    """Window with "Resize Options" and "Merge Options" for one transcription.

    The resize group is enabled when ``transcription.word_level_timings`` is
    not 1 and the merge group is enabled when it is 1. Pressing "Merge" copies
    the transcription, regroups the copy on a worker thread and closes this
    window once the result has been stored.
    """

    resize_button_clicked = pyqtSignal()
    transcription: Transcription
    settings = Settings()

    def __init__(
        self,
        transcription: Transcription,
        transcription_service: TranscriptionService,
        parent: Optional["QWidget"] = None,
        flags: Qt.WindowType = Qt.WindowType.Widget,
        transcriptions_updated_signal: Optional[pyqtSignal] = None,
    ) -> None:
        super().__init__(parent, flags)

        self.transcription = transcription
        self.transcription_service = transcription_service
        self.transcriptions_updated_signal = transcriptions_updated_signal

        # State populated once a merge has been started.
        self.new_transcript_id = None
        self.thread = None
        self.worker = None

        self.setMinimumWidth(600)
        self.setMinimumHeight(300)
        self.setWindowTitle(file_path_as_title(transcription.file))

        options = self.load_preferences().to_transcription_options(
            openai_access_token=''
        )
        self.transcription_options, self.file_transcription_options = options

        layout = QFormLayout(self)

        # Resize longer subtitles
        layout.addRow(self._bold_label(_("Resize Options")))

        resize_box = QGroupBox(self)
        resize_box_layout = QVBoxLayout(resize_box)

        self.resize_row = QHBoxLayout()
        self.desired_subtitle_length_label = QLabel(_("Desired subtitle length"), self)

        self.target_chars_spin_box = QSpinBox(self)
        self.target_chars_spin_box.setMinimum(1)
        self.target_chars_spin_box.setMaximum(100)
        self.target_chars_spin_box.setValue(42)

        self.resize_button = QPushButton(_("Resize"))
        self.resize_button.clicked.connect(self.on_resize_button_clicked)

        for resize_widget in (
            self.desired_subtitle_length_label,
            self.target_chars_spin_box,
            self.resize_button,
        ):
            self.resize_row.addWidget(resize_widget)

        resize_box_layout.addLayout(self.resize_row)
        resize_box.setEnabled(self.transcription.word_level_timings != 1)
        layout.addRow(resize_box)

        # Spacer
        layout.addItem(
            QSpacerItem(0, 10, QSizePolicy.Policy.Minimum, QSizePolicy.Policy.Fixed)
        )

        # Merge words into subtitles
        layout.addRow(self._bold_label(_("Merge Options")))

        merge_box = QGroupBox(self)
        merge_box_layout = QVBoxLayout(merge_box)

        self.merge_options_row = QVBoxLayout()

        self.merge_by_gap = QCheckBox(_("Merge by gap"))
        self.merge_by_gap_input, gap_row = self._option_row(self.merge_by_gap, "0.2")

        self.split_by_punctuation = QCheckBox(_("Split by punctuation"))
        self.split_by_punctuation_input, punctuation_row = self._option_row(
            self.split_by_punctuation, ".* /./. /。/?/? //!/! //,/, "
        )

        self.split_by_max_length = QCheckBox(_("Split by max length"))
        self.split_by_max_length_input, max_length_row = self._option_row(
            self.split_by_max_length, "42"
        )

        for option_row in (gap_row, punctuation_row, max_length_row):
            self.merge_options_row.addLayout(option_row)

        self.merge_button = QPushButton(_("Merge"))
        self.merge_button.clicked.connect(self.on_merge_button_clicked)
        self.merge_options_row.addWidget(self.merge_button)

        merge_box_layout.addLayout(self.merge_options_row)
        merge_box.setEnabled(self.transcription.word_level_timings == 1)
        layout.addRow(merge_box)

        self.setLayout(layout)

    def _bold_label(self, text: str) -> QLabel:
        """Create a label owned by this widget and switch its font to bold."""
        label = QLabel(text, self)
        bold_font = label.font()
        bold_font.setWeight(QFont.Weight.Bold)
        label.setFont(bold_font)
        return label

    def _option_row(self, checkbox: QCheckBox, default_text: str):
        """Check ``checkbox`` and pair it with a line edit in a horizontal row.

        Returns the created line edit and the row layout holding both widgets.
        """
        checkbox.setChecked(True)
        checkbox.setMinimumWidth(250)
        line_edit = LineEdit(default_text, self)

        row = QHBoxLayout()
        row.addWidget(checkbox)
        row.addWidget(line_edit)
        return line_edit, row

    def load_preferences(self):
        """Load the file transcription preferences from the "file_transcriber" settings group."""
        qsettings = self.settings.settings
        qsettings.beginGroup("file_transcriber")
        preferences = FileTranscriptionPreferences.load(settings=qsettings)
        qsettings.endGroup()
        return preferences

    # TODO rename function
    def on_resize_button_clicked(self):
        """Collect the stored segments as subtitles and log a debug message.

        The subtitle list is built but not used further yet.
        """
        # Step 1 - Get the segments
        segments = self.transcription_service.get_transcription_segments(
            transcription_id=self.transcription.id_as_uuid
        )

        subs = [
            srt.Subtitle(
                index=segment.id,
                start=segment.start_time,
                end=segment.end_time,
                content=segment.text
            )
            for segment in segments
        ]

        # Step 2 - ...
        logging.debug("=== Will identify speakers ===")

    def _build_regroup_string(self) -> str:
        """Assemble the regroup rule from the checked merge options.

        Rules are joined with ``_`` in the order: merge-by-gap, split by
        punctuation, split by length, merge-by-gap again. The merge rule
        carries a ``++<max length>+1`` suffix when splitting by max length is
        also checked.
        """
        rules = []
        merge_rule = None

        if self.merge_by_gap.isChecked():
            merge_rule = f'mg={self.merge_by_gap_input.text()}'
            if self.split_by_max_length.isChecked():
                merge_rule += f'++{self.split_by_max_length_input.text()}+1'
            rules.append(merge_rule)

        if self.split_by_punctuation.isChecked():
            rules.append(f'sp={self.split_by_punctuation_input.text()}')

        if self.split_by_max_length.isChecked():
            rules.append(f'sl={self.split_by_max_length_input.text()}')

        # The merge rule is applied a second time after the split rules.
        if merge_rule is not None:
            rules.append(merge_rule)

        return '_'.join(rules)

    def on_merge_button_clicked(self):
        """Copy the transcription and regroup the copy on a worker thread."""
        source_id = self.transcription.id_as_uuid
        self.new_transcript_id = self.transcription_service.copy_transcription(source_id)
        self.transcription_service.update_transcription_progress(self.new_transcript_id, 0.0)

        if self.transcriptions_updated_signal:
            self.transcriptions_updated_signal.emit(self.new_transcript_id)

        # The environment variable, when set, replaces the rule built from the UI.
        regroup_string = os.getenv("BUZZ_MERGE_REGROUP_RULE", self._build_regroup_string())

        self.hide()

        worker_thread = QThread()
        worker = TranscriptionWorker(
            self.transcription,
            self.transcription_options,
            self.transcription_service,
            regroup_string
        )
        self.thread = worker_thread
        self.worker = worker

        worker.moveToThread(worker_thread)
        worker_thread.started.connect(worker.run)
        worker.finished.connect(worker_thread.quit)
        worker.finished.connect(worker.deleteLater)
        worker_thread.finished.connect(worker_thread.deleteLater)
        worker.result_ready.connect(self.on_transcription_completed)

        worker_thread.start()

    def on_transcription_completed(self, segments):
        """Store the regrouped segments on the copied transcription and close."""
        if self.new_transcript_id is not None:
            self.transcription_service.update_transcription_as_completed(self.new_transcript_id, segments)

            if self.transcriptions_updated_signal:
                self.transcriptions_updated_signal.emit(self.new_transcript_id)

        self.close()

    def closeEvent(self, event):
        """Hide the window before delegating to the base close handling."""
        self.hide()
        super().closeEvent(event)

View file

@ -26,6 +26,7 @@ from buzz.widgets.icon import (
FileDownloadIcon,
TranslateIcon,
ResizeIcon,
SpeakerIdentificationIcon,
)
from buzz.translator import Translator
from buzz.widgets.text_display_box import TextDisplayBox
@ -46,6 +47,7 @@ from buzz.widgets.transcription_viewer.transcription_view_mode_tool_button impor
ViewMode
)
from buzz.widgets.transcription_viewer.transcription_resizer_widget import TranscriptionResizerWidget
from buzz.widgets.transcription_viewer.speaker_identification_widget import SpeakerIdentificationWidget
class TranscriptionViewerWidget(QWidget):
@ -72,6 +74,7 @@ class TranscriptionViewerWidget(QWidget):
self.setWindowTitle(file_path_as_title(transcription.file))
self.transcription_resizer_dialog = None
self.speaker_identification_dialog = None
self.transcriptions_updated_signal = transcriptions_updated_signal
self.translation_thread = None
@ -191,6 +194,17 @@ class TranscriptionViewerWidget(QWidget):
toolbar.addWidget(resize_button)
speaker_identification_button = QToolButton()
speaker_identification_button.setText(_("Identify Speakers"))
speaker_identification_button.setObjectName("speaker_identification_button")
speaker_identification_button.setIcon(SpeakerIdentificationIcon(self))
speaker_identification_button.setToolButtonStyle(
Qt.ToolButtonStyle.ToolButtonTextBesideIcon
)
speaker_identification_button.clicked.connect(self.on_speaker_identification_button_clicked)
toolbar.addWidget(speaker_identification_button)
layout.setMenuBar(toolbar)
layout.addWidget(self.table_widget)
@ -314,12 +328,26 @@ class TranscriptionViewerWidget(QWidget):
self.transcription_resizer_dialog.show()
def on_speaker_identification_button_clicked(self):
    """Create and show the speaker identification dialog for this transcription.

    The dialog receives this viewer's transcription, transcription service
    and transcriptions-updated signal. When that signal is available it is
    also connected to this viewer's ``close``.
    """
    self.speaker_identification_dialog = SpeakerIdentificationWidget(
        transcription=self.transcription,
        transcription_service=self.transcription_service,
        transcriptions_updated_signal=self.transcriptions_updated_signal,
    )

    # SpeakerIdentificationWidget accepts ``None`` for the signal, so avoid
    # calling ``.connect`` on a missing signal here as well.
    if self.transcriptions_updated_signal is not None:
        self.transcriptions_updated_signal.connect(self.close)

    self.speaker_identification_dialog.show()
def closeEvent(self, event):
self.hide()
if self.transcription_resizer_dialog:
self.transcription_resizer_dialog.close()
if self.speaker_identification_dialog:
self.speaker_identification_dialog.close()
self.translator.stop()
self.translation_thread.quit()
self.translation_thread.wait()

4444
poetry.lock generated

File diff suppressed because it is too large Load diff

View file

@ -9,7 +9,7 @@ readme = "README.md"
license = { text = "MIT" }
repository = "https://github.com/chidiwilliams/buzz"
documentation = "https://chidiwilliams.github.io/buzz/docs"
requires-python = ">=3.9,<3.13"
requires-python = ">=3.10,<3.13"
dynamic = [ "dependencies" ]
[project.scripts]
@ -33,7 +33,7 @@ name = "PyPI"
priority = "primary"
[tool.poetry.dependencies]
python = ">=3.9,<3.13"
python = ">=3.10,<3.13"
sounddevice = "^0.4.5"
humanize = "^4.4.0"
PyQt6 = "6.8.1"
@ -49,8 +49,7 @@ yt-dlp = "^2025.2.19"
stable-ts = "^2.18.3"
faster-whisper = "^1.1.1"
openai-whisper = "^20240930"
# transformers 4.50.0 has some bug
transformers = "4.49.0"
transformers = "4.48.3"
accelerate = "^1.0.1"
polib = "^1.2.0"
srt-equalizer = "^0.1.10"
@ -80,6 +79,9 @@ soundfile = "^0.13.1"
urllib3 = "^2.3.0"
demucs = {url = "https://github.com/raivisdejus/demucs/releases/download/4.1.0a3/demucs-4.1.0a3-py3-none-any.whl"}
posthog = "^3.23.0"
deepmultilingualpunctuation = {git = "https://github.com/oliverguhr/deepmultilingualpunctuation.git"}
ctc-forced-aligner = {git = "https://github.com/MahmoudAshraf97/ctc-forced-aligner.git"}
nemo-toolkit = {extras = ["asr"], version = "^2.2.1"}
[tool.poetry.group.dev.dependencies]
autopep8 = "^1.7.0"