mirror of
https://github.com/chidiwilliams/buzz.git
synced 2026-03-15 07:05:48 +01:00
Initial dependencies
This commit is contained in:
parent
806546282d
commit
840fca9d4f
6 changed files with 4610 additions and 241 deletions
14
buzz/assets/speaker-identification.svg
Normal file
14
buzz/assets/speaker-identification.svg
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
<?xml version="1.0" encoding="iso-8859-1"?>
|
||||
<svg height="800px" width="800px" version="1.1" id="Capa_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"
|
||||
viewBox="0 0 493.347 493.347" xml:space="preserve">
|
||||
<g>
|
||||
<path style="fill:#010002;" d="M191.936,385.946c-14.452,0-29.029-1.36-43.319-4.04l-5.299-0.996l-66.745,37.15v-63.207
|
||||
l-6.629-4.427C25.496,320.716,0,277.045,0,230.617c0-85.648,86.102-155.33,191.936-155.33c17.077,0,33.623,1.838,49.394,5.239
|
||||
c-50.486,27.298-84.008,74.801-84.008,128.765c0,72.969,61.25,134.147,142.942,149.464
|
||||
C269.41,375.892,232.099,385.946,191.936,385.946z"/>
|
||||
<path style="fill:#010002;" d="M437.777,304.278l-6.629,4.427v48.075l-50.933-28.343l-0.125,0.024l-5.167,0.967
|
||||
c-11.444,2.142-23.104,3.228-34.673,3.228c-1.241,0-2.47-0.054-3.705-0.078c-82.707-1.599-149.387-56.268-149.387-123.287
|
||||
c0-52.109,40.324-96.741,97.129-114.791c14.47-4.594,30.001-7.471,46.219-8.3c3.228-0.167,6.468-0.274,9.75-0.274
|
||||
c84.413,0,153.092,55.343,153.092,123.365C493.347,246.053,473.089,280.679,437.777,304.278z"/>
|
||||
</g>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 1 KiB |
|
|
@ -82,6 +82,10 @@ class ResizeIcon(Icon):
|
|||
def __init__(self, parent: QWidget):
    """Initialise the icon from the ``assets/resize_black.svg`` path.

    :param parent: widget forwarded to the ``Icon`` base constructor.
    """
    svg_path = get_path("assets/resize_black.svg")
    super().__init__(svg_path, parent)
|
||||
|
||||
class SpeakerIdentificationIcon(Icon):
    """Icon backed by ``assets/speaker-identification.svg``."""

    def __init__(self, parent: QWidget):
        """Resolve the SVG path with ``get_path`` and hand it to ``Icon``.

        :param parent: widget forwarded to the ``Icon`` base constructor.
        """
        svg_path = get_path("assets/speaker-identification.svg")
        super().__init__(svg_path, parent)
|
||||
|
||||
class VisibilityIcon(Icon):
|
||||
def __init__(self, parent: QWidget):
|
||||
super().__init__(
|
||||
|
|
|
|||
|
|
@ -0,0 +1,329 @@
|
|||
import re
|
||||
import os
|
||||
import logging
|
||||
import stable_whisper
|
||||
import srt
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from PyQt6.QtCore import Qt, QThread, QObject, pyqtSignal
|
||||
from PyQt6.QtGui import QFont
|
||||
from PyQt6.QtWidgets import (
|
||||
QWidget,
|
||||
QFormLayout,
|
||||
QVBoxLayout,
|
||||
QHBoxLayout,
|
||||
QLabel,
|
||||
QSpinBox,
|
||||
QPushButton,
|
||||
QCheckBox,
|
||||
QGroupBox,
|
||||
QSpacerItem,
|
||||
QSizePolicy,
|
||||
)
|
||||
from buzz.locale import _, languages
|
||||
from buzz.db.entity.transcription import Transcription
|
||||
from buzz.db.service.transcription_service import TranscriptionService
|
||||
from buzz.paths import file_path_as_title
|
||||
from buzz.settings.settings import Settings
|
||||
from buzz.widgets.line_edit import LineEdit
|
||||
from buzz.transcriber.transcriber import Segment
|
||||
from buzz.widgets.preferences_dialog.models.file_transcription_preferences import (
|
||||
FileTranscriptionPreferences,
|
||||
)
|
||||
|
||||
|
||||
# Pattern used with ``.match()`` to decide whether a segment closes a sentence.
# Because of the leading ``.*`` it succeeds when any of the listed punctuation
# marks occurs anywhere in the text (up to the first newline), not only at the
# very end.
# NOTE(review): ``!`` and ``?`` are listed twice in the character class; the
# second pair was presumably the full-width forms before this text was
# normalised - confirm against the upstream file.
SENTENCE_END = re.compile(r'.*[.!?。!?]')
|
||||
|
||||
class TranscriptionWorker(QObject):
    """Worker object that regroups a stored transcription via stable-ts.

    ``run`` hands ``get_transcript`` to ``stable_whisper.transcribe_any`` as
    the inference callable, converts the regrouped result into ``Segment``
    objects and publishes them through ``result_ready``. ``finished`` is
    emitted when ``run`` is done, whether it succeeded or raised.
    """

    finished = pyqtSignal()
    result_ready = pyqtSignal(list)

    def __init__(self, transcription, transcription_options, transcription_service, regroup_string: str):
        """Store the collaborators used later by ``run``.

        :param transcription: transcription record; ``id_as_uuid``,
            ``language`` and ``file`` are read from it.
        :param transcription_options: options object; only ``extract_speech``
            is read here.
        :param transcription_service: service used to fetch stored segments.
        :param regroup_string: value passed as ``regroup=`` to
            ``stable_whisper.transcribe_any``.
        """
        super().__init__()
        self.transcription = transcription
        self.transcription_options = transcription_options
        self.transcription_service = transcription_service
        self.regroup_string = regroup_string

    def get_transcript(self, audio, **kwargs) -> dict:
        """Build a transcript dict from the stored segments.

        Every stored segment becomes one entry in a ``words`` list. Words are
        grouped into a segment each time ``SENTENCE_END`` matches the stored
        text; words left over after the last match are emitted as a final
        segment so that trailing text without closing punctuation is kept.

        :param audio: accepted for the ``transcribe_any`` callable signature;
            not used.
        :param kwargs: accepted and ignored.
        :return: ``{'language': ..., 'segments': [{'text', 'words'}, ...]}``.
        """
        buzz_segments = self.transcription_service.get_transcription_segments(
            transcription_id=self.transcription.id_as_uuid
        )

        segments = []
        words = []
        text = ""
        for buzz_segment in buzz_segments:
            # NOTE(review): stored times are divided by 100 here and multiplied
            # by 100 again in run(); confirm the stored unit against the model.
            words.append({
                'word': buzz_segment.text + " ",
                'start': buzz_segment.start_time / 100,
                'end': buzz_segment.end_time / 100,
            })
            text += buzz_segment.text + " "

            if SENTENCE_END.match(buzz_segment.text):
                segments.append({
                    'text': text,
                    'words': words
                })
                words = []
                text = ""

        # Flush words that were never closed by sentence-ending punctuation;
        # without this the tail of the transcript would be silently dropped.
        if words:
            segments.append({
                'text': text,
                'words': words
            })

        return {
            'language': self.transcription.language,
            'segments': segments
        }

    def run(self):
        """Regroup the transcription and emit the resulting segments.

        Emits ``result_ready`` with a list of ``Segment`` objects on success.
        ``finished`` is emitted in all cases so that anything connected to it
        (such as a thread's ``quit``) still runs when regrouping raises; the
        exception itself is not swallowed.
        """
        try:
            transcription_file = self.transcription.file
            transcription_file_exists = os.path.exists(transcription_file)

            # Prefer the "<stem>_speech.mp3" sibling when speech extraction is
            # enabled and that file is present on disk.
            transcription_file_path = Path(transcription_file)
            speech_path = transcription_file_path.with_name(f"{transcription_file_path.stem}_speech.mp3")
            if self.transcription_options.extract_speech and os.path.exists(speech_path):
                transcription_file = str(speech_path)
                transcription_file_exists = True

            # VAD and silence suppression are only requested when an audio
            # file was found on disk.
            result = stable_whisper.transcribe_any(
                self.get_transcript,
                transcription_file,
                vad=transcription_file_exists,
                suppress_silence=transcription_file_exists,
                regroup=self.regroup_string,
                check_sorted=False,
            )

            segments = [
                Segment(
                    start=int(segment.start * 100),
                    end=int(segment.end * 100),
                    text=segment.text
                )
                for segment in result.segments
            ]

            self.result_ready.emit(segments)
        finally:
            self.finished.emit()
|
||||
|
||||
|
||||
class SpeakerIdentificationWidget(QWidget):
    """Window with a resize group and a merge group for one transcription.

    The resize group is enabled when ``transcription.word_level_timings`` is
    not ``1``; the merge group is enabled when it is ``1``. Pressing "Merge"
    copies the transcription, regroups the copy with a ``TranscriptionWorker``
    running on a ``QThread`` and stores the resulting segments on the copy.
    """

    resize_button_clicked = pyqtSignal()
    transcription: Transcription
    settings = Settings()

    def __init__(
        self,
        transcription: Transcription,
        transcription_service: TranscriptionService,
        parent: Optional["QWidget"] = None,
        flags: Qt.WindowType = Qt.WindowType.Widget,
        transcriptions_updated_signal: Optional[pyqtSignal] = None,
    ) -> None:
        """Build the form.

        :param transcription: transcription the widget operates on.
        :param transcription_service: service used to read, copy and update
            transcriptions.
        :param parent: optional parent widget.
        :param flags: window flags forwarded to ``QWidget``.
        :param transcriptions_updated_signal: optional signal emitted with the
            id of the copied transcription after it is created and again after
            it is completed.
        """
        super().__init__(parent, flags)
        self.transcription = transcription
        self.transcription_service = transcription_service
        self.transcriptions_updated_signal = transcriptions_updated_signal

        # Populated by on_merge_button_clicked.
        self.new_transcript_id = None
        self.thread = None
        self.worker = None

        self.setMinimumWidth(600)
        self.setMinimumHeight(300)
        self.setWindowTitle(file_path_as_title(transcription.file))

        stored_preferences = self.load_preferences()
        option_pair = stored_preferences.to_transcription_options(
            openai_access_token=''
        )
        self.transcription_options, self.file_transcription_options = option_pair

        form = QFormLayout(self)

        # Resize longer subtitles
        resize_heading = QLabel(_("Resize Options"), self)
        self._embolden(resize_heading)
        form.addRow(resize_heading)

        resize_box = QGroupBox(self)
        resize_box_layout = QVBoxLayout(resize_box)

        self.resize_row = QHBoxLayout()
        self.desired_subtitle_length_label = QLabel(_("Desired subtitle length"), self)

        self.target_chars_spin_box = QSpinBox(self)
        self.target_chars_spin_box.setMinimum(1)
        self.target_chars_spin_box.setMaximum(100)
        self.target_chars_spin_box.setValue(42)

        self.resize_button = QPushButton(_("Resize"))
        self.resize_button.clicked.connect(self.on_resize_button_clicked)

        for resize_widget in (
            self.desired_subtitle_length_label,
            self.target_chars_spin_box,
            self.resize_button,
        ):
            self.resize_row.addWidget(resize_widget)

        resize_box_layout.addLayout(self.resize_row)
        resize_box.setEnabled(self.transcription.word_level_timings != 1)
        form.addRow(resize_box)

        # Spacer
        form.addItem(
            QSpacerItem(0, 10, QSizePolicy.Policy.Minimum, QSizePolicy.Policy.Fixed)
        )

        # Merge words into subtitles
        merge_heading = QLabel(_("Merge Options"), self)
        self._embolden(merge_heading)
        form.addRow(merge_heading)

        merge_box = QGroupBox(self)
        merge_box_layout = QVBoxLayout(merge_box)

        self.merge_options_row = QVBoxLayout()

        self.merge_by_gap = self._checked_box(_("Merge by gap"))
        self.merge_by_gap_input = LineEdit("0.2", self)
        gap_row = self._pair_row(self.merge_by_gap, self.merge_by_gap_input)

        self.split_by_punctuation = self._checked_box(_("Split by punctuation"))
        self.split_by_punctuation_input = LineEdit(".* /./. /。/?/? /?/!/! /!/,/, ", self)
        punctuation_row = self._pair_row(
            self.split_by_punctuation, self.split_by_punctuation_input
        )

        self.split_by_max_length = self._checked_box(_("Split by max length"))
        self.split_by_max_length_input = LineEdit("42", self)
        max_length_row = self._pair_row(
            self.split_by_max_length, self.split_by_max_length_input
        )

        for option_row in (gap_row, punctuation_row, max_length_row):
            self.merge_options_row.addLayout(option_row)

        self.merge_button = QPushButton(_("Merge"))
        self.merge_button.clicked.connect(self.on_merge_button_clicked)
        self.merge_options_row.addWidget(self.merge_button)

        merge_box_layout.addLayout(self.merge_options_row)
        merge_box.setEnabled(self.transcription.word_level_timings == 1)
        form.addRow(merge_box)

        self.setLayout(form)

    @staticmethod
    def _embolden(label: QLabel) -> None:
        """Give ``label`` a bold version of its current font."""
        bold_font = label.font()
        bold_font.setWeight(QFont.Weight.Bold)
        label.setFont(bold_font)

    @staticmethod
    def _checked_box(caption: str) -> QCheckBox:
        """Return a pre-checked check box with a minimum width of 250."""
        box = QCheckBox(caption)
        box.setChecked(True)
        box.setMinimumWidth(250)
        return box

    @staticmethod
    def _pair_row(check_box: QCheckBox, editor: QWidget) -> QHBoxLayout:
        """Return a horizontal layout holding ``check_box`` then ``editor``."""
        row = QHBoxLayout()
        row.addWidget(check_box)
        row.addWidget(editor)
        return row

    def _build_regroup_string(self) -> str:
        """Compose the regroup rule from the merge option controls.

        Rules are joined with ``_`` in this order: merge-by-gap,
        split-by-punctuation, split-by-max-length, merge-by-gap again. When
        both merge-by-gap and split-by-max-length are checked, the
        merge-by-gap rule carries a ``++<max length>+1`` suffix.
        """
        merge_checked = self.merge_by_gap.isChecked()
        length_checked = self.split_by_max_length.isChecked()

        rules = []

        def gap_rule() -> str:
            rule = f'mg={self.merge_by_gap_input.text()}'
            if length_checked:
                rule += f'++{self.split_by_max_length_input.text()}+1'
            return rule

        if merge_checked:
            rules.append(gap_rule())
        if self.split_by_punctuation.isChecked():
            rules.append(f'sp={self.split_by_punctuation_input.text()}')
        if length_checked:
            rules.append(f'sl={self.split_by_max_length_input.text()}')
        if merge_checked:
            rules.append(gap_rule())

        return '_'.join(rules)

    def load_preferences(self):
        """Load ``FileTranscriptionPreferences`` from the ``file_transcriber`` settings group."""
        store = self.settings.settings
        store.beginGroup("file_transcriber")
        loaded = FileTranscriptionPreferences.load(settings=store)
        store.endGroup()
        return loaded

    # TODO rename function
    def on_resize_button_clicked(self):
        """Handle the "Resize" button.

        Currently only wraps the stored segments in ``srt.Subtitle`` objects
        and writes a debug log line; nothing is done with the subtitles yet.
        """
        # Step 1 - Get the segments
        stored_segments = self.transcription_service.get_transcription_segments(
            transcription_id=self.transcription.id_as_uuid
        )

        subs = [
            srt.Subtitle(
                index=item.id,
                start=item.start_time,
                end=item.end_time,
                content=item.text
            )
            for item in stored_segments
        ]

        # Step 2 - ...
        logging.debug("=== Will identify speakers ===")

    def on_merge_button_clicked(self):
        """Copy the transcription and regroup the copy on a worker thread.

        The regroup rule built from the controls can be overridden with the
        ``BUZZ_MERGE_REGROUP_RULE`` environment variable. The widget is hidden
        while the worker runs.
        """
        service = self.transcription_service
        self.new_transcript_id = service.copy_transcription(
            self.transcription.id_as_uuid
        )
        service.update_transcription_progress(self.new_transcript_id, 0.0)

        updated_signal = self.transcriptions_updated_signal
        if updated_signal:
            updated_signal.emit(self.new_transcript_id)

        regroup_rule = os.getenv("BUZZ_MERGE_REGROUP_RULE", self._build_regroup_string())

        self.hide()

        self.thread = QThread()
        self.worker = TranscriptionWorker(
            self.transcription,
            self.transcription_options,
            self.transcription_service,
            regroup_rule
        )
        worker, thread = self.worker, self.thread
        worker.moveToThread(thread)

        # Connection order is kept as-is: start the work, then tear down the
        # worker and thread once the worker reports it has finished.
        thread.started.connect(worker.run)
        worker.finished.connect(thread.quit)
        worker.finished.connect(worker.deleteLater)
        thread.finished.connect(thread.deleteLater)
        worker.result_ready.connect(self.on_transcription_completed)

        thread.start()

    def on_transcription_completed(self, segments):
        """Store ``segments`` on the copied transcription, notify, then close."""
        transcript_id = self.new_transcript_id
        if transcript_id is not None:
            self.transcription_service.update_transcription_as_completed(transcript_id, segments)

            updated_signal = self.transcriptions_updated_signal
            if updated_signal:
                updated_signal.emit(transcript_id)

        self.close()

    def closeEvent(self, event):
        """Hide the widget before delegating to the default close handling."""
        self.hide()
        super().closeEvent(event)
|
||||
|
|
@ -26,6 +26,7 @@ from buzz.widgets.icon import (
|
|||
FileDownloadIcon,
|
||||
TranslateIcon,
|
||||
ResizeIcon,
|
||||
SpeakerIdentificationIcon,
|
||||
)
|
||||
from buzz.translator import Translator
|
||||
from buzz.widgets.text_display_box import TextDisplayBox
|
||||
|
|
@ -46,6 +47,7 @@ from buzz.widgets.transcription_viewer.transcription_view_mode_tool_button impor
|
|||
ViewMode
|
||||
)
|
||||
from buzz.widgets.transcription_viewer.transcription_resizer_widget import TranscriptionResizerWidget
|
||||
from buzz.widgets.transcription_viewer.speaker_identification_widget import SpeakerIdentificationWidget
|
||||
|
||||
|
||||
class TranscriptionViewerWidget(QWidget):
|
||||
|
|
@ -72,6 +74,7 @@ class TranscriptionViewerWidget(QWidget):
|
|||
self.setWindowTitle(file_path_as_title(transcription.file))
|
||||
|
||||
self.transcription_resizer_dialog = None
|
||||
self.speaker_identification_dialog = None
|
||||
self.transcriptions_updated_signal = transcriptions_updated_signal
|
||||
|
||||
self.translation_thread = None
|
||||
|
|
@ -191,6 +194,17 @@ class TranscriptionViewerWidget(QWidget):
|
|||
|
||||
toolbar.addWidget(resize_button)
|
||||
|
||||
speaker_identification_button = QToolButton()
|
||||
speaker_identification_button.setText(_("Identify Speakers"))
|
||||
speaker_identification_button.setObjectName("speaker_identification_button")
|
||||
speaker_identification_button.setIcon(SpeakerIdentificationIcon(self))
|
||||
speaker_identification_button.setToolButtonStyle(
|
||||
Qt.ToolButtonStyle.ToolButtonTextBesideIcon
|
||||
)
|
||||
speaker_identification_button.clicked.connect(self.on_speaker_identification_button_clicked)
|
||||
|
||||
toolbar.addWidget(speaker_identification_button)
|
||||
|
||||
layout.setMenuBar(toolbar)
|
||||
|
||||
layout.addWidget(self.table_widget)
|
||||
|
|
@ -314,12 +328,26 @@ class TranscriptionViewerWidget(QWidget):
|
|||
|
||||
self.transcription_resizer_dialog.show()
|
||||
|
||||
def on_speaker_identification_button_clicked(self):
    """Open a ``SpeakerIdentificationWidget`` for the current transcription.

    A new dialog is created on every click and stored in
    ``self.speaker_identification_dialog``. When a transcriptions-updated
    signal is available it is connected to ``self.close`` so this viewer
    closes once the dialog reports an update.
    """
    self.speaker_identification_dialog = SpeakerIdentificationWidget(
        transcription=self.transcription,
        transcription_service=self.transcription_service,
        transcriptions_updated_signal=self.transcriptions_updated_signal,
    )

    # SpeakerIdentificationWidget accepts ``None`` for this signal and guards
    # every use of it; do the same here so a missing signal does not raise
    # AttributeError before the dialog is shown.
    if self.transcriptions_updated_signal:
        self.transcriptions_updated_signal.connect(self.close)

    self.speaker_identification_dialog.show()
|
||||
|
||||
def closeEvent(self, event):
|
||||
self.hide()
|
||||
|
||||
if self.transcription_resizer_dialog:
|
||||
self.transcription_resizer_dialog.close()
|
||||
|
||||
if self.speaker_identification_dialog:
|
||||
self.speaker_identification_dialog.close()
|
||||
|
||||
self.translator.stop()
|
||||
self.translation_thread.quit()
|
||||
self.translation_thread.wait()
|
||||
|
|
|
|||
4444
poetry.lock
generated
4444
poetry.lock
generated
File diff suppressed because it is too large
Load diff
|
|
@ -9,7 +9,7 @@ readme = "README.md"
|
|||
license = { text = "MIT" }
|
||||
repository = "https://github.com/chidiwilliams/buzz"
|
||||
documentation = "https://chidiwilliams.github.io/buzz/docs"
|
||||
requires-python = ">=3.9,<3.13"
|
||||
requires-python = ">=3.10,<3.13"
|
||||
dynamic = [ "dependencies" ]
|
||||
|
||||
[project.scripts]
|
||||
|
|
@ -33,7 +33,7 @@ name = "PyPI"
|
|||
priority = "primary"
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = ">=3.9,<3.13"
|
||||
python = ">=3.10,<3.13"
|
||||
sounddevice = "^0.4.5"
|
||||
humanize = "^4.4.0"
|
||||
PyQt6 = "6.8.1"
|
||||
|
|
@ -49,8 +49,7 @@ yt-dlp = "^2025.2.19"
|
|||
stable-ts = "^2.18.3"
|
||||
faster-whisper = "^1.1.1"
|
||||
openai-whisper = "^20240930"
|
||||
# transformers 4.50.0 has some bug
|
||||
transformers = "4.49.0"
|
||||
transformers = "4.48.3"
|
||||
accelerate = "^1.0.1"
|
||||
polib = "^1.2.0"
|
||||
srt-equalizer = "^0.1.10"
|
||||
|
|
@ -80,6 +79,9 @@ soundfile = "^0.13.1"
|
|||
urllib3 = "^2.3.0"
|
||||
demucs = {url = "https://github.com/raivisdejus/demucs/releases/download/4.1.0a3/demucs-4.1.0a3-py3-none-any.whl"}
|
||||
posthog = "^3.23.0"
|
||||
deepmultilingualpunctuation = {git = "https://github.com/oliverguhr/deepmultilingualpunctuation.git"}
|
||||
ctc-forced-aligner = {git = "https://github.com/MahmoudAshraf97/ctc-forced-aligner.git"}
|
||||
nemo-toolkit = {extras = ["asr"], version = "^2.2.1"}
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
autopep8 = "^1.7.0"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue