This commit is contained in:
Chidi Williams 2023-04-29 21:23:20 +00:00 committed by GitHub
parent 66bd9a1834
commit 32af05da30
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 207 additions and 23 deletions

View file

@ -7,4 +7,4 @@ omit =
directory = coverage/html
[report]
fail_under = 76
fail_under = 74

View file

@ -20,11 +20,13 @@ OpenAI's [Whisper](https://github.com/openai/whisper).
- Import audio and video files and export transcripts to TXT, SRT, and
VTT ([Demo](https://www.loom.com/share/cf263b099ac3481082bb56d19b7c87fe))
- Transcription and translation from your computer's microphones to text (Resource-intensive and may not be real-time, [Demo](https://www.loom.com/share/564b753eb4d44b55b985b8abd26b55f7))
- Transcription and translation from your computer's microphones to text (Resource-intensive and may not be
real-time, [Demo](https://www.loom.com/share/564b753eb4d44b55b985b8abd26b55f7))
- Supports [Whisper](https://github.com/openai/whisper#available-models-and-languages),
[Whisper.cpp](https://github.com/ggerganov/whisper.cpp), [Faster Whisper](https://github.com/guillaumekln/faster-whisper),
[Whisper-compatible Hugging Face models](https://huggingface.co/models?other=whisper), and
the [OpenAI Whisper API](https://platform.openai.com/docs/api-reference/introduction)
- [Command-Line Interface](#command-line-interface)
- Available on Mac, Windows, and Linux
## Installation
@ -129,6 +131,24 @@ and [Virtual Audio Cable](https://vac.muzychenko.net/en/)).
6. Open Buzz, select BlackHole as your microphone, and record as before to see transcriptions from the audio playing
through BlackHole.
## Command-Line Interface
### `add`
Start a new transcription task
Examples:
```shell
# Translate two MP3 files from French to English using OpenAI Whisper API
buzz add --task translate --language fr --model-type openaiapi /Users/user/Downloads/1b3b03e4-8db5-ea2c-ace5-b71ff32e3304.mp3 /Users/user/Downloads/koaf9083k1lkpsfdi0.mp3
# Transcribe an MP4 using Whisper.cpp "small" model and immediately export to SRT and VTT files
buzz add --task transcribe --model-type whispercpp --model-size small --prompt "My initial prompt" --srt --vtt /Users/user/Downloads/buzz/1b3b03e4-8db5-ea2c-ace5-b71ff32e3304.mp4
```
Run `buzz add --help` to see all available options.
## Build
To build/run Buzz locally from source, first install the requirements:

157
buzz/cli.py Normal file
View file

@ -0,0 +1,157 @@
import enum
import sys
import typing
from PyQt6.QtCore import QCommandLineParser, QCommandLineOption
from buzz.gui import Application
from buzz.model_loader import ModelType, WhisperModelSize, TranscriptionModel, get_local_model_path
from buzz.store.keyring_store import KeyringStore
from buzz.transcriber import Task, FileTranscriptionTask, FileTranscriptionOptions, TranscriptionOptions, LANGUAGES, \
OutputFormat
class CommandLineError(Exception):
    """Raised when the command-line arguments are invalid or incomplete.

    The message is intended to be shown to the user on stderr, followed
    by the parser's help text.
    """

    def __init__(self, message: str):
        super().__init__(message)
class CommandLineModelType(enum.Enum):
    """Model types accepted by the --model-type CLI option.

    Values are the strings typed on the command line. Member names mirror
    buzz.model_loader.ModelType, so a parsed member is converted with
    ModelType[member.name] in parse().
    """
    WHISPER = 'whisper'
    WHISPER_CPP = 'whispercpp'
    HUGGING_FACE = 'huggingface'
    FASTER_WHISPER = 'fasterwhisper'
    OPEN_AI_WHISPER_API = 'openaiapi'
def parse_command_line(app: Application):
    """Parse CLI arguments for *app*, printing help and exiting on error.

    Thin wrapper around parse(): any CommandLineError it raises is
    reported on stderr together with the generated help text, and the
    process exits with status 1.
    """
    parser = QCommandLineParser()
    try:
        parse(app, parser)
    except CommandLineError as error:
        print(f'Error: {str(error)}\n', file=sys.stderr)
        print(parser.helpText())
        sys.exit(1)
def parse(app: Application, parser: QCommandLineParser):
    """Parse the application's command-line arguments.

    With no positional arguments, only the standard help/version options
    are processed and the function returns so the GUI starts normally.
    The "add" command validates its options and queues one
    FileTranscriptionTask per input file on the application.

    Raises:
        CommandLineError: if the command is unknown, no input files are
            given, an option value is invalid, the model is not available
            locally, or no OpenAI access token can be found when required.
    """
    parser.addPositionalArgument('<command>', 'One of the following commands:\n- add')

    parser.parse(app.arguments())

    args = parser.positionalArguments()
    if len(args) == 0:
        # No command given: handle the default options (help/version) and
        # let the GUI start as usual.
        parser.addHelpOption()
        parser.addVersionOption()
        parser.process(app)
        return

    command = args[0]
    if command != "add":
        # Fix: an unrecognized command used to be silently ignored, which
        # started the GUI as if no arguments had been passed. Fail loudly
        # instead so typos are reported to the user.
        raise CommandLineError(f'Unknown command "{command}"')

    parser.clearPositionalArguments()
    parser.addPositionalArgument('files', 'Input file paths', '[file file file...]')

    task_option = QCommandLineOption(['t', 'task'],
                                     f'The task to perform. Allowed: {join_values(Task)}. Default: {Task.TRANSCRIBE.value}.',
                                     'task',
                                     Task.TRANSCRIBE.value)
    model_type_option = QCommandLineOption(['m', 'model-type'],
                                           f'Model type. Allowed: {join_values(CommandLineModelType)}. Default: {CommandLineModelType.WHISPER.value}.',
                                           'model-type',
                                           CommandLineModelType.WHISPER.value)
    model_size_option = QCommandLineOption(['s', 'model-size'],
                                           f'Model size. Use only when --model-type is whisper, whispercpp, or fasterwhisper. Allowed: {join_values(WhisperModelSize)}. Default: {WhisperModelSize.TINY.value}.',
                                           'model-size', WhisperModelSize.TINY.value)
    hugging_face_model_id_option = QCommandLineOption(['hfid'],
                                                      f'Hugging Face model ID. Use only when --model-type is huggingface. Example: "openai/whisper-tiny"',
                                                      'id')
    language_option = QCommandLineOption(['l', 'language'],
                                         f'Language code. Allowed: {", ".join(sorted([k + " (" + LANGUAGES[k].title() + ")" for k in LANGUAGES]))}. Leave empty to detect language.',
                                         'code', '')
    initial_prompt_option = QCommandLineOption(['p', 'prompt'], f'Initial prompt', 'prompt', '')
    open_ai_access_token_option = QCommandLineOption('openai-token',
                                                     f'OpenAI access token. Use only when --model-type is {CommandLineModelType.OPEN_AI_WHISPER_API.value}. Defaults to your previously saved access token, if one exists.',
                                                     'token')
    srt_option = QCommandLineOption(['srt'], 'Output result in an SRT file.')
    vtt_option = QCommandLineOption(['vtt'], 'Output result in a VTT file.')
    txt_option = QCommandLineOption('txt', 'Output result in a TXT file.')

    parser.addOptions(
        [task_option, model_type_option, model_size_option, hugging_face_model_id_option, language_option,
         initial_prompt_option, open_ai_access_token_option, srt_option, vtt_option, txt_option])
    parser.addHelpOption()
    parser.addVersionOption()
    parser.process(app)

    # Slice off the first positional argument, which is the command itself.
    file_paths = parser.positionalArguments()[1:]
    if len(file_paths) == 0:
        raise CommandLineError('No input files')

    task = parse_enum_option(task_option, parser, Task)

    model_type = parse_enum_option(model_type_option, parser, CommandLineModelType)
    model_size = parse_enum_option(model_size_option, parser, WhisperModelSize)
    hugging_face_model_id = parser.value(hugging_face_model_id_option)
    if hugging_face_model_id == '' and model_type == CommandLineModelType.HUGGING_FACE:
        raise CommandLineError('--hfid is required when --model-type is huggingface')

    # CommandLineModelType member names mirror ModelType, so the parsed
    # member maps across by name.
    model = TranscriptionModel(model_type=ModelType[model_type.name], whisper_model_size=model_size,
                               hugging_face_model_id=hugging_face_model_id)
    model_path = get_local_model_path(model)
    if model_path is None:
        # NOTE(review): assumes get_local_model_path returns a non-None
        # value for the OpenAI API model type as well -- confirm.
        raise CommandLineError('Model not found')

    language = parser.value(language_option)
    if language == '':
        language = None  # empty string means auto-detect
    elif LANGUAGES.get(language) is None:
        raise CommandLineError('Invalid language option')

    initial_prompt = parser.value(initial_prompt_option)

    output_formats: typing.Set[OutputFormat] = set()
    if parser.isSet(srt_option):
        output_formats.add(OutputFormat.SRT)
    if parser.isSet(vtt_option):
        output_formats.add(OutputFormat.VTT)
    if parser.isSet(txt_option):
        output_formats.add(OutputFormat.TXT)

    openai_access_token = parser.value(open_ai_access_token_option)
    if model.model_type == ModelType.OPEN_AI_WHISPER_API and openai_access_token == '':
        # Fall back to a previously saved token from the keyring.
        openai_access_token = KeyringStore().get_password(key=KeyringStore.Key.OPENAI_API_KEY)
        if openai_access_token == '':
            raise CommandLineError('No OpenAI access token found')

    transcription_options = TranscriptionOptions(model=model, task=task, language=language,
                                                 initial_prompt=initial_prompt,
                                                 openai_access_token=openai_access_token)
    file_transcription_options = FileTranscriptionOptions(file_paths=file_paths, output_formats=output_formats)

    for file_path in file_paths:
        transcription_task = FileTranscriptionTask(file_path=file_path, model_path=model_path,
                                                   transcription_options=transcription_options,
                                                   file_transcription_options=file_transcription_options)
        app.add_task(transcription_task)
T = typing.TypeVar("T", bound=enum.Enum)


def parse_enum_option(option: QCommandLineOption, parser: QCommandLineParser, enum_class: typing.Type[T]) -> T:
    """Return the member of *enum_class* whose value matches the parsed option.

    Raises CommandLineError when the command-line value is not a valid
    member value of the enum.
    """
    raw_value = parser.value(option)
    for member in enum_class:
        if member.value == raw_value:
            return member
    raise CommandLineError(f'Invalid value for --{option.names()[-1]} option.')
def join_values(enum_class: typing.Type[enum.Enum]) -> str:
    """Return the values of all members of *enum_class*, comma-separated."""
    return ', '.join(member.value for member in enum_class)

View file

@ -3,11 +3,9 @@ import json
import logging
import os
import sys
from datetime import datetime
from enum import auto
from typing import Dict, List, Optional, Tuple
import humanize
import sounddevice
from PyQt6 import QtGui
from PyQt6.QtCore import (QObject, Qt, QThread,
@ -18,10 +16,9 @@ from PyQt6.QtGui import (QAction, QCloseEvent, QDesktopServices, QIcon,
from PyQt6.QtNetwork import QNetworkAccessManager, QNetworkReply, QNetworkRequest
from PyQt6.QtWidgets import (QApplication, QCheckBox, QComboBox, QDialog,
QDialogButtonBox, QFileDialog, QLabel, QMainWindow, QMessageBox, QPlainTextEdit,
QProgressDialog, QPushButton, QVBoxLayout, QHBoxLayout, QWidget, QGroupBox, QTableWidget,
QPushButton, QVBoxLayout, QHBoxLayout, QWidget, QGroupBox, QTableWidget,
QMenuBar, QFormLayout, QTableWidgetItem,
QAbstractItemView, QListWidget, QListWidgetItem, QSizePolicy)
from whisper import tokenizer
from buzz.cache import TasksCache
from .__version__ import VERSION
@ -41,7 +38,7 @@ from .transcriber import (SUPPORTED_OUTPUT_FORMATS, FileTranscriptionOptions, Ou
Task,
TranscriptionOptions,
FileTranscriberQueueWorker, FileTranscriptionTask, RecordingTranscriber, LOADED_WHISPER_DLL,
DEFAULT_WHISPER_TEMPERATURE)
DEFAULT_WHISPER_TEMPERATURE, LANGUAGES)
from .widgets.line_edit import LineEdit
from .widgets.model_download_progress_dialog import ModelDownloadProgressDialog
from .widgets.model_type_combo_box import ModelTypeComboBox
@ -112,7 +109,7 @@ class LanguagesComboBox(QComboBox):
super().__init__(parent)
whisper_languages = sorted(
[(lang, tokenizer.LANGUAGES[lang].title()) for lang in tokenizer.LANGUAGES], key=lambda lang: lang[1])
[(lang, LANGUAGES[lang].title()) for lang in LANGUAGES], key=lambda lang: lang[1])
self.languages = [('', _('Detect Language'))] + whisper_languages
self.addItems([lang[1] for lang in self.languages])
@ -259,7 +256,7 @@ class FileTranscriberWidget(QWidget):
self.transcription_options = transcription_options
self.word_level_timings_checkbox.setDisabled(
self.transcription_options.model.model_type == ModelType.HUGGING_FACE or self.transcription_options.model.model_type == ModelType.OPEN_AI_WHISPER_API)
if self.transcription_options.openai_access_token is not None:
if self.transcription_options.openai_access_token != '':
self.openai_access_token_changed.emit(self.transcription_options.openai_access_token)
def on_click_run(self):
@ -950,7 +947,7 @@ class MainWindow(QMainWindow):
for file_path in file_transcription_options.file_paths:
task = FileTranscriptionTask(
file_path, transcription_options, file_transcription_options, model_path)
self.transcriber_worker.add_task(task)
self.add_task(task)
def update_task_table_row(self, task: FileTranscriptionTask):
self.table_widget.upsert_task(task)
@ -1053,6 +1050,9 @@ class MainWindow(QMainWindow):
transcription_task=task, parent=self, flags=Qt.WindowType.Window)
transcription_viewer_widget.show()
def add_task(self, task: FileTranscriptionTask):
    """Forward a file-transcription task to the transcriber worker queue."""
    self.transcriber_worker.add_task(task)
def load_tasks_from_cache(self):
tasks = self.tasks_cache.load()
for task in tasks:
@ -1378,9 +1378,15 @@ class Application(QApplication):
def __init__(self) -> None:
super().__init__(sys.argv)
self.setApplicationName(APP_NAME)
self.setApplicationVersion(VERSION)
self.window = MainWindow()
self.window.show()
def add_task(self, task: FileTranscriptionTask):
    # Entry point used by the CLI (buzz/cli.py) to queue transcription
    # tasks; delegates to the main window.
    self.window.add_task(task)
class AdvancedSettingsDialog(QDialog):
transcription_options: TranscriptionOptions

View file

@ -7,7 +7,8 @@ APP_NAME = 'Buzz'
class Settings:
settings = QSettings(APP_NAME)
def __init__(self):
    # Create the QSettings store per instance instead of sharing a single
    # class-level object across all Settings instances (see the removed
    # class attribute in this change).
    self.settings = QSettings(APP_NAME)
class Key(enum.Enum):
RECORDING_TRANSCRIBER_TASK = 'recording-transcriber/task'

View file

@ -11,9 +11,9 @@ class KeyringStore:
class Key(enum.Enum):
OPENAI_API_KEY = 'OpenAI API key'
def get_password(self, username: Key) -> str:
def get_password(self, key: Key) -> str:
try:
password = keyring.get_password(APP_NAME, username=username.value)
password = keyring.get_password(APP_NAME, username=key.value)
if password is None:
return ''
return password

View file

@ -5,14 +5,12 @@ import json
import logging
import multiprocessing
import os
import platform
import queue
import re
import subprocess
import sys
import tempfile
import threading
from abc import ABC, abstractmethod
from abc import abstractmethod
from dataclasses import dataclass, field
from multiprocessing.connection import Connection
from random import randint
@ -20,16 +18,16 @@ from threading import Thread
from typing import Any, List, Optional, Tuple, Union, Set
import faster_whisper
import openai
import ffmpeg
import numpy as np
import openai
import sounddevice
import stable_whisper
import tqdm
import whisper
from PyQt6.QtCore import QObject, QProcess, pyqtSignal, pyqtSlot, QThread
from sounddevice import PortAudioError
from whisper import tokenizer
from . import transformers_whisper
from .conn import pipe_stderr
@ -62,6 +60,9 @@ class Segment:
text: str
LANGUAGES = tokenizer.LANGUAGES
@dataclass()
class TranscriptionOptions:
language: Optional[str] = None
@ -70,7 +71,7 @@ class TranscriptionOptions:
word_level_timings: bool = False
temperature: Tuple[float, ...] = DEFAULT_WHISPER_TEMPERATURE
initial_prompt: str = ''
openai_access_token: Optional[str] = None
openai_access_token: str = ''
@dataclass()

2
cli.py
View file

@ -1,2 +0,0 @@
if __name__ == '__main__':
pass

View file

@ -6,9 +6,10 @@ import platform
import sys
from typing import TextIO
from PyQt6.QtCore import QTranslator, QLocale
from appdirs import user_log_dir
from buzz.cli import parse_command_line
# Check for segfaults if not running in frozen mode
if getattr(sys, 'frozen', False) is False:
faulthandler.enable()
@ -51,5 +52,5 @@ if __name__ == "__main__":
from buzz.gui import Application
app = Application()
parse_command_line(app)
sys.exit(app.exec())