From 32af05da309ac96c375c161f85f0646ab694eb6f Mon Sep 17 00:00:00 2001 From: Chidi Williams Date: Sat, 29 Apr 2023 21:23:20 +0000 Subject: [PATCH] Add CLI (#424) --- .coveragerc | 2 +- README.md | 22 ++++- buzz/cli.py | 157 ++++++++++++++++++++++++++++++++++++ buzz/gui.py | 22 +++-- buzz/settings/settings.py | 3 +- buzz/store/keyring_store.py | 4 +- buzz/transcriber.py | 13 +-- cli.py | 2 - main.py | 5 +- 9 files changed, 207 insertions(+), 23 deletions(-) create mode 100644 buzz/cli.py delete mode 100644 cli.py diff --git a/.coveragerc b/.coveragerc index f161d58..7772715 100644 --- a/.coveragerc +++ b/.coveragerc @@ -7,4 +7,4 @@ omit = directory = coverage/html [report] -fail_under = 76 +fail_under = 74 diff --git a/README.md b/README.md index 21507ff..9da5bad 100644 --- a/README.md +++ b/README.md @@ -20,11 +20,13 @@ OpenAI's [Whisper](https://github.com/openai/whisper). - Import audio and video files and export transcripts to TXT, SRT, and VTT ([Demo](https://www.loom.com/share/cf263b099ac3481082bb56d19b7c87fe)) -- Transcription and translation from your computer's microphones to text (Resource-intensive and may not be real-time, [Demo](https://www.loom.com/share/564b753eb4d44b55b985b8abd26b55f7)) +- Transcription and translation from your computer's microphones to text (Resource-intensive and may not be + real-time, [Demo](https://www.loom.com/share/564b753eb4d44b55b985b8abd26b55f7)) - Supports [Whisper](https://github.com/openai/whisper#available-models-and-languages), [Whisper.cpp](https://github.com/ggerganov/whisper.cpp), [Faster Whisper](https://github.com/guillaumekln/faster-whisper), [Whisper-compatible Hugging Face models](https://huggingface.co/models?other=whisper), and the [OpenAI Whisper API](https://platform.openai.com/docs/api-reference/introduction) +- [Command-Line Interface](#command-line-interface) - Available on Mac, Windows, and Linux ## Installation @@ -129,6 +131,24 @@ and [Virtual Audio Cable](https://vac.muzychenko.net/en/)). 6. Open Buzz, select BlackHole as your microphone, and record as before to see transcriptions from the audio playing through BlackHole. +## Command-Line Interface + +### `add` + +Start a new transcription task + +Examples: + +```shell +# Translate two MP3 files from French to English using OpenAI Whisper API +buzz add --task translate --language fr --model-type openaiapi /Users/user/Downloads/1b3b03e4-8db5-ea2c-ace5-b71ff32e3304.mp3 /Users/user/Downloads/koaf9083k1lkpsfdi0.mp3 + +# Transcribe an MP4 using Whisper.cpp "small" model and immediately export to SRT and VTT files +buzz add --task transcribe --model-type whispercpp --model-size small --prompt "My initial prompt" --srt --vtt /Users/user/Downloads/buzz/1b3b03e4-8db5-ea2c-ace5-b71ff32e3304.mp4 +``` + +Run `buzz add --help` to see all available options. 
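+
+The CLI uses models that are already available locally; if the selected model
+has not been downloaded yet, the command exits with a "Model not found" error.
+A couple more illustrative examples (the file paths are placeholders):
+
+```shell
+# Transcribe a WAV file with the Faster Whisper "tiny" model, auto-detect the
+# language, and export the transcript to a TXT file
+buzz add --model-type fasterwhisper --model-size tiny --txt /Users/user/Downloads/meeting.wav
+
+# Transcribe with a Whisper-compatible Hugging Face model
+buzz add --model-type huggingface --hfid openai/whisper-tiny /Users/user/Downloads/lecture.mp3
+```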
+ ## Build To build/run Buzz locally from source, first install the requirements: diff --git a/buzz/cli.py b/buzz/cli.py new file mode 100644 index 0000000..8be7b6a --- /dev/null +++ b/buzz/cli.py @@ -0,0 +1,157 @@ +import enum +import sys +import typing + +from PyQt6.QtCore import QCommandLineParser, QCommandLineOption + +from buzz.gui import Application +from buzz.model_loader import ModelType, WhisperModelSize, TranscriptionModel, get_local_model_path +from buzz.store.keyring_store import KeyringStore +from buzz.transcriber import Task, FileTranscriptionTask, FileTranscriptionOptions, TranscriptionOptions, LANGUAGES, \ + OutputFormat + + +class CommandLineError(Exception): + def __init__(self, message: str): + super().__init__(message) + + +class CommandLineModelType(enum.Enum): + WHISPER = 'whisper' + WHISPER_CPP = 'whispercpp' + HUGGING_FACE = 'huggingface' + FASTER_WHISPER = 'fasterwhisper' + OPEN_AI_WHISPER_API = 'openaiapi' + + +def parse_command_line(app: Application): + parser = QCommandLineParser() + try: + parse(app, parser) + except CommandLineError as exc: + print(f'Error: {str(exc)}\n', file=sys.stderr) + print(parser.helpText()) + sys.exit(1) + + +def parse(app: Application, parser: QCommandLineParser): + parser.addPositionalArgument('', 'One of the following commands:\n- add') + parser.parse(app.arguments()) + + args = parser.positionalArguments() + if len(args) == 0: + parser.addHelpOption() + parser.addVersionOption() + + parser.process(app) + return + + command = args[0] + if command == "add": + parser.clearPositionalArguments() + + parser.addPositionalArgument('files', 'Input file paths', '[file file file...]') + + task_option = QCommandLineOption(['t', 'task'], + f'The task to perform. Allowed: {join_values(Task)}. Default: {Task.TRANSCRIBE.value}.', + 'task', + Task.TRANSCRIBE.value) + model_type_option = QCommandLineOption(['m', 'model-type'], + f'Model type. Allowed: {join_values(CommandLineModelType)}. Default: {CommandLineModelType.WHISPER.value}.', + 'model-type', + CommandLineModelType.WHISPER.value) + model_size_option = QCommandLineOption(['s', 'model-size'], + f'Model size. Use only when --model-type is whisper, whispercpp, or fasterwhisper. Allowed: {join_values(WhisperModelSize)}. Default: {WhisperModelSize.TINY.value}.', + 'model-size', WhisperModelSize.TINY.value) + hugging_face_model_id_option = QCommandLineOption(['hfid'], + f'Hugging Face model ID. Use only when --model-type is huggingface. Example: "openai/whisper-tiny"', + 'id') + language_option = QCommandLineOption(['l', 'language'], + f'Language code. Allowed: {", ".join(sorted([k + " (" + LANGUAGES[k].title() + ")" for k in LANGUAGES]))}. Leave empty to detect language.', + 'code', '') + initial_prompt_option = QCommandLineOption(['p', 'prompt'], f'Initial prompt', 'prompt', '') + open_ai_access_token_option = QCommandLineOption('openai-token', + f'OpenAI access token. Use only when --model-type is {CommandLineModelType.OPEN_AI_WHISPER_API.value}. 
Defaults to your previously saved access token, if one exists.', + 'token') + srt_option = QCommandLineOption(['srt'], 'Output result in an SRT file.') + vtt_option = QCommandLineOption(['vtt'], 'Output result in a VTT file.') + txt_option = QCommandLineOption('txt', 'Output result in a TXT file.') + + parser.addOptions( + [task_option, model_type_option, model_size_option, hugging_face_model_id_option, language_option, + initial_prompt_option, open_ai_access_token_option, srt_option, vtt_option, txt_option]) + + parser.addHelpOption() + parser.addVersionOption() + + parser.process(app) + + # slice after first argument, the command + file_paths = parser.positionalArguments()[1:] + if len(file_paths) == 0: + raise CommandLineError('No input files') + + task = parse_enum_option(task_option, parser, Task) + + model_type = parse_enum_option(model_type_option, parser, CommandLineModelType) + model_size = parse_enum_option(model_size_option, parser, WhisperModelSize) + + hugging_face_model_id = parser.value(hugging_face_model_id_option) + + if hugging_face_model_id == '' and model_type == CommandLineModelType.HUGGING_FACE: + raise CommandLineError('--hfid is required when --model-type is huggingface') + + model = TranscriptionModel(model_type=ModelType[model_type.name], whisper_model_size=model_size, + hugging_face_model_id=hugging_face_model_id) + model_path = get_local_model_path(model) + + if model_path is None: + raise CommandLineError('Model not found') + + language = parser.value(language_option) + if language == '': + language = None + elif LANGUAGES.get(language) is None: + raise CommandLineError('Invalid language option') + + initial_prompt = parser.value(initial_prompt_option) + + output_formats: typing.Set[OutputFormat] = set() + if parser.isSet(srt_option): + output_formats.add(OutputFormat.SRT) + if parser.isSet(vtt_option): + output_formats.add(OutputFormat.VTT) + if parser.isSet(txt_option): + output_formats.add(OutputFormat.TXT) + + openai_access_token = parser.value(open_ai_access_token_option) + if model.model_type == ModelType.OPEN_AI_WHISPER_API and openai_access_token == '': + openai_access_token = KeyringStore().get_password(key=KeyringStore.Key.OPENAI_API_KEY) + + if openai_access_token == '': + raise CommandLineError('No OpenAI access token found') + + transcription_options = TranscriptionOptions(model=model, task=task, language=language, + initial_prompt=initial_prompt, + openai_access_token=openai_access_token) + file_transcription_options = FileTranscriptionOptions(file_paths=file_paths, output_formats=output_formats) + + for file_path in file_paths: + transcription_task = FileTranscriptionTask(file_path=file_path, model_path=model_path, + transcription_options=transcription_options, + file_transcription_options=file_transcription_options) + app.add_task(transcription_task) + + +T = typing.TypeVar("T", bound=enum.Enum) + + +def parse_enum_option(option: QCommandLineOption, parser: QCommandLineParser, enum_class: typing.Type[T]) -> T: + try: + return enum_class(parser.value(option)) + except ValueError: + raise CommandLineError(f'Invalid value for --{option.names()[-1]} option.') + + +def join_values(enum_class: typing.Type[enum.Enum]) -> str: + return ', '.join([v.value for v in enum_class]) diff --git a/buzz/gui.py b/buzz/gui.py index 3d61c03..5b53c47 100644 --- a/buzz/gui.py +++ b/buzz/gui.py @@ -3,11 +3,9 @@ import json import logging import os import sys -from datetime import datetime from enum import auto from typing import Dict, List, Optional, Tuple -import 
humanize import sounddevice from PyQt6 import QtGui from PyQt6.QtCore import (QObject, Qt, QThread, @@ -18,10 +16,9 @@ from PyQt6.QtGui import (QAction, QCloseEvent, QDesktopServices, QIcon, from PyQt6.QtNetwork import QNetworkAccessManager, QNetworkReply, QNetworkRequest from PyQt6.QtWidgets import (QApplication, QCheckBox, QComboBox, QDialog, QDialogButtonBox, QFileDialog, QLabel, QMainWindow, QMessageBox, QPlainTextEdit, - QProgressDialog, QPushButton, QVBoxLayout, QHBoxLayout, QWidget, QGroupBox, QTableWidget, + QPushButton, QVBoxLayout, QHBoxLayout, QWidget, QGroupBox, QTableWidget, QMenuBar, QFormLayout, QTableWidgetItem, QAbstractItemView, QListWidget, QListWidgetItem, QSizePolicy) -from whisper import tokenizer from buzz.cache import TasksCache from .__version__ import VERSION @@ -41,7 +38,7 @@ from .transcriber import (SUPPORTED_OUTPUT_FORMATS, FileTranscriptionOptions, Ou Task, TranscriptionOptions, FileTranscriberQueueWorker, FileTranscriptionTask, RecordingTranscriber, LOADED_WHISPER_DLL, - DEFAULT_WHISPER_TEMPERATURE) + DEFAULT_WHISPER_TEMPERATURE, LANGUAGES) from .widgets.line_edit import LineEdit from .widgets.model_download_progress_dialog import ModelDownloadProgressDialog from .widgets.model_type_combo_box import ModelTypeComboBox @@ -112,7 +109,7 @@ class LanguagesComboBox(QComboBox): super().__init__(parent) whisper_languages = sorted( - [(lang, tokenizer.LANGUAGES[lang].title()) for lang in tokenizer.LANGUAGES], key=lambda lang: lang[1]) + [(lang, LANGUAGES[lang].title()) for lang in LANGUAGES], key=lambda lang: lang[1]) self.languages = [('', _('Detect Language'))] + whisper_languages self.addItems([lang[1] for lang in self.languages]) @@ -259,7 +256,7 @@ class FileTranscriberWidget(QWidget): self.transcription_options = transcription_options self.word_level_timings_checkbox.setDisabled( self.transcription_options.model.model_type == ModelType.HUGGING_FACE or self.transcription_options.model.model_type == ModelType.OPEN_AI_WHISPER_API) - if self.transcription_options.openai_access_token is not None: + if self.transcription_options.openai_access_token != '': self.openai_access_token_changed.emit(self.transcription_options.openai_access_token) def on_click_run(self): @@ -950,7 +947,7 @@ class MainWindow(QMainWindow): for file_path in file_transcription_options.file_paths: task = FileTranscriptionTask( file_path, transcription_options, file_transcription_options, model_path) - self.transcriber_worker.add_task(task) + self.add_task(task) def update_task_table_row(self, task: FileTranscriptionTask): self.table_widget.upsert_task(task) @@ -1053,6 +1050,9 @@ class MainWindow(QMainWindow): transcription_task=task, parent=self, flags=Qt.WindowType.Window) transcription_viewer_widget.show() + def add_task(self, task: FileTranscriptionTask): + self.transcriber_worker.add_task(task) + def load_tasks_from_cache(self): tasks = self.tasks_cache.load() for task in tasks: @@ -1378,9 +1378,15 @@ class Application(QApplication): def __init__(self) -> None: super().__init__(sys.argv) + self.setApplicationName(APP_NAME) + self.setApplicationVersion(VERSION) + self.window = MainWindow() self.window.show() + def add_task(self, task: FileTranscriptionTask): + self.window.add_task(task) + class AdvancedSettingsDialog(QDialog): transcription_options: TranscriptionOptions diff --git a/buzz/settings/settings.py b/buzz/settings/settings.py index db64714..524b014 100644 --- a/buzz/settings/settings.py +++ b/buzz/settings/settings.py @@ -7,7 +7,8 @@ APP_NAME = 'Buzz' class Settings: - settings = 
QSettings(APP_NAME) + def __init__(self): + self.settings = QSettings(APP_NAME) class Key(enum.Enum): RECORDING_TRANSCRIBER_TASK = 'recording-transcriber/task' diff --git a/buzz/store/keyring_store.py b/buzz/store/keyring_store.py index 1b4adb1..b0795f8 100644 --- a/buzz/store/keyring_store.py +++ b/buzz/store/keyring_store.py @@ -11,9 +11,9 @@ class KeyringStore: class Key(enum.Enum): OPENAI_API_KEY = 'OpenAI API key' - def get_password(self, username: Key) -> str: + def get_password(self, key: Key) -> str: try: - password = keyring.get_password(APP_NAME, username=username.value) + password = keyring.get_password(APP_NAME, username=key.value) if password is None: return '' return password diff --git a/buzz/transcriber.py b/buzz/transcriber.py index c816c82..12e99ae 100644 --- a/buzz/transcriber.py +++ b/buzz/transcriber.py @@ -5,14 +5,12 @@ import json import logging import multiprocessing import os -import platform import queue import re -import subprocess import sys import tempfile import threading -from abc import ABC, abstractmethod +from abc import abstractmethod from dataclasses import dataclass, field from multiprocessing.connection import Connection from random import randint @@ -20,16 +18,16 @@ from threading import Thread from typing import Any, List, Optional, Tuple, Union, Set import faster_whisper -import openai - import ffmpeg import numpy as np +import openai import sounddevice import stable_whisper import tqdm import whisper from PyQt6.QtCore import QObject, QProcess, pyqtSignal, pyqtSlot, QThread from sounddevice import PortAudioError +from whisper import tokenizer from . import transformers_whisper from .conn import pipe_stderr @@ -62,6 +60,9 @@ class Segment: text: str +LANGUAGES = tokenizer.LANGUAGES + + @dataclass() class TranscriptionOptions: language: Optional[str] = None @@ -70,7 +71,7 @@ class TranscriptionOptions: word_level_timings: bool = False temperature: Tuple[float, ...] = DEFAULT_WHISPER_TEMPERATURE initial_prompt: str = '' - openai_access_token: Optional[str] = None + openai_access_token: str = '' @dataclass() diff --git a/cli.py b/cli.py deleted file mode 100644 index 54b08a6..0000000 --- a/cli.py +++ /dev/null @@ -1,2 +0,0 @@ -if __name__ == '__main__': - pass diff --git a/main.py b/main.py index 655a68a..fbf6b04 100644 --- a/main.py +++ b/main.py @@ -6,9 +6,10 @@ import platform import sys from typing import TextIO -from PyQt6.QtCore import QTranslator, QLocale from appdirs import user_log_dir +from buzz.cli import parse_command_line + # Check for segfaults if not running in frozen mode if getattr(sys, 'frozen', False) is False: faulthandler.enable() @@ -51,5 +52,5 @@ if __name__ == "__main__": from buzz.gui import Application app = Application() - + parse_command_line(app) sys.exit(app.exec())