This commit is contained in:
Chidi Williams 2023-04-29 21:23:20 +00:00 committed by GitHub
parent 66bd9a1834
commit 32af05da30
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 207 additions and 23 deletions

View file

@ -7,4 +7,4 @@ omit =
directory = coverage/html
[report]
fail_under = 76
fail_under = 74

View file

@ -20,11 +20,13 @@ OpenAI's [Whisper](https://github.com/openai/whisper).
- Import audio and video files and export transcripts to TXT, SRT, and
VTT ([Demo](https://www.loom.com/share/cf263b099ac3481082bb56d19b7c87fe))
- Transcription and translation from your computer's microphones to text (Resource-intensive and may not be real-time, [Demo](https://www.loom.com/share/564b753eb4d44b55b985b8abd26b55f7))
- Transcription and translation from your computer's microphones to text (Resource-intensive and may not be
real-time, [Demo](https://www.loom.com/share/564b753eb4d44b55b985b8abd26b55f7))
- Supports [Whisper](https://github.com/openai/whisper#available-models-and-languages),
[Whisper.cpp](https://github.com/ggerganov/whisper.cpp), [Faster Whisper](https://github.com/guillaumekln/faster-whisper),
[Whisper-compatible Hugging Face models](https://huggingface.co/models?other=whisper), and
the [OpenAI Whisper API](https://platform.openai.com/docs/api-reference/introduction)
- [Command-Line Interface](#command-line-interface)
- Available on Mac, Windows, and Linux
## Installation
@ -129,6 +131,24 @@ and [Virtual Audio Cable](https://vac.muzychenko.net/en/)).
6. Open Buzz, select BlackHole as your microphone, and record as before to see transcriptions from the audio playing
through BlackHole.
## Command-Line Interface
### `add`
Start a new transcription task
Examples:
```shell
# Translate two MP3 files from French to English using OpenAI Whisper API
buzz add --task translate --language fr --model-type openaiapi /Users/user/Downloads/1b3b03e4-8db5-ea2c-ace5-b71ff32e3304.mp3 /Users/user/Downloads/koaf9083k1lkpsfdi0.mp3
# Transcribe an MP4 using Whisper.cpp "small" model and immediately export to SRT and VTT files
buzz add --task transcribe --model-type whispercpp --model-size small --prompt "My initial prompt" --srt --vtt /Users/user/Downloads/buzz/1b3b03e4-8db5-ea2c-ace5-b71ff32e3304.mp4
```
Run `buzz add --help` to see all available options.
## Build
To build/run Buzz locally from source, first install the requirements:

157
buzz/cli.py Normal file
View file

@ -0,0 +1,157 @@
import enum
import sys
import typing
from PyQt6.QtCore import QCommandLineParser, QCommandLineOption
from buzz.gui import Application
from buzz.model_loader import ModelType, WhisperModelSize, TranscriptionModel, get_local_model_path
from buzz.store.keyring_store import KeyringStore
from buzz.transcriber import Task, FileTranscriptionTask, FileTranscriptionOptions, TranscriptionOptions, LANGUAGES, \
OutputFormat
class CommandLineError(Exception):
    """Raised when the command-line arguments are invalid or incomplete.

    The message is intended to be shown to the user on stderr, followed
    by the parser's help text.
    """

    def __init__(self, message: str):
        super().__init__(message)
class CommandLineModelType(enum.Enum):
    """Model types accepted by the --model-type CLI option.

    Values are the strings typed on the command line. Member names mirror
    buzz.model_loader.ModelType, so a parsed member is converted with
    ModelType[member.name] in parse().
    """
    WHISPER = 'whisper'
    WHISPER_CPP = 'whispercpp'
    HUGGING_FACE = 'huggingface'
    FASTER_WHISPER = 'fasterwhisper'
    OPEN_AI_WHISPER_API = 'openaiapi'
def parse_command_line(app: Application):
    """Parse CLI arguments for *app*, printing help and exiting on error.

    Thin wrapper around parse(): any CommandLineError it raises is
    reported on stderr together with the generated help text, and the
    process exits with status 1.
    """
    parser = QCommandLineParser()
    try:
        parse(app, parser)
    except CommandLineError as error:
        print(f'Error: {str(error)}\n', file=sys.stderr)
        print(parser.helpText())
        sys.exit(1)
def parse(app: Application, parser: QCommandLineParser):
    """Parse the application's command-line arguments.

    With no positional arguments, only the standard help/version options
    are processed and the function returns so the GUI starts normally.
    The "add" command validates its options and queues one
    FileTranscriptionTask per input file on the application.

    Raises:
        CommandLineError: if the command is unknown, no input files are
            given, an option value is invalid, the model is not available
            locally, or no OpenAI access token can be found when required.
    """
    parser.addPositionalArgument('<command>', 'One of the following commands:\n- add')

    parser.parse(app.arguments())

    args = parser.positionalArguments()
    if len(args) == 0:
        # No command given: handle the default options (help/version) and
        # let the GUI start as usual.
        parser.addHelpOption()
        parser.addVersionOption()
        parser.process(app)
        return

    command = args[0]
    if command != "add":
        # Fix: an unrecognized command used to be silently ignored, which
        # started the GUI as if no arguments had been passed. Fail loudly
        # instead so typos are reported to the user.
        raise CommandLineError(f'Unknown command "{command}"')

    parser.clearPositionalArguments()
    parser.addPositionalArgument('files', 'Input file paths', '[file file file...]')

    task_option = QCommandLineOption(['t', 'task'],
                                     f'The task to perform. Allowed: {join_values(Task)}. Default: {Task.TRANSCRIBE.value}.',
                                     'task',
                                     Task.TRANSCRIBE.value)
    model_type_option = QCommandLineOption(['m', 'model-type'],
                                           f'Model type. Allowed: {join_values(CommandLineModelType)}. Default: {CommandLineModelType.WHISPER.value}.',
                                           'model-type',
                                           CommandLineModelType.WHISPER.value)
    model_size_option = QCommandLineOption(['s', 'model-size'],
                                           f'Model size. Use only when --model-type is whisper, whispercpp, or fasterwhisper. Allowed: {join_values(WhisperModelSize)}. Default: {WhisperModelSize.TINY.value}.',
                                           'model-size', WhisperModelSize.TINY.value)
    hugging_face_model_id_option = QCommandLineOption(['hfid'],
                                                      f'Hugging Face model ID. Use only when --model-type is huggingface. Example: "openai/whisper-tiny"',
                                                      'id')
    language_option = QCommandLineOption(['l', 'language'],
                                         f'Language code. Allowed: {", ".join(sorted([k + " (" + LANGUAGES[k].title() + ")" for k in LANGUAGES]))}. Leave empty to detect language.',
                                         'code', '')
    initial_prompt_option = QCommandLineOption(['p', 'prompt'], f'Initial prompt', 'prompt', '')
    open_ai_access_token_option = QCommandLineOption('openai-token',
                                                     f'OpenAI access token. Use only when --model-type is {CommandLineModelType.OPEN_AI_WHISPER_API.value}. Defaults to your previously saved access token, if one exists.',
                                                     'token')
    srt_option = QCommandLineOption(['srt'], 'Output result in an SRT file.')
    vtt_option = QCommandLineOption(['vtt'], 'Output result in a VTT file.')
    txt_option = QCommandLineOption('txt', 'Output result in a TXT file.')

    parser.addOptions(
        [task_option, model_type_option, model_size_option, hugging_face_model_id_option, language_option,
         initial_prompt_option, open_ai_access_token_option, srt_option, vtt_option, txt_option])
    parser.addHelpOption()
    parser.addVersionOption()
    parser.process(app)

    # Slice off the first positional argument, which is the command itself.
    file_paths = parser.positionalArguments()[1:]
    if len(file_paths) == 0:
        raise CommandLineError('No input files')

    task = parse_enum_option(task_option, parser, Task)

    model_type = parse_enum_option(model_type_option, parser, CommandLineModelType)
    model_size = parse_enum_option(model_size_option, parser, WhisperModelSize)
    hugging_face_model_id = parser.value(hugging_face_model_id_option)
    if hugging_face_model_id == '' and model_type == CommandLineModelType.HUGGING_FACE:
        raise CommandLineError('--hfid is required when --model-type is huggingface')

    # CommandLineModelType member names mirror ModelType, so the parsed
    # member maps across by name.
    model = TranscriptionModel(model_type=ModelType[model_type.name], whisper_model_size=model_size,
                               hugging_face_model_id=hugging_face_model_id)
    model_path = get_local_model_path(model)
    if model_path is None:
        # NOTE(review): assumes get_local_model_path returns a non-None
        # value for the OpenAI API model type as well -- confirm.
        raise CommandLineError('Model not found')

    language = parser.value(language_option)
    if language == '':
        language = None  # empty string means auto-detect
    elif LANGUAGES.get(language) is None:
        raise CommandLineError('Invalid language option')

    initial_prompt = parser.value(initial_prompt_option)

    output_formats: typing.Set[OutputFormat] = set()
    if parser.isSet(srt_option):
        output_formats.add(OutputFormat.SRT)
    if parser.isSet(vtt_option):
        output_formats.add(OutputFormat.VTT)
    if parser.isSet(txt_option):
        output_formats.add(OutputFormat.TXT)

    openai_access_token = parser.value(open_ai_access_token_option)
    if model.model_type == ModelType.OPEN_AI_WHISPER_API and openai_access_token == '':
        # Fall back to a previously saved token from the keyring.
        openai_access_token = KeyringStore().get_password(key=KeyringStore.Key.OPENAI_API_KEY)
        if openai_access_token == '':
            raise CommandLineError('No OpenAI access token found')

    transcription_options = TranscriptionOptions(model=model, task=task, language=language,
                                                 initial_prompt=initial_prompt,
                                                 openai_access_token=openai_access_token)
    file_transcription_options = FileTranscriptionOptions(file_paths=file_paths, output_formats=output_formats)

    for file_path in file_paths:
        transcription_task = FileTranscriptionTask(file_path=file_path, model_path=model_path,
                                                   transcription_options=transcription_options,
                                                   file_transcription_options=file_transcription_options)
        app.add_task(transcription_task)
T = typing.TypeVar("T", bound=enum.Enum)


def parse_enum_option(option: QCommandLineOption, parser: QCommandLineParser, enum_class: typing.Type[T]) -> T:
    """Return the member of *enum_class* whose value matches the parsed option.

    Raises CommandLineError when the command-line value is not a valid
    member value of the enum.
    """
    raw_value = parser.value(option)
    for member in enum_class:
        if member.value == raw_value:
            return member
    raise CommandLineError(f'Invalid value for --{option.names()[-1]} option.')
def join_values(enum_class: typing.Type[enum.Enum]) -> str:
    """Return the values of all members of *enum_class*, comma-separated."""
    return ', '.join(member.value for member in enum_class)

View file

@ -3,11 +3,9 @@ import json
import logging
import os
import sys
from datetime import datetime
from enum import auto
from typing import Dict, List, Optional, Tuple
import humanize
import sounddevice
from PyQt6 import QtGui
from PyQt6.QtCore import (QObject, Qt, QThread,
@ -18,10 +16,9 @@ from PyQt6.QtGui import (QAction, QCloseEvent, QDesktopServices, QIcon,
from PyQt6.QtNetwork import QNetworkAccessManager, QNetworkReply, QNetworkRequest
from PyQt6.QtWidgets import (QApplication, QCheckBox, QComboBox, QDialog,
QDialogButtonBox, QFileDialog, QLabel, QMainWindow, QMessageBox, QPlainTextEdit,
QProgressDialog, QPushButton, QVBoxLayout, QHBoxLayout, QWidget, QGroupBox, QTableWidget,
QPushButton, QVBoxLayout, QHBoxLayout, QWidget, QGroupBox, QTableWidget,
QMenuBar, QFormLayout, QTableWidgetItem,
QAbstractItemView, QListWidget, QListWidgetItem, QSizePolicy)
from whisper import tokenizer
from buzz.cache import TasksCache
from .__version__ import VERSION
@ -41,7 +38,7 @@ from .transcriber import (SUPPORTED_OUTPUT_FORMATS, FileTranscriptionOptions, Ou
Task,
TranscriptionOptions,
FileTranscriberQueueWorker, FileTranscriptionTask, RecordingTranscriber, LOADED_WHISPER_DLL,
DEFAULT_WHISPER_TEMPERATURE)
DEFAULT_WHISPER_TEMPERATURE, LANGUAGES)
from .widgets.line_edit import LineEdit
from .widgets.model_download_progress_dialog import ModelDownloadProgressDialog
from .widgets.model_type_combo_box import ModelTypeComboBox
@ -112,7 +109,7 @@ class LanguagesComboBox(QComboBox):
super().__init__(parent)
whisper_languages = sorted(
[(lang, tokenizer.LANGUAGES[lang].title()) for lang in tokenizer.LANGUAGES], key=lambda lang: lang[1])
[(lang, LANGUAGES[lang].title()) for lang in LANGUAGES], key=lambda lang: lang[1])
self.languages = [('', _('Detect Language'))] + whisper_languages
self.addItems([lang[1] for lang in self.languages])
@ -259,7 +256,7 @@ class FileTranscriberWidget(QWidget):
self.transcription_options = transcription_options
self.word_level_timings_checkbox.setDisabled(
self.transcription_options.model.model_type == ModelType.HUGGING_FACE or self.transcription_options.model.model_type == ModelType.OPEN_AI_WHISPER_API)
if self.transcription_options.openai_access_token is not None:
if self.transcription_options.openai_access_token != '':
self.openai_access_token_changed.emit(self.transcription_options.openai_access_token)
def on_click_run(self):
@ -950,7 +947,7 @@ class MainWindow(QMainWindow):
for file_path in file_transcription_options.file_paths:
task = FileTranscriptionTask(
file_path, transcription_options, file_transcription_options, model_path)
self.transcriber_worker.add_task(task)
self.add_task(task)
def update_task_table_row(self, task: FileTranscriptionTask):
self.table_widget.upsert_task(task)
@ -1053,6 +1050,9 @@ class MainWindow(QMainWindow):
transcription_task=task, parent=self, flags=Qt.WindowType.Window)
transcription_viewer_widget.show()
def add_task(self, task: FileTranscriptionTask):
    """Forward a file-transcription task to the transcriber worker queue."""
    self.transcriber_worker.add_task(task)
def load_tasks_from_cache(self):
tasks = self.tasks_cache.load()
for task in tasks:
@ -1378,9 +1378,15 @@ class Application(QApplication):
def __init__(self) -> None:
super().__init__(sys.argv)
self.setApplicationName(APP_NAME)
self.setApplicationVersion(VERSION)
self.window = MainWindow()
self.window.show()
def add_task(self, task: FileTranscriptionTask):
    # Entry point used by the CLI (buzz/cli.py) to queue transcription
    # tasks; delegates to the main window.
    self.window.add_task(task)
class AdvancedSettingsDialog(QDialog):
transcription_options: TranscriptionOptions

View file

@ -7,7 +7,8 @@ APP_NAME = 'Buzz'
class Settings:
settings = QSettings(APP_NAME)
def __init__(self):
    # Create the QSettings store per instance instead of sharing a single
    # class-level object across all Settings instances (see the removed
    # class attribute in this change).
    self.settings = QSettings(APP_NAME)
class Key(enum.Enum):
RECORDING_TRANSCRIBER_TASK = 'recording-transcriber/task'

View file

@ -11,9 +11,9 @@ class KeyringStore:
class Key(enum.Enum):
OPENAI_API_KEY = 'OpenAI API key'
def get_password(self, username: Key) -> str:
def get_password(self, key: Key) -> str:
try:
password = keyring.get_password(APP_NAME, username=username.value)
password = keyring.get_password(APP_NAME, username=key.value)
if password is None:
return ''
return password

View file

@ -5,14 +5,12 @@ import json
import logging
import multiprocessing
import os
import platform
import queue
import re
import subprocess
import sys
import tempfile
import threading
from abc import ABC, abstractmethod
from abc import abstractmethod
from dataclasses import dataclass, field
from multiprocessing.connection import Connection
from random import randint
@ -20,16 +18,16 @@ from threading import Thread
from typing import Any, List, Optional, Tuple, Union, Set
import faster_whisper
import openai
import ffmpeg
import numpy as np
import openai
import sounddevice
import stable_whisper
import tqdm
import whisper
from PyQt6.QtCore import QObject, QProcess, pyqtSignal, pyqtSlot, QThread
from sounddevice import PortAudioError
from whisper import tokenizer
from . import transformers_whisper
from .conn import pipe_stderr
@ -62,6 +60,9 @@ class Segment:
text: str
LANGUAGES = tokenizer.LANGUAGES
@dataclass()
class TranscriptionOptions:
language: Optional[str] = None
@ -70,7 +71,7 @@ class TranscriptionOptions:
word_level_timings: bool = False
temperature: Tuple[float, ...] = DEFAULT_WHISPER_TEMPERATURE
initial_prompt: str = ''
openai_access_token: Optional[str] = None
openai_access_token: str = ''
@dataclass()

2
cli.py
View file

@ -1,2 +0,0 @@
if __name__ == '__main__':
pass

View file

@ -6,9 +6,10 @@ import platform
import sys
from typing import TextIO
from PyQt6.QtCore import QTranslator, QLocale
from appdirs import user_log_dir
from buzz.cli import parse_command_line
# Check for segfaults if not running in frozen mode
if getattr(sys, 'frozen', False) is False:
faulthandler.enable()
@ -51,5 +52,5 @@ if __name__ == "__main__":
from buzz.gui import Application
app = Application()
parse_command_line(app)
sys.exit(app.exec())