From 32af05da309ac96c375c161f85f0646ab694eb6f Mon Sep 17 00:00:00 2001 From: Chidi Williams Date: Sat, 29 Apr 2023 21:23:20 +0000 Subject: [PATCH] Add CLI (#424) --- .coveragerc | 2 +- README.md | 22 ++++- buzz/cli.py | 157 ++++++++++++++++++++++++++++++++++++ buzz/gui.py | 22 +++-- buzz/settings/settings.py | 3 +- buzz/store/keyring_store.py | 4 +- buzz/transcriber.py | 13 +-- cli.py | 2 - main.py | 5 +- 9 files changed, 207 insertions(+), 23 deletions(-) create mode 100644 buzz/cli.py delete mode 100644 cli.py diff --git a/.coveragerc b/.coveragerc index f161d58..7772715 100644 --- a/.coveragerc +++ b/.coveragerc @@ -7,4 +7,4 @@ omit = directory = coverage/html [report] -fail_under = 76 +fail_under = 74 diff --git a/README.md b/README.md index 21507ff..9da5bad 100644 --- a/README.md +++ b/README.md @@ -20,11 +20,13 @@ OpenAI's [Whisper](https://github.com/openai/whisper). - Import audio and video files and export transcripts to TXT, SRT, and VTT ([Demo](https://www.loom.com/share/cf263b099ac3481082bb56d19b7c87fe)) -- Transcription and translation from your computer's microphones to text (Resource-intensive and may not be real-time, [Demo](https://www.loom.com/share/564b753eb4d44b55b985b8abd26b55f7)) +- Transcription and translation from your computer's microphones to text (Resource-intensive and may not be + real-time, [Demo](https://www.loom.com/share/564b753eb4d44b55b985b8abd26b55f7)) - Supports [Whisper](https://github.com/openai/whisper#available-models-and-languages), [Whisper.cpp](https://github.com/ggerganov/whisper.cpp), [Faster Whisper](https://github.com/guillaumekln/faster-whisper), [Whisper-compatible Hugging Face models](https://huggingface.co/models?other=whisper), and the [OpenAI Whisper API](https://platform.openai.com/docs/api-reference/introduction) +- [Command-Line Interface](#command-line-interface) - Available on Mac, Windows, and Linux ## Installation @@ -129,6 +131,24 @@ and [Virtual Audio Cable](https://vac.muzychenko.net/en/)). 6. Open Buzz, select BlackHole as your microphone, and record as before to see transcriptions from the audio playing through BlackHole. +## Command-Line Interface + +### `add` + +Start a new transcription task + +Examples: + +```shell +# Translate two MP3 files from French to English using OpenAI Whisper API +buzz add --task translate --language fr --model-type openaiapi /Users/user/Downloads/1b3b03e4-8db5-ea2c-ace5-b71ff32e3304.mp3 /Users/user/Downloads/koaf9083k1lkpsfdi0.mp3 + +# Transcribe an MP4 using Whisper.cpp "small" model and immediately export to SRT and VTT files +buzz add --task transcribe --model-type whispercpp --model-size small --prompt "My initial prompt" --srt --vtt /Users/user/Downloads/buzz/1b3b03e4-8db5-ea2c-ace5-b71ff32e3304.mp4 +``` + +Run `buzz add --help` to see all available options. 
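+
+The CLI uses models that are already available locally; if the selected model
+has not been downloaded yet, the command exits with a "Model not found" error.
+A couple more illustrative examples (the file paths are placeholders):
+
+```shell
+# Transcribe a WAV file with the Faster Whisper "tiny" model, auto-detect the
+# language, and export the transcript to a TXT file
+buzz add --model-type fasterwhisper --model-size tiny --txt /Users/user/Downloads/meeting.wav
+
+# Transcribe with a Whisper-compatible Hugging Face model
+buzz add --model-type huggingface --hfid openai/whisper-tiny /Users/user/Downloads/lecture.mp3
+```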
+ ## Build To build/run Buzz locally from source, first install the requirements: diff --git a/buzz/cli.py b/buzz/cli.py new file mode 100644 index 0000000..8be7b6a --- /dev/null +++ b/buzz/cli.py @@ -0,0 +1,157 @@ +import enum +import sys +import typing + +from PyQt6.QtCore import QCommandLineParser, QCommandLineOption + +from buzz.gui import Application +from buzz.model_loader import ModelType, WhisperModelSize, TranscriptionModel, get_local_model_path +from buzz.store.keyring_store import KeyringStore +from buzz.transcriber import Task, FileTranscriptionTask, FileTranscriptionOptions, TranscriptionOptions, LANGUAGES, \ + OutputFormat + + +class CommandLineError(Exception): + def __init__(self, message: str): + super().__init__(message) + + +class CommandLineModelType(enum.Enum): + WHISPER = 'whisper' + WHISPER_CPP = 'whispercpp' + HUGGING_FACE = 'huggingface' + FASTER_WHISPER = 'fasterwhisper' + OPEN_AI_WHISPER_API = 'openaiapi' + + +def parse_command_line(app: Application): + parser = QCommandLineParser() + try: + parse(app, parser) + except CommandLineError as exc: + print(f'Error: {str(exc)}\n', file=sys.stderr) + print(parser.helpText()) + sys.exit(1) + + +def parse(app: Application, parser: QCommandLineParser): + parser.addPositionalArgument('', 'One of the following commands:\n- add') + parser.parse(app.arguments()) + + args = parser.positionalArguments() + if len(args) == 0: + parser.addHelpOption() + parser.addVersionOption() + + parser.process(app) + return + + command = args[0] + if command == "add": + parser.clearPositionalArguments() + + parser.addPositionalArgument('files', 'Input file paths', '[file file file...]') + + task_option = QCommandLineOption(['t', 'task'], + f'The task to perform. Allowed: {join_values(Task)}. Default: {Task.TRANSCRIBE.value}.', + 'task', + Task.TRANSCRIBE.value) + model_type_option = QCommandLineOption(['m', 'model-type'], + f'Model type. Allowed: {join_values(CommandLineModelType)}. Default: {CommandLineModelType.WHISPER.value}.', + 'model-type', + CommandLineModelType.WHISPER.value) + model_size_option = QCommandLineOption(['s', 'model-size'], + f'Model size. Use only when --model-type is whisper, whispercpp, or fasterwhisper. Allowed: {join_values(WhisperModelSize)}. Default: {WhisperModelSize.TINY.value}.', + 'model-size', WhisperModelSize.TINY.value) + hugging_face_model_id_option = QCommandLineOption(['hfid'], + f'Hugging Face model ID. Use only when --model-type is huggingface. Example: "openai/whisper-tiny"', + 'id') + language_option = QCommandLineOption(['l', 'language'], + f'Language code. Allowed: {", ".join(sorted([k + " (" + LANGUAGES[k].title() + ")" for k in LANGUAGES]))}. Leave empty to detect language.', + 'code', '') + initial_prompt_option = QCommandLineOption(['p', 'prompt'], f'Initial prompt', 'prompt', '') + open_ai_access_token_option = QCommandLineOption('openai-token', + f'OpenAI access token. Use only when --model-type is {CommandLineModelType.OPEN_AI_WHISPER_API.value}. 
Defaults to your previously saved access token, if one exists.', + 'token') + srt_option = QCommandLineOption(['srt'], 'Output result in an SRT file.') + vtt_option = QCommandLineOption(['vtt'], 'Output result in a VTT file.') + txt_option = QCommandLineOption('txt', 'Output result in a TXT file.') + + parser.addOptions( + [task_option, model_type_option, model_size_option, hugging_face_model_id_option, language_option, + initial_prompt_option, open_ai_access_token_option, srt_option, vtt_option, txt_option]) + + parser.addHelpOption() + parser.addVersionOption() + + parser.process(app) + + # slice after first argument, the command + file_paths = parser.positionalArguments()[1:] + if len(file_paths) == 0: + raise CommandLineError('No input files') + + task = parse_enum_option(task_option, parser, Task) + + model_type = parse_enum_option(model_type_option, parser, CommandLineModelType) + model_size = parse_enum_option(model_size_option, parser, WhisperModelSize) + + hugging_face_model_id = parser.value(hugging_face_model_id_option) + + if hugging_face_model_id == '' and model_type == CommandLineModelType.HUGGING_FACE: + raise CommandLineError('--hfid is required when --model-type is huggingface') + + model = TranscriptionModel(model_type=ModelType[model_type.name], whisper_model_size=model_size, + hugging_face_model_id=hugging_face_model_id) + model_path = get_local_model_path(model) + + if model_path is None: + raise CommandLineError('Model not found') + + language = parser.value(language_option) + if language == '': + language = None + elif LANGUAGES.get(language) is None: + raise CommandLineError('Invalid language option') + + initial_prompt = parser.value(initial_prompt_option) + + output_formats: typing.Set[OutputFormat] = set() + if parser.isSet(srt_option): + output_formats.add(OutputFormat.SRT) + if parser.isSet(vtt_option): + output_formats.add(OutputFormat.VTT) + if parser.isSet(txt_option): + output_formats.add(OutputFormat.TXT) + + openai_access_token = parser.value(open_ai_access_token_option) + if model.model_type == ModelType.OPEN_AI_WHISPER_API and openai_access_token == '': + openai_access_token = KeyringStore().get_password(key=KeyringStore.Key.OPENAI_API_KEY) + + if openai_access_token == '': + raise CommandLineError('No OpenAI access token found') + + transcription_options = TranscriptionOptions(model=model, task=task, language=language, + initial_prompt=initial_prompt, + openai_access_token=openai_access_token) + file_transcription_options = FileTranscriptionOptions(file_paths=file_paths, output_formats=output_formats) + + for file_path in file_paths: + transcription_task = FileTranscriptionTask(file_path=file_path, model_path=model_path, + transcription_options=transcription_options, + file_transcription_options=file_transcription_options) + app.add_task(transcription_task) + + +T = typing.TypeVar("T", bound=enum.Enum) + + +def parse_enum_option(option: QCommandLineOption, parser: QCommandLineParser, enum_class: typing.Type[T]) -> T: + try: + return enum_class(parser.value(option)) + except ValueError: + raise CommandLineError(f'Invalid value for --{option.names()[-1]} option.') + + +def join_values(enum_class: typing.Type[enum.Enum]) -> str: + return ', '.join([v.value for v in enum_class]) diff --git a/buzz/gui.py b/buzz/gui.py index 3d61c03..5b53c47 100644 --- a/buzz/gui.py +++ b/buzz/gui.py @@ -3,11 +3,9 @@ import json import logging import os import sys -from datetime import datetime from enum import auto from typing import Dict, List, Optional, Tuple -import 
humanize import sounddevice from PyQt6 import QtGui from PyQt6.QtCore import (QObject, Qt, QThread, @@ -18,10 +16,9 @@ from PyQt6.QtGui import (QAction, QCloseEvent, QDesktopServices, QIcon, from PyQt6.QtNetwork import QNetworkAccessManager, QNetworkReply, QNetworkRequest from PyQt6.QtWidgets import (QApplication, QCheckBox, QComboBox, QDialog, QDialogButtonBox, QFileDialog, QLabel, QMainWindow, QMessageBox, QPlainTextEdit, - QProgressDialog, QPushButton, QVBoxLayout, QHBoxLayout, QWidget, QGroupBox, QTableWidget, + QPushButton, QVBoxLayout, QHBoxLayout, QWidget, QGroupBox, QTableWidget, QMenuBar, QFormLayout, QTableWidgetItem, QAbstractItemView, QListWidget, QListWidgetItem, QSizePolicy) -from whisper import tokenizer from buzz.cache import TasksCache from .__version__ import VERSION @@ -41,7 +38,7 @@ from .transcriber import (SUPPORTED_OUTPUT_FORMATS, FileTranscriptionOptions, Ou Task, TranscriptionOptions, FileTranscriberQueueWorker, FileTranscriptionTask, RecordingTranscriber, LOADED_WHISPER_DLL, - DEFAULT_WHISPER_TEMPERATURE) + DEFAULT_WHISPER_TEMPERATURE, LANGUAGES) from .widgets.line_edit import LineEdit from .widgets.model_download_progress_dialog import ModelDownloadProgressDialog from .widgets.model_type_combo_box import ModelTypeComboBox @@ -112,7 +109,7 @@ class LanguagesComboBox(QComboBox): super().__init__(parent) whisper_languages = sorted( - [(lang, tokenizer.LANGUAGES[lang].title()) for lang in tokenizer.LANGUAGES], key=lambda lang: lang[1]) + [(lang, LANGUAGES[lang].title()) for lang in LANGUAGES], key=lambda lang: lang[1]) self.languages = [('', _('Detect Language'))] + whisper_languages self.addItems([lang[1] for lang in self.languages]) @@ -259,7 +256,7 @@ class FileTranscriberWidget(QWidget): self.transcription_options = transcription_options self.word_level_timings_checkbox.setDisabled( self.transcription_options.model.model_type == ModelType.HUGGING_FACE or self.transcription_options.model.model_type == ModelType.OPEN_AI_WHISPER_API) - if self.transcription_options.openai_access_token is not None: + if self.transcription_options.openai_access_token != '': self.openai_access_token_changed.emit(self.transcription_options.openai_access_token) def on_click_run(self): @@ -950,7 +947,7 @@ class MainWindow(QMainWindow): for file_path in file_transcription_options.file_paths: task = FileTranscriptionTask( file_path, transcription_options, file_transcription_options, model_path) - self.transcriber_worker.add_task(task) + self.add_task(task) def update_task_table_row(self, task: FileTranscriptionTask): self.table_widget.upsert_task(task) @@ -1053,6 +1050,9 @@ class MainWindow(QMainWindow): transcription_task=task, parent=self, flags=Qt.WindowType.Window) transcription_viewer_widget.show() + def add_task(self, task: FileTranscriptionTask): + self.transcriber_worker.add_task(task) + def load_tasks_from_cache(self): tasks = self.tasks_cache.load() for task in tasks: @@ -1378,9 +1378,15 @@ class Application(QApplication): def __init__(self) -> None: super().__init__(sys.argv) + self.setApplicationName(APP_NAME) + self.setApplicationVersion(VERSION) + self.window = MainWindow() self.window.show() + def add_task(self, task: FileTranscriptionTask): + self.window.add_task(task) + class AdvancedSettingsDialog(QDialog): transcription_options: TranscriptionOptions diff --git a/buzz/settings/settings.py b/buzz/settings/settings.py index db64714..524b014 100644 --- a/buzz/settings/settings.py +++ b/buzz/settings/settings.py @@ -7,7 +7,8 @@ APP_NAME = 'Buzz' class Settings: - settings = 
QSettings(APP_NAME) + def __init__(self): + self.settings = QSettings(APP_NAME) class Key(enum.Enum): RECORDING_TRANSCRIBER_TASK = 'recording-transcriber/task' diff --git a/buzz/store/keyring_store.py b/buzz/store/keyring_store.py index 1b4adb1..b0795f8 100644 --- a/buzz/store/keyring_store.py +++ b/buzz/store/keyring_store.py @@ -11,9 +11,9 @@ class KeyringStore: class Key(enum.Enum): OPENAI_API_KEY = 'OpenAI API key' - def get_password(self, username: Key) -> str: + def get_password(self, key: Key) -> str: try: - password = keyring.get_password(APP_NAME, username=username.value) + password = keyring.get_password(APP_NAME, username=key.value) if password is None: return '' return password diff --git a/buzz/transcriber.py b/buzz/transcriber.py index c816c82..12e99ae 100644 --- a/buzz/transcriber.py +++ b/buzz/transcriber.py @@ -5,14 +5,12 @@ import json import logging import multiprocessing import os -import platform import queue import re -import subprocess import sys import tempfile import threading -from abc import ABC, abstractmethod +from abc import abstractmethod from dataclasses import dataclass, field from multiprocessing.connection import Connection from random import randint @@ -20,16 +18,16 @@ from threading import Thread from typing import Any, List, Optional, Tuple, Union, Set import faster_whisper -import openai - import ffmpeg import numpy as np +import openai import sounddevice import stable_whisper import tqdm import whisper from PyQt6.QtCore import QObject, QProcess, pyqtSignal, pyqtSlot, QThread from sounddevice import PortAudioError +from whisper import tokenizer from . import transformers_whisper from .conn import pipe_stderr @@ -62,6 +60,9 @@ class Segment: text: str +LANGUAGES = tokenizer.LANGUAGES + + @dataclass() class TranscriptionOptions: language: Optional[str] = None @@ -70,7 +71,7 @@ class TranscriptionOptions: word_level_timings: bool = False temperature: Tuple[float, ...] = DEFAULT_WHISPER_TEMPERATURE initial_prompt: str = '' - openai_access_token: Optional[str] = None + openai_access_token: str = '' @dataclass() diff --git a/cli.py b/cli.py deleted file mode 100644 index 54b08a6..0000000 --- a/cli.py +++ /dev/null @@ -1,2 +0,0 @@ -if __name__ == '__main__': - pass diff --git a/main.py b/main.py index 655a68a..fbf6b04 100644 --- a/main.py +++ b/main.py @@ -6,9 +6,10 @@ import platform import sys from typing import TextIO -from PyQt6.QtCore import QTranslator, QLocale from appdirs import user_log_dir +from buzz.cli import parse_command_line + # Check for segfaults if not running in frozen mode if getattr(sys, 'frozen', False) is False: faulthandler.enable() @@ -51,5 +52,5 @@ if __name__ == "__main__": from buzz.gui import Application app = Application() - + parse_command_line(app) sys.exit(app.exec())