diff --git a/README.md b/README.md index 25c931f2..32772f49 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Buzz -Buzz transcribes audio from your computer's microphones to text using OpenAI's [Whisper](https://github.com/openai/whisper). +Buzz transcribes audio from your computer's microphones to text using OpenAI's [Whisper](https://github.com/openai/whisper). Buzz works by splitting audio recordings into chunks and transcribing the chunks to text using Whisper. ## Requirements diff --git a/gui.py b/gui.py index 133c763a..84b94a9e 100644 --- a/gui.py +++ b/gui.py @@ -1,5 +1,5 @@ import enum -from typing import List, Tuple +from typing import List, Optional, Tuple import pyaudio import whisper @@ -18,7 +18,7 @@ class Label(QLabel): class AudioDevicesComboBox(QComboBox): - """AudioDevicesComboBox is a combo box for selecting audio devices.""" + """AudioDevicesComboBox displays a list of available audio input devices""" deviceChanged = pyqtSignal(int) def __init__(self, *args) -> None: @@ -43,35 +43,56 @@ class AudioDevicesComboBox(QComboBox): def on_index_changed(self, index: int): self.deviceChanged.emit(self.audio_devices[index][0]) + def get_default_device_id(self): + return self.audio_devices[0][0] + class LanguagesComboBox(QComboBox): + """LanguagesComboBox displays a list of languages available to use with Whisper""" languageChanged = pyqtSignal(str) - def __init__(self, *args) -> None: + def __init__(self, default_language: str, *args) -> None: super().__init__(*args) - self.languages = {'none': 'Detect language', **tokenizer.LANGUAGES} + self.languages = {'': 'Detect language', **tokenizer.LANGUAGES} self.addItems(map(lambda lang: lang.title(), self.languages.values())) self.currentIndexChanged.connect(self.on_index_changed) + self.setCurrentText(self.languages.get(default_language, '').title()) def on_index_changed(self, index: int): - key = list(self.languages.values())[index] - self.languageChanged.emit( - self.languages.get(key) if key != 'none' else None) + key = list(self.languages.keys())[index] + self.languageChanged.emit(key) class ModelsComboBox(QComboBox): - """ModelsComboBox displays the list of available Whisper models for selection - """ - modelChanged = pyqtSignal(str) + """ModelsComboBox displays the list of available Whisper models for selection""" + modelNameChanged = pyqtSignal(str) + + def __init__(self, default_model_name: str, *args) -> None: + super().__init__(*args) + self.models = whisper.available_models() + self.addItems(map(self.label, self.models)) + self.currentIndexChanged.connect(self.on_index_changed) + self.setCurrentText(default_model_name) + + def on_index_changed(self, index: int): + self.modelNameChanged.emit(self.models[index]) + + def label(self, model_name: str): + name, lang = (model_name.split('.') + [None])[:2] + if lang: + return "%s (%s)" % (name.title(), lang.upper()) + return name.title() + + +class TextDisplayBox(QTextEdit): + """TextDisplayBox is a read-only textbox""" def __init__(self, *args) -> None: super().__init__(*args) - self.models = whisper.available_models() - self.addItems(self.models) - self.currentIndexChanged.connect(self.on_index_changed) - - def on_index_changed(self, index: int): - self.modelChanged.emit(self.models[index]) + self.setReadOnly(True) + self.setPlaceholderText('Click Record to begin...') + self.setStyleSheet( + 'QTextEdit { padding-left: 5; padding-top: 5; padding-bottom: 5; padding-right: 5; background-color: #151515; border-radius: 6; background-color: #1e1e1e; }') class RecordButton(QPushButton): @@ -88,7 +109,7 @@ class RecordButton(QPushButton): self.statusChanged.connect(self.on_status_changed) def on_click_record(self): - current_status: self.Status + current_status: RecordButton.Status if self.current_status == self.Status.RECORDING: current_status = self.Status.STOPPED else: @@ -113,9 +134,10 @@ class TranscriberWorker(QObject): text = pyqtSignal(str) finished = pyqtSignal() - def __init__(self, input_device_index: int = None, *args) -> None: + def __init__(self, model_name: str, language: Optional[str], input_device_index: Optional[int], *args) -> None: super().__init__(*args) - self.transcriber = Transcriber(text_callback=self.on_next_text) + self.transcriber = Transcriber( + model_name=model_name, language=language, text_callback=self.on_next_text) self.input_device_index = input_device_index def run(self): @@ -132,15 +154,14 @@ class TranscriberWorker(QObject): class Application(QApplication): current_status = RecordButton.Status.STOPPED - selected_device_id: int = None + thread: Optional[QThread] = None + selected_model_name = 'tiny' + selected_language = 'en' + selected_device_id: int def __init__(self) -> None: super().__init__([]) - self.setStyleSheet("""QComboBox { - color: #eee; - }""") - self.window = QWidget() self.window.setFixedSize(400, 400) @@ -150,37 +171,41 @@ class Application(QApplication): self.audio_devices_combo_box = AudioDevicesComboBox() self.audio_devices_combo_box.deviceChanged.connect( self.on_device_changed) + self.selected_device_id = self.audio_devices_combo_box.get_default_device_id() - self.record_button = RecordButton() - self.record_button.statusChanged.connect(self.on_status_changed) + record_button = RecordButton() + record_button.statusChanged.connect(self.on_status_changed) - self.text_box = self.text_box() + self.text_box = TextDisplayBox() + + models_combo_box = ModelsComboBox( + default_model_name=self.selected_model_name) + models_combo_box.modelNameChanged.connect(self.on_model_changed) + + languages_combo_box = LanguagesComboBox( + default_language=self.selected_language) + languages_combo_box.languageChanged.connect(self.on_language_changed) layout.addWidget(Label('Model:'), 0, 0, 1, 3) - layout.addWidget(ModelsComboBox(), 0, 3, 1, 9) + layout.addWidget(models_combo_box, 0, 3, 1, 9) layout.addWidget(Label('Language:'), 1, 0, 1, 3) - layout.addWidget(LanguagesComboBox(), 1, 3, 1, 9) + layout.addWidget(languages_combo_box, 1, 3, 1, 9) layout.addWidget(Label('Microphone:'), 2, 0, 1, 3) layout.addWidget(self.audio_devices_combo_box, 2, 3, 1, 9) - layout.addWidget(self.record_button, 3, 9, 1, 3) + layout.addWidget(record_button, 3, 9, 1, 3) layout.addWidget(self.text_box, 4, 0, 1, 12) self.window.show() - def text_box(self): - box = QTextEdit() - box.setReadOnly(True) - box.setPlaceholderText('Click Record to begin...') - box.setStyleSheet( - 'QTextEdit { padding-left:10; padding-top:10; padding-bottom:10; padding-right:10; background-color: #151515; border-radius: 6; }') - return box - + # TODO: might be great to send when the text has been updated rather than appending def on_next_text(self, text: str): - self.text_box.append(text) + self.text_box.moveCursor(QTextCursor.MoveOperation.End) + self.text_box.insertPlainText(text) + self.text_box.moveCursor(QTextCursor.MoveOperation.End) def on_device_changed(self, device_id: int): self.selected_device_id = device_id @@ -193,26 +218,47 @@ class Application(QApplication): self.audio_devices_combo_box.setDisabled(False) self.stop_recording() + def on_model_changed(self, model_name: str): + self.selected_model_name = model_name + + def on_language_changed(self, language: str): + self.selected_language = language + def start_recording(self): - # Clear text box placeholder + # Clear text box placeholder because the first chunk takes a while to process self.text_box.setPlaceholderText('') - # Thread needs to be attached to app object to live after end of method + # Transcribing the recording chunks is a blocking + # process, so we handle this in a new thread + + # Wait for previous thread to complete in case stop_recording isn't yet done + if self.thread != None: + self.thread.wait() + self.thread = QThread() self.transcriber_worker = TranscriberWorker( - input_device_index=self.selected_device_id) + input_device_index=self.selected_device_id, + model_name=self.selected_model_name, + language=self.selected_language if self.selected_language != '' else None, + ) self.transcriber_worker.moveToThread(self.thread) + # Connect worker and thread such that the worker runs once + # the thread starts and the thread quits once the worker finishes self.thread.started.connect(self.transcriber_worker.run) self.transcriber_worker.finished.connect(self.thread.quit) self.transcriber_worker.finished.connect( self.transcriber_worker.deleteLater) - self.thread.finished.connect(self.thread.deleteLater) + self.thread.finished.connect(self.clean_up_thread) self.transcriber_worker.text.connect(self.on_next_text) self.thread.start() + def clean_up_thread(self): + self.thread.deleteLater() + self.thread = None + def stop_recording(self): self.transcriber_worker.stop_recording() diff --git a/transcriber.py b/transcriber.py index e8439598..f31f99ee 100644 --- a/transcriber.py +++ b/transcriber.py @@ -1,10 +1,10 @@ import logging import os -import platform +import sys import tempfile import wave from datetime import datetime -from typing import Callable +from typing import Callable, Optional import pyaudio import whisper @@ -15,12 +15,15 @@ os.environ["PATH"] += os.pathsep + "/usr/local/bin" class Transcriber: + """Transcriber records audio from a system microphone and transcribes it into text using Whisper.""" + # Number of times the queue is greater than the frames_per_chunk # after which the transcriber will stop queueing new frames chunk_drop_factor = 5 - def __init__(self, model_name="tiny", language=None, text_callback: Callable[[str], None] = print) -> None: + def __init__(self, model_name: str, language: Optional[str], text_callback: Callable[[str], None]) -> None: self.pyaudio = pyaudio.PyAudio() + self.model_name = model_name self.model = whisper.load_model(model_name) self.stream = None self.frames = [] @@ -29,8 +32,9 @@ class Transcriber: self.language = language def start_recording(self, frames_per_buffer=1024, sample_format=pyaudio.paInt16, - channels=1, rate=44100, chunk_duration=4, input_device_index=None): - logging.debug("Recording...") + channels=1, rate=44100, chunk_duration=5, input_device_index: Optional[int] = None): + logging.debug("Recording with language \"%s\", model \"%s\"" % + (self.language, self.model_name)) self.stream = self.pyaudio.open(format=sample_format, channels=channels, rate=rate, @@ -77,7 +81,7 @@ class Transcriber: except KeyboardInterrupt as e: self.stop_recording() os.remove(chunk_path) - raise e + sys.exit(0) def stream_callback(self, in_data, frame_count, time_info, status): # Append new frame only if the queue is not larger than the chunk drop factor @@ -86,11 +90,12 @@ class Transcriber: return in_data, pyaudio.paContinue def stop_recording(self): - logging.debug("Ending recording...") - self.stopped = True - self.stream.stop_stream() - self.stream.close() - self.pyaudio.terminate() + if self.stream != None: + logging.debug("Ending recording...") + self.stopped = True + self.stream.stop_stream() + self.stream.close() + self.pyaudio.terminate() def write_chunk(self, path, channels, rate, frames): logging.debug('Writing chunk to path: %s' % path) @@ -103,11 +108,9 @@ class Transcriber: wavefile.close() return path - def chunk_path(self): + def chunk_path(self) -> str: + """Returns the path where a chunk should be saved using the + system's temp directory and a unique filename. + """ chunk_id = "clip-%s.wav" % (datetime.utcnow().strftime('%Y%m%d%H%M%S')) return os.path.join(tempfile.gettempdir(), chunk_id) - - # https://stackoverflow.com/a/43418319/9830227 - def tmp_dir(self): - # return tempfile.gettempdir() - return "/tmp" if platform.system() == "Darwin" else tempfile.gettempdir()