From dff4215212e35375ff2d88a3fcf87b76859405b0 Mon Sep 17 00:00:00 2001 From: Chidi Williams Date: Sun, 25 Sep 2022 17:46:39 +0100 Subject: [PATCH] Add chunk drop factor and more logs --- Buzz.spec | 6 +++++- README.md | 30 ++++++++++++++++++++++++++++++ transcriber.py | 35 ++++++++++++++++++++++++++--------- 3 files changed, 61 insertions(+), 10 deletions(-) create mode 100644 README.md diff --git a/Buzz.spec b/Buzz.spec index 7aa8dadf..9a265162 100644 --- a/Buzz.spec +++ b/Buzz.spec @@ -22,7 +22,7 @@ a = Analysis( pathex=[], binaries=[], datas=datas, - hiddenimports=['apiclient', 'pytorch', '“sklearn.utils._cython_blas”', '“sklearn.neighbors.typedefs”', + hiddenimports=['apiclient', 'pyaudio', 'pytorch', '“sklearn.utils._cython_blas”', '“sklearn.neighbors.typedefs”', '“sklearn.neighbors.quad_tree”', '“sklearn.tree”', '“sklearn.tree._utils”'], hookspath=[], hooksconfig={}, @@ -67,4 +67,8 @@ app = BUNDLE( name='Buzz.app', icon=None, bundle_identifier=None, + version='0.0.1', + info_plist={ + 'NSMicrophoneUsageDescription': 'Please provide microphone access to continue' + } ) diff --git a/README.md b/README.md new file mode 100644 index 00000000..016513cb --- /dev/null +++ b/README.md @@ -0,0 +1,30 @@ +# Buzz + +Buzz transcribes audio from your computer's microphones to text using OpenAI's [Whisper](https://github.com/openai/whisper). 
+ +## Setup + +Whisper [requires ffmpeg to be installed on your computer](https://github.com/openai/whisper#setup): + +```text +# on Ubuntu or Debian +sudo apt update && sudo apt install ffmpeg + +# on MacOS using Homebrew (https://brew.sh/) +brew install ffmpeg + +# on Windows using Chocolatey (https://chocolatey.org/) +choco install ffmpeg + +# on Windows using Scoop (https://scoop.sh/) +scoop install ffmpeg +``` + +## Build + +To build Buzz, run: + +```shell +pip install -r requirements.txt +make buzz +``` diff --git a/transcriber.py b/transcriber.py index 50b212b6..4a5a6b52 100644 --- a/transcriber.py +++ b/transcriber.py @@ -1,5 +1,7 @@ import logging import os +import platform +import tempfile import wave from datetime import datetime from typing import Callable @@ -9,6 +11,10 @@ import whisper class Transcriber: + # Queue length, expressed as a multiple of frames_per_chunk, + # at which the transcriber stops queueing new frames + chunk_drop_factor = 5 + def __init__(self, model_name="tiny", text_callback: Callable[[str], None] = print) -> None: self.pyaudio = pyaudio.PyAudio() self.model = whisper.load_model(model_name) @@ -30,23 +36,26 @@ class Transcriber: self.stream.start_stream() - frames_per_chunk = int(rate / frames_per_buffer * chunk_duration) + self.frames_per_chunk = int(rate / frames_per_buffer * chunk_duration) while True: if self.stopped: self.frames = [] logging.debug("Recording stopped. Exiting...") return - if len(self.frames) > frames_per_chunk: + if len(self.frames) > self.frames_per_chunk: logging.debug("Buffer size: %d. Transcribing next %d frames..." % - (len(self.frames), frames_per_chunk)) + (len(self.frames), self.frames_per_chunk)) chunk_path = self.chunk_path() try: clip = [] - for i in range(0, frames_per_chunk): + # TODO: Breaking the audio into chunks might make it more difficult for + # Whisper to work. Could it be helpful to re-use a section of the previous + # chunk in the next iteration? 
+ for i in range(0, self.frames_per_chunk): clip.append(self.frames[i]) frames = b''.join(clip) - # TODO: Can we pass the chunk to whisper in-memory? + # TODO: Can the chunk be passed to whisper in-memory instead? self.write_chunk(chunk_path, channels, rate, frames) result = self.model.transcribe( @@ -58,14 +67,17 @@ class Transcriber: os.remove(chunk_path) - self.frames = self.frames[frames_per_chunk:] + # TODO: Implement dropping frames if the queue gets too large + self.frames = self.frames[self.frames_per_chunk:] except KeyboardInterrupt as e: self.stop_recording() os.remove(chunk_path) raise e def stream_callback(self, in_data, frame_count, time_info, status): - self.frames.append(in_data) + # Append the new frame only while the queue is shorter than chunk_drop_factor times frames_per_chunk + if (len(self.frames) / self.frames_per_chunk) < self.chunk_drop_factor: + self.frames.append(in_data) return in_data, pyaudio.paContinue def stop_recording(self): @@ -76,6 +88,7 @@ class Transcriber: self.pyaudio.terminate() def write_chunk(self, path, channels, rate, frames): + logging.debug('Writing chunk to path: %s' % path) wavefile = wave.open(path, 'wb') wavefile.setnchannels(channels) wavefile.setsampwidth( @@ -86,6 +99,10 @@ class Transcriber: return path def chunk_path(self): - base_dir = os.path.dirname(__file__) chunk_id = "clip-%s.wav" % (datetime.utcnow().strftime('%Y%m%d%H%M%S')) - return os.path.join(base_dir, chunk_id) + return os.path.join(self.tmp_dir(), chunk_id) + + # https://stackoverflow.com/a/43418319/9830227 + def tmp_dir(self): + # return tempfile.gettempdir() + return "/tmp" if platform.system() == "Darwin" else tempfile.gettempdir()