From dff4215212e35375ff2d88a3fcf87b76859405b0 Mon Sep 17 00:00:00 2001
From: Chidi Williams <williamschidi1@gmail.com>
Date: Sun, 25 Sep 2022 17:46:39 +0100
Subject: [PATCH] Add chunk drop factor and more logs

---
 Buzz.spec      |  6 +++++-
 README.md      | 30 ++++++++++++++++++++++++++++++
 transcriber.py | 35 ++++++++++++++++++++++++++---------
 3 files changed, 61 insertions(+), 10 deletions(-)
 create mode 100644 README.md

diff --git a/Buzz.spec b/Buzz.spec
index 7aa8dadf..9a265162 100644
--- a/Buzz.spec
+++ b/Buzz.spec
@@ -22,7 +22,7 @@ a = Analysis(
     pathex=[],
     binaries=[],
     datas=datas,
-    hiddenimports=['apiclient', 'pytorch', '“sklearn.utils._cython_blas”', '“sklearn.neighbors.typedefs”',
+    hiddenimports=['apiclient', 'pyaudio', 'pytorch', '“sklearn.utils._cython_blas”', '“sklearn.neighbors.typedefs”',
                    '“sklearn.neighbors.quad_tree”', '“sklearn.tree”', '“sklearn.tree._utils”'],
     hookspath=[],
     hooksconfig={},
@@ -67,4 +67,8 @@ app = BUNDLE(
     name='Buzz.app',
     icon=None,
     bundle_identifier=None,
+    version='0.0.1',
+    info_plist={
+        'NSMicrophoneUsageDescription': 'Please provide microphone access to continue'
+    }
 )
diff --git a/README.md b/README.md
new file mode 100644
index 00000000..016513cb
--- /dev/null
+++ b/README.md
@@ -0,0 +1,30 @@
+# Buzz
+
+Buzz transcribes audio from your computer's microphones to text using OpenAI's [Whisper](https://github.com/openai/whisper).
+
+## Setup
+
+Whisper [requires ffmpeg to be installed on your computer](https://github.com/openai/whisper#setup):
+
+```text
+# on Ubuntu or Debian
+sudo apt update && sudo apt install ffmpeg
+
+# on macOS using Homebrew (https://brew.sh/)
+brew install ffmpeg
+
+# on Windows using Chocolatey (https://chocolatey.org/)
+choco install ffmpeg
+
+# on Windows using Scoop (https://scoop.sh/)
+scoop install ffmpeg
+```
+
+## Build
+
+To build Buzz, run:
+
+```shell
+pip install -r requirements.txt
+make buzz
+```
diff --git a/transcriber.py b/transcriber.py
index 50b212b6..4a5a6b52 100644
--- a/transcriber.py
+++ b/transcriber.py
@@ -1,5 +1,7 @@
 import logging
 import os
+import platform
+import tempfile
 import wave
 from datetime import datetime
 from typing import Callable
@@ -9,6 +11,10 @@ import whisper
 
 
 class Transcriber:
+    # Queue length, expressed as a multiple of frames_per_chunk, at
+    # which the transcriber stops queueing new frames
+    chunk_drop_factor = 5
+
     def __init__(self, model_name="tiny", text_callback: Callable[[str], None] = print) -> None:
         self.pyaudio = pyaudio.PyAudio()
         self.model = whisper.load_model(model_name)
@@ -30,23 +36,26 @@ class Transcriber:
 
         self.stream.start_stream()
 
-        frames_per_chunk = int(rate / frames_per_buffer * chunk_duration)
+        self.frames_per_chunk = int(rate / frames_per_buffer * chunk_duration)
         while True:
             if self.stopped:
                 self.frames = []
                 logging.debug("Recording stopped. Exiting...")
                 return
-            if len(self.frames) > frames_per_chunk:
+            if len(self.frames) > self.frames_per_chunk:
                 logging.debug("Buffer size: %d. Transcribing next %d frames..." %
-                              (len(self.frames), frames_per_chunk))
+                              (len(self.frames), self.frames_per_chunk))
                 chunk_path = self.chunk_path()
                 try:
                     clip = []
-                    for i in range(0, frames_per_chunk):
+                    # TODO: Breaking the audio into chunks might make it more difficult for
+                    # Whisper to work. Could it be helpful to re-use a section of the previous
+                    # chunk in the next iteration?
+                    for i in range(0, self.frames_per_chunk):
                         clip.append(self.frames[i])
                     frames = b''.join(clip)
 
-                    # TODO: Can we pass the chunk to whisper in-memory?
+                    # TODO: Can the chunk be passed to whisper in-memory instead?
                     self.write_chunk(chunk_path, channels, rate, frames)
 
                     result = self.model.transcribe(
@@ -58,14 +67,17 @@ class Transcriber:
 
                     os.remove(chunk_path)
 
-                    self.frames = self.frames[frames_per_chunk:]
+                    # TODO: Implement dropping frames if the queue gets too large
+                    self.frames = self.frames[self.frames_per_chunk:]
                 except KeyboardInterrupt as e:
                     self.stop_recording()
                     os.remove(chunk_path)
                     raise e
 
     def stream_callback(self, in_data, frame_count, time_info, status):
-        self.frames.append(in_data)
+        # Append the new frame only while the queue holds fewer than chunk_drop_factor chunks of frames
+        if (len(self.frames) / self.frames_per_chunk) < self.chunk_drop_factor:
+            self.frames.append(in_data)
         return in_data, pyaudio.paContinue
 
     def stop_recording(self):
@@ -76,6 +88,7 @@ class Transcriber:
         self.pyaudio.terminate()
 
     def write_chunk(self, path, channels, rate, frames):
+        logging.debug('Writing chunk to path: %s' % path)
         wavefile = wave.open(path, 'wb')
         wavefile.setnchannels(channels)
         wavefile.setsampwidth(
@@ -86,6 +99,10 @@ class Transcriber:
         return path
 
     def chunk_path(self):
-        base_dir = os.path.dirname(__file__)
         chunk_id = "clip-%s.wav" % (datetime.utcnow().strftime('%Y%m%d%H%M%S'))
-        return os.path.join(base_dir, chunk_id)
+        return os.path.join(self.tmp_dir(), chunk_id)
+
+    # https://stackoverflow.com/a/43418319/9830227
+    def tmp_dir(self):
+        # return tempfile.gettempdir()
+        return "/tmp" if platform.system() == "Darwin" else tempfile.gettempdir()