diff --git a/gui.py b/gui.py index ac5cd346..7c797ccf 100644 --- a/gui.py +++ b/gui.py @@ -130,24 +130,6 @@ class QualityComboBox(QComboBox): self.quality_changed.emit(self.qualities[index]) -class DelaysComboBox(QComboBox): - """DelaysComboBox displays the list of available delays""" - delay_changed = pyqtSignal(int) - - def __init__(self, default_delay: int, parent: Optional[QWidget], *args) -> None: - super().__init__(parent, *args) - self.delays = [5, 10, 20, 30] - self.addItems(map(self.label, self.delays)) - self.currentIndexChanged.connect(self.on_index_changed) - self.setCurrentText(self.label(default_delay)) - - def on_index_changed(self, index: int): - self.delay_changed.emit(self.delays[index]) - - def label(self, delay: int): - return "%ds" % delay - - class TextDisplayBox(QPlainTextEdit): """TextDisplayBox is a read-only textbox""" @@ -260,18 +242,21 @@ class TranscriberWithSignal(QObject): status_changed = pyqtSignal(Status) - def __init__(self, model: whisper.Whisper, language: Optional[str], task: Task, parent: Optional[QWidget], *args) -> None: + def __init__( + self, model: whisper.Whisper, language: Optional[str], + task: Task, parent: Optional[QWidget], input_device_index: Optional[int], + *args, + ) -> None: super().__init__(parent, *args) self.transcriber = RecordingTranscriber( model=model, language=language, - status_callback=self.on_next_status, task=task) - - def start_recording(self, input_device_index: Optional[int], block_duration: int): - self.transcriber.start_recording( + status_callback=self.on_next_status, task=task, input_device_index=input_device_index, - block_duration=block_duration, ) + def start_recording(self): + self.transcriber.start_recording() + def on_next_status(self, status: Status): self.status_changed.emit(status) @@ -471,7 +456,6 @@ class RecordingTranscriberWidget(QWidget): selected_quality = Quality.LOW selected_language: Optional[str] = None selected_device_id: Optional[int] - selected_delay = 10 selected_task = Task.TRANSCRIBE model_download_progress_dialog: Optional[DownloadModelProgressDialog] = None @@ -501,10 +485,6 @@ class RecordingTranscriberWidget(QWidget): parent=self) self.tasks_combo_box.taskChanged.connect(self.on_task_changed) - delays_combo_box = DelaysComboBox( - default_delay=self.selected_delay, parent=self) - delays_combo_box.delay_changed.connect(self.on_delay_changed) - self.timer_label = TimerLabel(self) self.record_button = RecordButton(self) @@ -519,7 +499,6 @@ class RecordingTranscriberWidget(QWidget): ((0, 5, FormLabel('Quality:', self)), (5, 7, self.quality_combo_box)), ((0, 5, FormLabel('Microphone:', self)), (5, 7, self.audio_devices_combo_box)), - ((0, 5, FormLabel('Delay:', self)), (5, 7, delays_combo_box)), ((6, 3, self.timer_label), (9, 3, self.record_button)), ((0, 12, self.text_box),), ) @@ -559,9 +538,6 @@ class RecordingTranscriberWidget(QWidget): def on_task_changed(self, task: Task): self.selected_task = task - def on_delay_changed(self, delay: int): - self.selected_delay = delay - def start_recording(self): self.record_button.setDisabled(True) @@ -589,14 +565,12 @@ class RecordingTranscriberWidget(QWidget): model=model, language=self.selected_language, task=self.selected_task, + input_device_index=self.selected_device_id, parent=self ) self.transcriber.status_changed.connect( self.on_transcriber_status_changed) - self.transcriber.start_recording( - input_device_index=self.selected_device_id, - block_duration=self.selected_delay, - ) + self.transcriber.start_recording() def on_download_model_progress(self, current_size: int, total_size: int): if current_size == total_size: diff --git a/transcriber.py b/transcriber.py index ea663c0d..2cde8ded 100644 --- a/transcriber.py +++ b/transcriber.py @@ -3,9 +3,8 @@ import enum import logging import os import platform -import queue import subprocess -from threading import Thread +from threading import Lock, Thread from typing import Callable, Optional import numpy as np @@ -14,10 +13,6 @@ import whisper import _whisper -# When the app is opened as a .app from Finder, the path doesn't contain /usr/local/bin -# which breaks the call to run `ffmpeg`. This sets the path manually to fix that. -os.environ["PATH"] += os.pathsep + "/usr/local/bin" - class State(enum.Enum): STARTING_NEXT_TRANSCRIPTION = 0 @@ -41,28 +36,32 @@ class RecordingTranscriber: current_thread: Optional[Thread] current_stream: Optional[sounddevice.InputStream] is_running = False - MAX_QUEUE_SIZE = 10 def __init__(self, model: whisper.Whisper, language: Optional[str], - status_callback: Callable[[Status], None], task: Task) -> None: + status_callback: Callable[[Status], None], task: Task, + input_device_index: Optional[int] = None) -> None: self.model = model self.current_stream = None self.status_callback = status_callback self.language = language self.task = task - self.queue: queue.Queue[np.ndarray] = queue.Queue( - RecordingTranscriber.MAX_QUEUE_SIZE, - ) + self.input_device_index = input_device_index + self.sample_rate = self.get_device_sample_rate( + device_id=input_device_index) + self.n_batch_samples = 5 * self.sample_rate # every 5 seconds + # pause queueing if more than 3 batches behind + self.max_queue_size = 3 * self.n_batch_samples + self.queue = np.ndarray([], dtype=np.float32) + self.mutex = Lock() + self.text = '' - def start_recording(self, block_duration=10, input_device_index: Optional[int] = None): - sample_rate = self.get_device_sample_rate(device_id=input_device_index) - - logging.debug("Recording... language: \"%s\", model: \"%s\", task: \"%s\", device: \"%s\", block duration: \"%s\", sample rate: \"%s\"" % - (self.language, self.model._get_name(), self.task, input_device_index, block_duration, sample_rate)) + def start_recording(self): + logging.debug( + f'Recording, language = {self.language}, task = {self.task}, device = {self.input_device_index}, sample rate = {self.sample_rate}') self.current_stream = sounddevice.InputStream( - samplerate=sample_rate, - blocksize=block_duration * sample_rate, - device=input_device_index, dtype="float32", + samplerate=self.sample_rate, + blocksize=1 * self.sample_rate, # 1 sec + device=self.input_device_index, dtype="float32", channels=1, callback=self.stream_callback) self.current_stream.start() @@ -73,20 +72,31 @@ class RecordingTranscriber: def process_queue(self): while self.is_running: - try: - block = self.queue.get(block=False) + self.mutex.acquire() + if self.queue.size >= self.n_batch_samples: + batch = self.queue[:self.n_batch_samples] + self.queue = self.queue[self.n_batch_samples:] + self.mutex.release() + logging.debug( - 'Processing next frame. Current queue size: %d' % self.queue.qsize()) - self.status_callback(Status(State.STARTING_NEXT_TRANSCRIPTION)) - result = self.model.transcribe( - audio=block, language=self.language, task=self.task.value) - text = result.get("text") - logging.debug( - "Received next result of length: %s" % len(text)) + f'Processing next frame, samples = {batch.size}, total samples = {self.queue.size}, amplitude = {self.amplitude(batch)}') self.status_callback( - Status(State.FINISHED_CURRENT_TRANSCRIPTION, text)) - except queue.Empty: - continue + Status(State.STARTING_NEXT_TRANSCRIPTION)) + time_started = datetime.datetime.now() + + result = self.model.transcribe( + audio=batch, language=self.language, task=self.task.value, + initial_prompt=self.text) # prompt model with text from previous transcriptions + batch_text: str = result.get('text') + + logging.debug( + f'Received next result, length = {len(batch_text)}, time taken = {datetime.datetime.now() - time_started}') + self.status_callback( + Status(State.FINISHED_CURRENT_TRANSCRIPTION, batch_text)) + + self.text += f'\n\n{batch_text}' + else: + self.mutex.release() def get_device_sample_rate(self, device_id: Optional[int]) -> int: """Returns the sample rate to be used for recording. It uses the default sample rate @@ -106,13 +116,13 @@ class RecordingTranscriber: def stream_callback(self, in_data, frame_count, time_info, status): # Try to enqueue the next block. If the queue is already full, drop the block. - try: - chunk = in_data.ravel() - logging.debug('Received next chunk: length %s, amplitude %s, status "%s"' - % (len(chunk), (abs(max(chunk)) + abs(min(chunk))) / 2, status)) - self.queue.put(chunk, block=False) - except queue.Full: - return + chunk: np.ndarray = in_data.ravel() + with self.mutex: + if self.queue.size < self.max_queue_size: + self.queue = np.append(self.queue, chunk) + + def amplitude(self, arr: np.ndarray): + return (abs(max(arr)) + abs(min(arr))) / 2 def stop_recording(self): if self.current_stream != None: @@ -120,7 +130,6 @@ class RecordingTranscriber: logging.debug('Closed recording stream') self.is_running = False - self.queue.queue.clear() if self.current_thread != None: logging.debug('Waiting for processing thread to terminate')