Add speed control (#1224)

Co-authored-by: Raivis Dejus <orvils@gmail.com>
This commit is contained in:
Shlomi 2025-09-03 03:38:46 -05:00 committed by GitHub
commit d285e6e43d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 2394 additions and 48 deletions

View file

@ -56,6 +56,9 @@ class Settings:
)
MAIN_WINDOW = "main-window"
TRANSCRIPTION_VIEWER = "transcription-viewer"
AUDIO_PLAYBACK_RATE = "audio/playback-rate"
FORCE_CPU = "force-cpu"
@ -100,16 +103,25 @@ class Settings:
return ""
def value(
self,
key: Key,
default_value: typing.Any,
value_type: typing.Optional[type] = None,
self,
key: Key,
default_value: typing.Any,
value_type: typing.Optional[type] = None,
) -> typing.Any:
return self.settings.value(
val = self.settings.value(
key.value,
default_value,
value_type if value_type is not None else type(default_value),
)
if (value_type is bool or isinstance(default_value, bool)):
if isinstance(val, bool):
return val
if isinstance(val, str):
return val.lower() in ("true", "1", "yes", "on")
if isinstance(val, int):
return val != 0
return bool(val)
return val
def clear(self):
    """Clear the backing settings object by delegating to its ``clear()``."""
    self.settings.clear()

View file

@ -22,6 +22,9 @@ class Shortcut(str, enum.Enum):
VIEW_TRANSCRIPT_TEXT = ("Ctrl+E", _("View Transcript Text"))
VIEW_TRANSCRIPT_TRANSLATION = ("Ctrl+L", _("View Transcript Translation"))
VIEW_TRANSCRIPT_TIMESTAMPS = ("Ctrl+T", _("View Transcript Timestamps"))
SEARCH_TRANSCRIPT = ("Ctrl+F", _("Search Transcript"))
SCROLL_TO_CURRENT_TEXT = ("Ctrl+G", _("Scroll to Current Text"))
TOGGLE_PLAYBACK_CONTROLS = ("Ctrl+P", _("Toggle Playback Controls"))
CLEAR_HISTORY = ("Ctrl+S", _("Clear History"))
STOP_TRANSCRIPTION = ("Ctrl+X", _("Cancel Transcription"))

View file

@ -1,3 +1,4 @@
import logging
from typing import Tuple, Optional
from PyQt6 import QtGui
@ -6,6 +7,7 @@ from PyQt6.QtMultimedia import QAudioOutput, QMediaPlayer
from PyQt6.QtWidgets import QWidget, QSlider, QPushButton, QLabel, QHBoxLayout
from buzz.widgets.icon import PlayIcon, PauseIcon
from buzz.settings.settings import Settings
class AudioPlayer(QWidget):
@ -18,6 +20,10 @@ class AudioPlayer(QWidget):
self.position_ms = 0
self.duration_ms = 0
self.invalid_media = None
self.is_looping = False # Flag to prevent recursive position changes
# Initialize settings
self.settings = Settings()
self.audio_output = QAudioOutput()
self.audio_output.setVolume(100)
@ -26,6 +32,11 @@ class AudioPlayer(QWidget):
self.media_player.setSource(QUrl.fromLocalFile(file_path))
self.media_player.setAudioOutput(self.audio_output)
# Speed controls live in the transcription viewer; here we only restore the saved playback rate
saved_rate = self.settings.value(Settings.Key.AUDIO_PLAYBACK_RATE, 1.0, float)
saved_rate = max(0.1, min(5.0, saved_rate)) # Ensure valid range
self.media_player.setPlaybackRate(saved_rate)
self.scrubber = QSlider(Qt.Orientation.Horizontal)
self.scrubber.setRange(0, 0)
self.scrubber.sliderMoved.connect(self.on_slider_moved)
@ -36,16 +47,19 @@ class AudioPlayer(QWidget):
self.play_button = QPushButton("")
self.play_button.setIcon(self.play_icon)
self.play_button.clicked.connect(self.toggle_play)
self.play_button.setMaximumWidth(40) # Match other button widths
self.play_button.setMinimumHeight(30) # Match other button heights
self.time_label = QLabel()
self.time_label.setAlignment(Qt.AlignmentFlag.AlignRight)
layout = QHBoxLayout()
layout.addWidget(self.play_button, alignment=Qt.AlignmentFlag.AlignVCenter)
layout.addWidget(self.scrubber, alignment=Qt.AlignmentFlag.AlignVCenter)
layout.addWidget(self.time_label, alignment=Qt.AlignmentFlag.AlignVCenter)
# Create main layout - simplified without speed controls
main_layout = QHBoxLayout()
main_layout.addWidget(self.play_button, alignment=Qt.AlignmentFlag.AlignVCenter)
main_layout.addWidget(self.scrubber, alignment=Qt.AlignmentFlag.AlignVCenter)
main_layout.addWidget(self.time_label, alignment=Qt.AlignmentFlag.AlignVCenter)
self.setLayout(layout)
self.setLayout(main_layout)
# Connect media player signals to the corresponding slots
self.media_player.durationChanged.connect(self.on_duration_changed)
@ -68,10 +82,15 @@ class AudioPlayer(QWidget):
# If a range has been selected and we've reached the end of the range,
# loop back to the start of the range
if self.range_ms is not None:
if self.range_ms is not None and not self.is_looping:
start_range_ms, end_range_ms = self.range_ms
if position_ms > end_range_ms:
# Check if we're at or past the end of the range (with small buffer for precision)
if position_ms >= (end_range_ms - 50): # Within 50ms of end
logging.debug(f"🔄 LOOP: Reached end {end_range_ms}ms, jumping to start {start_range_ms}ms")
self.is_looping = True # Set flag to prevent recursion
self.set_position(start_range_ms)
# Reset flag immediately after setting position
self.is_looping = False
def on_playback_state_changed(self, state: QMediaPlayer.PlaybackState):
if state == QMediaPlayer.PlaybackState.PlayingState:
@ -93,6 +112,10 @@ class AudioPlayer(QWidget):
self.scrubber.setRange(0, 1)
self.scrubber.setDisabled(True)
self.time_label.setDisabled(True)
else:
self.play_button.setEnabled(True)
self.scrubber.setEnabled(True)
self.time_label.setEnabled(True)
def toggle_play(self):
if self.media_player.playbackState() == QMediaPlayer.PlaybackState.PlayingState:
@ -101,13 +124,31 @@ class AudioPlayer(QWidget):
self.media_player.play()
def set_range(self, range_ms: Tuple[int, int]):
    """Store ``range_ms`` as the loop range.

    Playback is moved to the start of the range only when the current
    position lies before the start or after the end; a position already
    inside the range is left untouched.
    """
    self.range_ms = range_ms
    loop_start, loop_end = range_ms

    outside_range = self.position_ms < loop_start or self.position_ms > loop_end
    if not outside_range:
        return

    logging.debug(f"🔄 LOOP: Position {self.position_ms}ms outside range, jumping to {loop_start}ms")
    self.set_position(loop_start)
def clear_range(self):
    """Clear the current loop range by resetting ``range_ms`` to ``None``."""
    self.range_ms = None
def _reset_looping_flag(self):
    """Reset the looping flag by setting ``is_looping`` to ``False``."""
    # NOTE(review): no caller of this helper is visible in this excerpt —
    # confirm it is still used before relying on it.
    self.is_looping = False
def on_slider_moved(self, position_ms: int):
    """Seek to ``position_ms`` in response to the scrubber being moved.

    An active loop range survives small scrubs: it is cleared only when the
    new position is more than 2000 ms before its start or more than
    2000 ms after its end.
    """
    self.set_position(position_ms)

    if self.range_ms is None:
        return

    loop_start, loop_end = self.range_ms
    too_early = position_ms < (loop_start - 2000)
    too_late = position_ms > (loop_end + 2000)
    if too_early or too_late:
        self.range_ms = None
def set_position(self, position_ms: int):
    """Forward ``position_ms`` to the media player's ``setPosition``."""
    self.media_player.setPosition(position_ms)

View file

@ -89,6 +89,13 @@ class VisibilityIcon(Icon):
)
class ScrollToCurrentIcon(Icon):
    """Icon subclass that loads the ``visibility`` SVG asset."""

    def __init__(self, parent: QWidget):
        # NOTE(review): the file name is the generic "visibility" glyph —
        # confirm that reusing it for this icon is intended.
        svg_path = get_path("assets/visibility_FILL0_wght700_GRAD0_opsz48.svg")
        super().__init__(svg_path, parent)
BUZZ_ICON_PATH = get_path("assets/buzz.ico")
BUZZ_LARGE_ICON_PATH = get_path("assets/buzz-icon-1024.png")

View file

@ -182,3 +182,12 @@ class TranscriptionSegmentsEditorWidget(QTableView):
def segments(self) -> list[QSqlRecord]:
    """Return the model's record for every row, in row order."""
    records = []
    for row in range(self.model().rowCount()):
        records.append(self.model().record(row))
    return records
def highlight_and_scroll_to_row(self, row_index: int):
    """Select ``row_index`` and scroll the view so the row is centred.

    An index below zero or at/after the model's row count is ignored.
    """
    if not (0 <= row_index < self.model().rowCount()):
        return

    self.selectRow(row_index)

    # Column 0 of the row serves as the scroll target.
    target = self.model().index(row_index, 0)
    self.scrollTo(target, QAbstractItemView.ScrollHint.PositionAtCenter)

View file

@ -21,9 +21,24 @@ OpenAI's [Whisper](https://github.com/openai/whisper).
VTT ([Demo](https://www.loom.com/share/cf263b099ac3481082bb56d19b7c87fe))
- Transcription and translation from your computer's microphones to text (Resource-intensive and may not be
real-time, [Demo](https://www.loom.com/share/564b753eb4d44b55b985b8abd26b55f7))
- **Advanced Transcription Viewer** with search, playback controls, and speed adjustment
- **Smart Interface** with conditional visibility and state persistence
- **Professional Controls** including loop segments, follow audio, and keyboard shortcuts
- Supports [Whisper](https://github.com/openai/whisper#available-models-and-languages),
[Whisper.cpp](https://github.com/ggerganov/whisper.cpp), [Faster Whisper](https://github.com/guillaumekln/faster-whisper),
[Whisper-compatible Hugging Face models](https://huggingface.co/models?other=whisper), and
the [OpenAI Whisper API](https://platform.openai.com/docs/api-reference/introduction)
- [Command-Line Interface](#command-line-interface)
- Available on Mac, Windows, and Linux
## Transcription Viewer Interface
Buzz features a powerful transcription viewer that makes it easy to work with your transcriptions:
- **🔍 Smart Search**: Find text quickly with real-time search and navigation
- **🎵 Playback Controls**: Loop segments, follow audio, and adjust playback speed
- **⌨️ Keyboard Shortcuts**: Efficient navigation with Ctrl+F, Ctrl+G, Ctrl+P, and more
- **🎨 Clean Interface**: Conditional visibility keeps the interface uncluttered
- **💾 State Persistence**: Remembers your preferences between sessions
[Learn more about the Transcription Viewer Interface →](usage/5_transcription_viewer)

View file

@ -0,0 +1,118 @@
# Transcription Viewer Interface
The Buzz transcription viewer provides a powerful interface for reviewing, editing, and navigating through your transcriptions. This guide covers all the features available in the transcription viewer.
## Overview
The transcription viewer is organized into several key sections:
- **Top Toolbar**: Contains the view mode, export, translate, resize, playback controls, find, and scroll-to-current buttons
- **Search Bar**: Find and navigate through transcript text
- **Transcription Segments**: Table view of all transcription segments with timestamps
- **Playback Controls**: Audio playback settings and speed controls (since version 1.3.0)
- **Audio Player**: Standard media player with progress bar
- **Current Segment Display**: Shows the currently selected or playing segment
## Top Toolbar
### View Mode Button
- **Function**: Switch between different viewing modes
- **Options**:
- **Timestamps**: Shows segments in a table format with start/end times
- **Text**: Shows combined text without timestamps
- **Translation**: Shows translated text (if available)
### Export Button
- **Function**: Export transcription in various formats
- **Formats**: SRT, VTT, TXT, JSON, and more
- **Usage**: Click to open export menu and select desired format
### Translate Button
- **Function**: Translate transcription to different languages
- **Usage**: Click to open translation settings and start translation
### Resize Button
- **Function**: Adjust transcription segment boundaries
- **Usage**: Click to open resize dialog for fine-tuning timestamps
- **More information**: See [Edit and Resize](https://chidiwilliams.github.io/buzz/docs/usage/edit_and_resize) section
### Playback Controls Button
(since version 1.3.0)
- **Function**: Show/hide playback control panel
- **Shortcut**: `Ctrl+P` (Windows/Linux) or `Cmd+P` (macOS)
- **Behavior**: Toggle button that shows/hides the playback controls below
### Find Button
(since version 1.3.0)
- **Function**: Show/hide search functionality
- **Shortcut**: `Ctrl+F` (Windows/Linux) or `Cmd+F` (macOS)
- **Behavior**: Toggle button that shows/hides the search bar
### Scroll to Current Button
(since version 1.3.0)
- **Function**: Automatically scroll to the currently playing text
- **Shortcut**: `Ctrl+G` (Windows/Linux) or `Cmd+G` (macOS)
- **Usage**: Click to jump to the current audio position in the transcript
## Search Functionality
(since version 1.3.0)
### Search Bar
The search bar appears below the toolbar when activated and provides:
- **Search Input**: Type text to find in the transcription (wider input field for better usability)
- **Navigation**: Up/down arrows to move between matches
- **Status**: Shows current match position and total matches (e.g., "3 of 15 matches")
- **Clear**: Remove search text and results (larger button for better accessibility)
- **Results**: Displays found text with context
- **Consistent Button Sizing**: All navigation buttons have uniform height for better visual consistency
### Search Shortcuts
- **`Ctrl+F` / `Cmd+F`**: Toggle search bar on/off
- **`Enter`**: Find next match
- **`Shift+Enter`**: Find previous match
- **`Escape`**: Close search bar
### Search Features
- **Real-time Search**: Results update as you type
- **Case-insensitive**: Finds matches regardless of capitalization
- **Word Boundaries**: Respects word boundaries for accurate matching
- **Cross-view Search**: Works in all view modes (Timestamps, Text, Translation)
## Playback Controls
(since version 1.3.0)
### Loop Segment
- **Function**: Automatically loop playback of selected segments
- **Usage**: Check the "Loop Segment" checkbox
- **Behavior**: When enabled, clicking on a transcript segment will set a loop range
- **Visual Feedback**: Loop range is highlighted in the audio player
### Follow Audio
- **Function**: Automatically scroll to current audio position
- **Usage**: Check the "Follow Audio" checkbox
- **Behavior**: Transcript automatically follows the audio playback
- **Benefits**: Easy to follow along with long audio files
### Speed Controls
- **Function**: Adjust audio playback speed
- **Range**: 0.5x to 2.0x speed
- **Controls**:
- **Speed Dropdown**: Select from preset speeds or enter custom value
- **Decrease Button (-)**: Reduce speed by 0.05x increments
- **Increase Button (+)**: Increase speed by 0.05x increments
- **Persistence**: Speed setting is saved between sessions
- **Button Sizing**: Speed control buttons match the size of search navigation buttons for visual consistency
## Keyboard Shortcuts
### Navigation
- **`Ctrl+F` / `Cmd+F`**: Toggle search bar
- **`Ctrl+P` / `Cmd+P`**: Toggle playback controls
- **`Ctrl+G` / `Cmd+G`**: Scroll to current position
- **`Ctrl+O` / `Cmd+O`**: Open file import dialog
### Search
- **`Enter`**: Find next match
- **`Shift+Enter`**: Find previous match
- **`Escape`**: Close search bar

View file

@ -1,12 +1,19 @@
import os
import pytest
from PyQt6.QtCore import QTime
from PyQt6.QtMultimedia import QMediaPlayer
from PyQt6.QtWidgets import QHBoxLayout
from pytestqt.qtbot import QtBot
from buzz.widgets.audio_player import AudioPlayer
from tests.audio import test_audio_path
from buzz.settings.settings import Settings
def assert_approximately_equal(actual, expected, tolerance=0.001):
    """Assert that ``actual`` is strictly within ``tolerance`` of ``expected``.

    Intended for floating-point comparisons where exact equality is too strict.
    """
    difference = abs(actual - expected)
    assert difference < tolerance, f"Value {actual} is not approximately equal to {expected}"
class TestAudioPlayer:
@ -42,3 +49,109 @@ class TestAudioPlayer:
widget.on_playback_state_changed(QMediaPlayer.PlaybackState.StoppedState)
assert widget.play_button.icon().themeName() == widget.play_icon.themeName()
def test_should_have_basic_audio_controls(self, qtbot: QtBot):
    """The player exposes its basic controls and points at the test file."""
    player = AudioPlayer(test_audio_path)
    qtbot.add_widget(player)

    # Speed controls were moved to the transcription viewer, so only the
    # basic audio player controls are checked here.
    assert player.play_button is not None
    assert player.scrubber is not None
    assert player.time_label is not None

    # The media player must exist and reference the file we passed in.
    assert player.media_player is not None
    loaded_path = os.path.normpath(player.media_player.source().toLocalFile())
    expected_path = os.path.normpath(test_audio_path)
    assert loaded_path == expected_path
def test_should_change_playback_rate_directly(self, qtbot: QtBot):
    """A rate set on the media player is reported back by ``playbackRate()``."""
    widget = AudioPlayer(test_audio_path)
    qtbot.add_widget(widget)

    # Speed controls moved to transcription viewer - test basic playback rate functionality
    widget.media_player.setPlaybackRate(1.5)
    assert_approximately_equal(widget.media_player.playbackRate(), 1.5)
def test_should_handle_custom_playback_rates(self, qtbot: QtBot):
    """A non-preset rate (1.7) round-trips through the media player."""
    player = AudioPlayer(test_audio_path)
    qtbot.add_widget(player)

    # Speed controls moved to the transcription viewer; only the media
    # player's own rate handling is exercised here.
    custom_rate = 1.7
    player.media_player.setPlaybackRate(custom_rate)
    assert_approximately_equal(player.media_player.playbackRate(), custom_rate)
def test_should_handle_various_playback_rates(self, qtbot: QtBot):
    """Both a slow (0.5) and a fast (2.0) rate round-trip through the player."""
    player = AudioPlayer(test_audio_path)
    qtbot.add_widget(player)

    # Speed controls moved to the transcription viewer; this checks that
    # the media player itself accepts each rate in turn.
    for rate in (0.5, 2.0):
        player.media_player.setPlaybackRate(rate)
        assert_approximately_equal(player.media_player.playbackRate(), rate)
def test_should_use_single_row_layout(self, qtbot: QtBot):
    """The player lays out exactly three items in one horizontal row."""
    player = AudioPlayer(test_audio_path)
    qtbot.add_widget(player)

    row = player.layout()
    assert isinstance(row, QHBoxLayout)

    # Speed controls moved to the transcription viewer, leaving only
    # play_button, scrubber and time_label.
    expected_item_count = 3
    assert row.count() == expected_item_count
def test_should_persist_playback_rate_setting(self, qtbot: QtBot):
    """The player starts at the persisted playback rate, clamped to 0.1-5.0."""
    widget = AudioPlayer(test_audio_path)
    qtbot.add_widget(widget)

    # Speed controls moved to transcription viewer - test that settings are loaded
    assert widget.settings is not None
    saved_rate = widget.settings.value(Settings.Key.AUDIO_PLAYBACK_RATE, 1.0, float)
    assert isinstance(saved_rate, float)

    # The stored value itself is not range-checked; the widget clamps it to
    # 0.1-5.0 before applying it. Assert on the rate the player actually
    # uses so the test does not depend on what this machine has persisted.
    expected_rate = max(0.1, min(5.0, saved_rate))
    assert_approximately_equal(widget.media_player.playbackRate(), expected_rate)
def test_should_handle_range_looping(self, qtbot: QtBot):
    """A loop range can be set and then cleared again."""
    player = AudioPlayer(test_audio_path)
    qtbot.add_widget(player)

    loop_range = (1000, 3000)  # 1-3 seconds
    player.set_range(loop_range)
    assert player.range_ms == loop_range

    player.clear_range()
    assert player.range_ms is None
def test_should_handle_invalid_media(self, qtbot: QtBot):
    """Flagging the media as invalid disables all three controls."""
    player = AudioPlayer(test_audio_path)
    qtbot.add_widget(player)

    player.set_invalid_media(True)

    # Speed controls moved to the transcription viewer, so only the basic
    # controls are expected to react.
    assert player.invalid_media is True
    for control in (player.play_button, player.scrubber, player.time_label):
        assert control.isEnabled() is False
def test_should_stop_playback(self, qtbot: QtBot):
    """After ``stop()`` the media player reports the stopped state."""
    player = AudioPlayer(test_audio_path)
    qtbot.add_widget(player)

    player.stop()

    state = player.media_player.playbackState()
    assert state == QMediaPlayer.PlaybackState.StoppedState
def test_should_handle_media_status_changes(self, qtbot: QtBot):
    """``invalid_media`` follows the most recent media status."""
    player = AudioPlayer(test_audio_path)
    qtbot.add_widget(player)

    # Loaded media clears the flag; invalid media then sets it.
    status = QMediaPlayer.MediaStatus
    player.on_media_status_changed(status.LoadedMedia)
    assert player.invalid_media is False

    player.on_media_status_changed(status.InvalidMedia)
    assert player.invalid_media is True

View file

@ -37,6 +37,9 @@ class TestShortcutsEditorWidget:
(_("View Transcript Text"), "Ctrl+E"),
(_("View Transcript Translation"), "Ctrl+L"),
(_("View Transcript Timestamps"), "Ctrl+T"),
(_("Search Transcript"), "Ctrl+F"),
(_("Scroll to Current Text"), "Ctrl+G"),
(_("Toggle Playback Controls"), "Ctrl+P"),
(_("Clear History"), "Ctrl+S"),
(_("Cancel Transcription"), "Ctrl+X"),
)

File diff suppressed because it is too large Load diff