Compare commits

...

40 commits

Author SHA1 Message Date
Raivis Dejus
1346c68c72
Pre release polishes (#1416) 2026-03-08 08:47:19 +00:00
Raivis Dejus
36f2d41557
Mac UI adjustments (#1415) 2026-03-07 22:27:52 +00:00
Raivis Dejus
14cacf6acf
Recording transcriber improvements (#1414)
Adding option to hide unconfirmed and variable transcriptions in append and replace mode
2026-03-07 19:26:29 +00:00
Raivis Dejus
c9db73722e
Live recording improvements (#1413) 2026-03-07 14:48:41 +00:00
Raivis Dejus
04c07c6cae
Adding VAD to whisper.cpp to reduce hallucinations on audio w silences (#1412) 2026-03-07 05:58:04 +00:00
Raivis Dejus
981dd3a758
Pre release polishes (#1411) 2026-03-06 19:26:19 +02:00
Raivis Dejus
7f2bf348b6
Adding flatpak release notes (#1407) 2026-03-01 10:16:25 +00:00
Raivis Dejus
a881a70a6f
Recording transcriber improvements (#1405) 2026-02-28 17:32:10 +00:00
Raivis Dejus
187d15b8e8
Add auto update check for Windows and Mac (#1404) 2026-02-28 14:39:04 +00:00
Raivis Dejus
3869ac08db
1329 improve folder watch (#1402) 2026-02-27 17:49:38 +00:00
Raivis Dejus
f545a84ba6
Add csv export (#1400) 2026-02-27 14:14:18 +00:00
Raivis Dejus
ff1f521a6a
1389 add folder import (#1398) 2026-02-27 09:11:51 +00:00
Raivis Dejus
b2f98f139e
Youtube download update (#1396) 2026-02-26 20:24:35 +00:00
Raivis Dejus
0f77deb17b
Additional tests (#1393) 2026-02-22 18:00:23 +00:00
Raivis Dejus
4c9b249c50
Recording transcriber improvements (#1392) 2026-02-22 17:46:20 +02:00
Raivis Dejus
bb546acbf9
Fix for windows crashes (#1387) 2026-02-20 15:47:13 +02:00
Raivis Dejus
ca8b7876fd
Adding translations (#1382) 2026-02-08 16:26:55 +00:00
Raivis Dejus
795da67f20
1026 translation improvements (#1380) 2026-02-08 15:13:21 +02:00
Raivis Dejus
749d9e6e4d
UI glitch fixes for recording transcriber (#1379) 2026-02-07 10:40:40 +00:00
Raivis Dejus
125e924613
Fix recording transcriber (#1377) 2026-02-06 20:50:19 +00:00
Anantharaman R
156ec35246
Added copy-to-clipboard button in recording transcribe widget (#1370) 2026-02-06 20:29:58 +02:00
Raivis Dejus
c4d7971e04
Fix for speech separation error (#1371) 2026-02-06 14:38:28 +02:00
Raivis Dejus
37f5628c49
Speaker identification improvements (#1372) 2026-02-06 10:42:08 +02:00
albanobattistella
7f14fbe576
Update Italian translations in buzz.po (#1365) 2026-01-26 08:13:14 +00:00
Raivis Dejus
a94d8fbd0d
Will validate audio before transcribing (#1364) 2026-01-25 18:44:49 +00:00
Raivis Dejus
0d446a9964
Will increase build workflow timeout (#1363) 2026-01-25 11:37:52 +00:00
Raivis Dejus
6f6bc53c54
Fix for whisper.cpp on older cpus (#1362) 2026-01-25 09:42:09 +00:00
Raivis Dejus
7594763154
Fix for gpt-4o models (#1361) 2026-01-24 18:30:15 +00:00
Raivis Dejus
b14cf0e386
Fix for HF hub SSL certificate validation on Windows 10 (#1356) 2026-01-17 05:59:27 +00:00
Raivis Dejus
97b1619902
Fix Chinese word level timestamps (#1355) 2026-01-16 12:31:48 +00:00
Raivis Dejus
92fc405c4a
1347 add ending extender (#1354) 2026-01-16 10:23:48 +00:00
Raivis Dejus
08ae8ba43f
Fix for HF hub download certificates (#1353) 2026-01-16 09:18:27 +00:00
Ikko Eltociear Ashimine
e9502881fc
docs: add Japanese README (#1352) 2026-01-16 08:22:08 +00:00
Rob Siera
dc27281e34
Fix missing spaces after punctuation in speaker identification (#1344)
Co-authored-by: Robrecht Siera <rob.developer.securemail@holoncom.eu>
2026-01-10 16:58:27 +00:00
Rob Siera
f1bc725e2b
Fix speaker identification chunk size error for long transcriptions (#1342)
Co-authored-by: Robrecht Siera <rob.developer.securemail@holoncom.eu>
2026-01-10 09:38:55 +00:00
Raivis Dejus
43214f5c3d
Update documentation (#1337) 2026-01-05 06:37:30 +00:00
Raivis Dejus
85d70c1e64
Fix wheels (#1336) 2026-01-03 22:16:34 +02:00
Raivis Dejus
b0a53b4c2f
1329 fix folder watch (#1335) 2026-01-03 11:53:33 +00:00
albanobattistella
6f075da3d3
Update buzz.po (#1334) 2026-01-03 11:01:47 +00:00
Raivis Dejus
7099dcd9f1
1329 fix folder watch (#1333) 2026-01-03 08:05:43 +00:00
109 changed files with 18102 additions and 8108 deletions

View file

@ -8,5 +8,12 @@ omit =
deepmultilingualpunctuation/*
ctc_forced_aligner/*
[report]
exclude_also =
if sys.platform == "win32":
if platform.system\(\) == "Windows":
if platform.system\(\) == "Linux":
if platform.system\(\) == "Darwin":
[html]
directory = coverage/html

View file

@ -81,7 +81,7 @@ jobs:
# Add ubuntu-toolchain-r PPA for newer libstdc++6 with GLIBCXX_3.4.32
sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y
sudo apt-get update
sudo apt-get install -y gcc-13 g++-13 libstdc++-13-dev
sudo apt-get install -y libstdc++6
fi
sudo apt-get install libyaml-dev libxkbcommon-x11-0 libxcb-icccm4 libxcb-image0 libxcb-keysyms1 libxcb-randr0 libxcb-render-util0 libxcb-xinerama0 libxcb-shape0 libxcb-cursor0 libportaudio2 gettext libpulse0 libgl1-mesa-dev libvulkan-dev ccache
@ -94,6 +94,8 @@ jobs:
run: |
uv run make test
shell: bash
env:
PYTHONFAULTHANDLER: "1"
- name: Upload coverage reports to Codecov with GitHub Action
uses: codecov/codecov-action@v4
@ -105,7 +107,7 @@ jobs:
build:
runs-on: ${{ matrix.os }}
timeout-minutes: 60
timeout-minutes: 90
env:
BUZZ_DISABLE_TELEMETRY: true
strategy:
@ -165,7 +167,7 @@ jobs:
# Add ubuntu-toolchain-r PPA for newer libstdc++6 with GLIBCXX_3.4.32
sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y
sudo apt-get update
sudo apt-get install -y gcc-13 g++-13 libstdc++-13-dev
sudo apt-get install -y libstdc++6
fi
sudo apt-get install libyaml-dev libxkbcommon-x11-0 libxcb-icccm4 libxcb-image0 libxcb-keysyms1 libxcb-randr0 libxcb-render-util0 libxcb-xinerama0 libxcb-shape0 libxcb-cursor0 libportaudio2 gettext libpulse0 libgl1-mesa-dev libvulkan-dev ccache
@ -174,15 +176,17 @@ jobs:
- name: Install dependencies
run: uv sync
- uses: AnimMouse/setup-ffmpeg@v1.2.1
- uses: AnimMouse/setup-ffmpeg@v1
id: setup-ffmpeg
with:
version: ${{ matrix.os == 'macos-15-intel' && '7.1.1' || matrix.os == 'macos-latest' && '71' || '7.1' }}
version: ${{ matrix.os == 'macos-15-intel' && '7.1.1' || matrix.os == 'macos-latest' && '80' || '8.0' }}
- name: Install MSVC for Windows
run: |
if [ "$RUNNER_OS" == "Windows" ]; then
uv add msvc-runtime
uv pip install -U torch==2.8.0+cu129 torchaudio==2.8.0+cu129 --index-url https://download.pytorch.org/whl/cu129
uv pip install nvidia-cublas-cu12==12.9.1.4 nvidia-cuda-cupti-cu12==12.9.79 nvidia-cuda-runtime-cu12==12.9.79 --extra-index-url https://pypi.ngc.nvidia.com
uv cache clean
uv run pip cache purge

View file

@ -14,7 +14,7 @@ concurrency:
jobs:
build:
runs-on: ubuntu-latest
runs-on: ubuntu-24.04
timeout-minutes: 90
env:
BUZZ_DISABLE_TELEMETRY: true
@ -24,31 +24,48 @@ jobs:
# Ideas from https://github.com/orgs/community/discussions/25678
- name: Remove unused build tools
run: |
sudo apt-get remove -y '^llvm-.*'
sudo apt-get remove -y 'php.*'
sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel || true
sudo apt-get autoremove -y
sudo apt-get clean
python -m pip cache purge
rm -rf /opt/hostedtoolcache || true
- name: Maximize build space
uses: easimon/maximize-build-space@master
with:
root-reserve-mb: 26000
swap-size-mb: 1024
remove-dotnet: 'true'
remove-android: 'true'
remove-haskell: 'true'
remove-codeql: 'true'
remove-docker-images: 'true'
- name: Check available disk space
run: |
echo "=== Disk space ==="
df -h
echo "=== Memory ==="
free -h
- uses: actions/checkout@v4
with:
submodules: recursive
- uses: snapcore/action-build@v1.3.0
- name: Install Snapcraft and dependencies
run: |
set -x
# Ensure snapd is ready
sudo systemctl start snapd.socket
sudo snap wait system seed.loaded
echo "=== Installing snapcraft ==="
sudo snap install --classic snapcraft
echo "=== Installing gnome extension dependencies ==="
sudo snap install gnome-46-2404 || { echo "Failed to install gnome-46-2404"; sudo journalctl -u snapd --no-pager -n 50; exit 1; }
sudo snap install gnome-46-2404-sdk || { echo "Failed to install gnome-46-2404-sdk"; sudo journalctl -u snapd --no-pager -n 50; exit 1; }
echo "=== Installing build-snaps ==="
sudo snap install --classic astral-uv || { echo "Failed to install astral-uv"; sudo journalctl -u snapd --no-pager -n 50; exit 1; }
echo "=== Installed snaps ==="
snap list
- name: Check disk space before build
run: df -h
- name: Build snap
id: snapcraft
- run: |
sudo apt-get update
sudo apt-get install libportaudio2 libtbb-dev
env:
SNAPCRAFT_BUILD_ENVIRONMENT: host
run: |
sudo -E snapcraft pack --verbose --destructive-mode
echo "snap=$(ls *.snap)" >> $GITHUB_OUTPUT
- run: sudo snap install --devmode *.snap
- run: |
cd $HOME

View file

@ -46,7 +46,7 @@ datas += collect_data_files("whisper")
datas += collect_data_files("demucs", include_py_files=True)
datas += collect_data_files("whisper_diarization", include_py_files=True)
datas += collect_data_files("deepmultilingualpunctuation", include_py_files=True)
datas += collect_data_files("ctc_forced_aligner", include_py_files=True)
datas += collect_data_files("ctc_forced_aligner", include_py_files=True, excludes=["build"])
datas += collect_data_files("nemo", include_py_files=True)
datas += collect_data_files("lightning_fabric", include_py_files=True)
datas += collect_data_files("pytorch_lightning", include_py_files=True)

1
CLAUDE.md Normal file
View file

@ -0,0 +1 @@
- Use uv to run tests and any scripts

View file

@ -52,6 +52,7 @@ Linux versions get also pushed to the snap. To install latest development versio
sudo apt-get install --no-install-recommends libyaml-dev libtbb-dev libxkbcommon-x11-0 libxcb-icccm4 libxcb-image0 libxcb-keysyms1 libxcb-randr0 libxcb-render-util0 libxcb-xinerama0 libxcb-shape0 libxcb-cursor0 libportaudio2 gettext libpulse0 ffmpeg
```
On versions prior to Ubuntu 24.04 install `sudo apt-get install --no-install-recommends libegl1-mesa`
5. Install the dependencies `uv sync`
6. Run Buzz `uv run buzz`

View file

@ -1,5 +1,5 @@
# Change also in pyproject.toml and buzz/__version__.py
version := 1.4.0
version := 1.4.4
mac_app_path := ./dist/Buzz.app
mac_zip_path := ./dist/Buzz-${version}-mac.zip
@ -35,6 +35,11 @@ endif
COVERAGE_THRESHOLD := 70
test: buzz/whisper_cpp
# A check to get updates of yt-dlp. Should run only on local as part of regular development operations
# Sort of a local "update checker"
ifndef CI
uv lock --upgrade-package yt-dlp
endif
pytest -s -vv --cov=buzz --cov-report=xml --cov-report=html --benchmark-skip --cov-fail-under=${COVERAGE_THRESHOLD} --cov-config=.coveragerc
benchmarks: buzz/whisper_cpp
@ -52,30 +57,33 @@ ifeq ($(OS), Windows_NT)
# The _DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR is needed to prevent mutex lock issues on Windows
# https://github.com/actions/runner-images/issues/10004#issuecomment-2156109231
# -DCMAKE_[C|CXX]_COMPILER_WORKS=TRUE is used to prevent issue in building test program that fails on CI
cmake -S whisper.cpp -B whisper.cpp/build/ -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_RPATH='$$ORIGIN' -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -DCMAKE_C_FLAGS="-D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR" -DCMAKE_CXX_FLAGS="-D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR" -DCMAKE_C_COMPILER_WORKS=TRUE -DCMAKE_CXX_COMPILER_WORKS=TRUE -DGGML_VULKAN=1
# GGML_NATIVE=OFF ensures we don't use -march=native (which would target the build machine's CPU)
cmake -S whisper.cpp -B whisper.cpp/build/ -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_RPATH='$$ORIGIN' -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -DCMAKE_C_FLAGS="-D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR" -DCMAKE_CXX_FLAGS="-D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR" -DCMAKE_C_COMPILER_WORKS=TRUE -DCMAKE_CXX_COMPILER_WORKS=TRUE -DGGML_VULKAN=1 -DGGML_NATIVE=OFF
cmake --build whisper.cpp/build -j --config Release --verbose
-mkdir buzz/whisper_cpp
cp whisper.cpp/build/bin/Release/whisper-cli.exe buzz/whisper_cpp/
cp whisper.cpp/build/bin/Release/whisper-server.exe buzz/whisper_cpp/
cp dll_backup/SDL2.dll buzz/whisper_cpp
PowerShell -NoProfile -ExecutionPolicy Bypass -Command "if (-not (Test-Path 'buzz\whisper_cpp\ggml-silero-v6.2.0.bin')) { Start-BitsTransfer -Source https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v6.2.0.bin -Destination 'buzz\whisper_cpp\ggml-silero-v6.2.0.bin' }"
endif
ifeq ($(shell uname -s), Linux)
# Build Whisper with Vulkan support
# GGML_NATIVE=OFF ensures we don't use -march=native (which would target the build machine's CPU)
# This enables portable SSE4.2/AVX/AVX2 optimizations that work on most x86_64 CPUs
rm -rf whisper.cpp/build || true
-mkdir -p buzz/whisper_cpp
cmake -S whisper.cpp -B whisper.cpp/build/ -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_RPATH='$$ORIGIN' -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -DGGML_VULKAN=1
cmake -S whisper.cpp -B whisper.cpp/build/ -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_RPATH='$$ORIGIN' -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -DGGML_VULKAN=1 -DGGML_NATIVE=OFF
cmake --build whisper.cpp/build -j --config Release --verbose
cp whisper.cpp/build/bin/whisper-cli buzz/whisper_cpp/ || true
cp whisper.cpp/build/bin/whisper-server buzz/whisper_cpp/ || true
cp whisper.cpp/build/src/libwhisper.so buzz/whisper_cpp/ || true
cp whisper.cpp/build/src/libwhisper.so.1 buzz/whisper_cpp/ || true
cp whisper.cpp/build/src/libwhisper.so.1.8.2 buzz/whisper_cpp/ || true
cp whisper.cpp/build/ggml/src/libggml.so buzz/whisper_cpp/ || true
cp whisper.cpp/build/ggml/src/libggml-base.so buzz/whisper_cpp/ || true
cp whisper.cpp/build/ggml/src/libggml-cpu.so buzz/whisper_cpp/ || true
cp whisper.cpp/build/ggml/src/ggml-vulkan/libggml-vulkan.so buzz/whisper_cpp/ || true
cp -P whisper.cpp/build/src/libwhisper.so* buzz/whisper_cpp/ || true
cp -P whisper.cpp/build/ggml/src/libggml.so* buzz/whisper_cpp/ || true
cp -P whisper.cpp/build/ggml/src/libggml-base.so* buzz/whisper_cpp/ || true
cp -P whisper.cpp/build/ggml/src/libggml-cpu.so* buzz/whisper_cpp/ || true
cp -P whisper.cpp/build/ggml/src/ggml-vulkan/libggml-vulkan.so* buzz/whisper_cpp/ || true
test -f buzz/whisper_cpp/ggml-silero-v6.2.0.bin || curl -L -o buzz/whisper_cpp/ggml-silero-v6.2.0.bin https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v6.2.0.bin
endif
# Build on Macs
@ -95,6 +103,7 @@ endif
cp whisper.cpp/build/bin/whisper-server buzz/whisper_cpp/ || true
cp whisper.cpp/build/src/libwhisper.dylib buzz/whisper_cpp/ || true
cp whisper.cpp/build/ggml/src/libggml* buzz/whisper_cpp/ || true
test -f buzz/whisper_cpp/ggml-silero-v6.2.0.bin || curl -L -o buzz/whisper_cpp/ggml-silero-v6.2.0.bin https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v6.2.0.bin
endif
# Prints all the Mac developer identities used for code signing
@ -187,26 +196,26 @@ gh_upgrade_pr:
# Internationalization
translation_po_all:
$(MAKE) translation_po locale=en_US
$(MAKE) translation_po locale=ca_ES
$(MAKE) translation_po locale=es_ES
$(MAKE) translation_po locale=pl_PL
$(MAKE) translation_po locale=zh_CN
$(MAKE) translation_po locale=zh_TW
$(MAKE) translation_po locale=it_IT
$(MAKE) translation_po locale=lv_LV
$(MAKE) translation_po locale=uk_UA
$(MAKE) translation_po locale=ja_JP
$(MAKE) translation_po locale=da_DK
$(MAKE) translation_po locale=de_DE
$(MAKE) translation_po locale=en_US
$(MAKE) translation_po locale=es_ES
$(MAKE) translation_po locale=it_IT
$(MAKE) translation_po locale=ja_JP
$(MAKE) translation_po locale=lv_LV
$(MAKE) translation_po locale=nl
$(MAKE) translation_po locale=pl_PL
$(MAKE) translation_po locale=pt_BR
$(MAKE) translation_po locale=uk_UA
$(MAKE) translation_po locale=zh_CN
$(MAKE) translation_po locale=zh_TW
TMP_POT_FILE_PATH := $(shell mktemp)
PO_FILE_PATH := buzz/locale/${locale}/LC_MESSAGES/buzz.po
translation_po:
mkdir -p buzz/locale/${locale}/LC_MESSAGES
xgettext --from-code=UTF-8 -o "${TMP_POT_FILE_PATH}" -l python $(shell find buzz -name '*.py')
xgettext --from-code=UTF-8 --add-location=file -o "${TMP_POT_FILE_PATH}" -l python $(shell find buzz -name '*.py')
sed -i.bak 's/CHARSET/UTF-8/' ${TMP_POT_FILE_PATH}
if [ ! -f ${PO_FILE_PATH} ]; then \
msginit --no-translator --input=${TMP_POT_FILE_PATH} --output-file=${PO_FILE_PATH}; \

98
README.ja_JP.md Normal file
View file

@ -0,0 +1,98 @@
# Buzz
[ドキュメント](https://chidiwilliams.github.io/buzz/)
パソコン上でオフラインで音声の文字起こしと翻訳を行います。OpenAIの[Whisper](https://github.com/openai/whisper)を使用しています。
![MIT License](https://img.shields.io/badge/license-MIT-green)
[![CI](https://github.com/chidiwilliams/buzz/actions/workflows/ci.yml/badge.svg)](https://github.com/chidiwilliams/buzz/actions/workflows/ci.yml)
[![codecov](https://codecov.io/github/chidiwilliams/buzz/branch/main/graph/badge.svg?token=YJSB8S2VEP)](https://codecov.io/github/chidiwilliams/buzz)
![GitHub release (latest by date)](https://img.shields.io/github/v/release/chidiwilliams/buzz)
[![Github all releases](https://img.shields.io/github/downloads/chidiwilliams/buzz/total.svg)](https://GitHub.com/chidiwilliams/buzz/releases/)
![Buzz](./buzz/assets/buzz-banner.jpg)
## 機能
- 音声・動画ファイルまたはYouTubeリンクの文字起こし
- マイクからのリアルタイム音声文字起こし
- イベントやプレゼンテーション中に便利なプレゼンテーションウィンドウ
- ノイズの多い音声でより高い精度を得るための、文字起こし前の話者分離
- 文字起こしメディアでの話者識別
- 複数のWhisperバックエンドをサポート
- Nvidia GPU向けCUDAアクセラレーション対応
- Mac向けApple Silicon対応
- Whisper.cppでのVulkanアクセラレーション対応統合GPUを含むほとんどのGPUで利用可能
- TXT、SRT、VTT形式での文字起こしエクスポート
- 検索、再生コントロール、速度調整機能を備えた高度な文字起こしビューア
- 効率的なナビゲーションのためのキーボードショートカット
- 新しいファイルの自動文字起こしのための監視フォルダ
- スクリプトや自動化のためのコマンドラインインターフェース
## インストール
### macOS
[SourceForge](https://sourceforge.net/projects/buzz-captions/files/)から`.dmg`ファイルをダウンロードしてください。
### Windows
[SourceForge](https://sourceforge.net/projects/buzz-captions/files/)からインストールファイルを入手してください。
アプリは署名されていないため、インストール時に警告が表示されます。`詳細情報` -> `実行`を選択してください。
### Linux
Buzzは[Flatpak](https://flathub.org/apps/io.github.chidiwilliams.Buzz)または[Snap](https://snapcraft.io/buzz)として利用可能です。
Flatpakをインストールするには、以下を実行してください
```shell
flatpak install flathub io.github.chidiwilliams.Buzz
```
[![Download on Flathub](https://flathub.org/api/badge?svg&locale=en)](https://flathub.org/en/apps/io.github.chidiwilliams.Buzz)
Snapをインストールするには、以下を実行してください
```shell
sudo apt-get install libportaudio2 libcanberra-gtk-module libcanberra-gtk3-module
sudo snap install buzz
```
[![Get it from the Snap Store](https://snapcraft.io/static/images/badges/en/snap-store-black.svg)](https://snapcraft.io/buzz)
### PyPI
[ffmpeg](https://www.ffmpeg.org/download.html)をインストールしてください。
Python 3.12環境を使用していることを確認してください。
Buzzをインストール
```shell
pip install buzz-captions
python -m buzz
```
**PyPIでのGPUサポート**
PyPIでインストールしたバージョンでWindows上のNvidia GPUのGPUサポートを有効にするには、[torch](https://pytorch.org/get-started/locally/)のCUDAサポートを確認してください。
```
pip3 install -U torch==2.8.0+cu129 torchaudio==2.8.0+cu129 --index-url https://download.pytorch.org/whl/cu129
pip3 install nvidia-cublas-cu12==12.9.1.4 nvidia-cuda-cupti-cu12==12.9.79 nvidia-cuda-runtime-cu12==12.9.79 --extra-index-url https://pypi.ngc.nvidia.com
```
### 最新開発版
最新の機能やバグ修正を含む最新開発版の入手方法については、[FAQ](https://chidiwilliams.github.io/buzz/docs/faq#9-where-can-i-get-latest-development-version)をご覧ください。
### スクリーンショット
<div style="display: flex; flex-wrap: wrap;">
<img alt="ファイルインポート" src="share/screenshots/buzz-1-import.png" style="max-width: 18%; margin-right: 1%;" />
<img alt="メイン画面" src="share/screenshots/buzz-2-main_screen.png" style="max-width: 18%; margin-right: 1%; height:auto;" />
<img alt="設定" src="share/screenshots/buzz-3-preferences.png" style="max-width: 18%; margin-right: 1%; height:auto;" />
<img alt="モデル設定" src="share/screenshots/buzz-3.2-model-preferences.png" style="max-width: 18%; margin-right: 1%; height:auto;" />
<img alt="文字起こし" src="share/screenshots/buzz-4-transcript.png" style="max-width: 18%; margin-right: 1%; height:auto;" />
<img alt="ライブ録音" src="share/screenshots/buzz-5-live_recording.png" style="max-width: 18%; margin-right: 1%; height:auto;" />
<img alt="リサイズ" src="share/screenshots/buzz-6-resize.png" style="max-width: 18%;" />
</div>

View file

@ -2,7 +2,7 @@
# Buzz
[Documentation](https://chidiwilliams.github.io/buzz/) | [Buzz Captions on the App Store](https://apps.apple.com/us/app/buzz-captions/id6446018936?mt=12&itsct=apps_box_badge&itscg=30200)
[Documentation](https://chidiwilliams.github.io/buzz/)
Transcribe and translate audio offline on your personal computer. Powered by
OpenAI's [Whisper](https://github.com/openai/whisper).
@ -13,12 +13,23 @@ OpenAI's [Whisper](https://github.com/openai/whisper).
![GitHub release (latest by date)](https://img.shields.io/github/v/release/chidiwilliams/buzz)
[![Github all releases](https://img.shields.io/github/downloads/chidiwilliams/buzz/total.svg)](https://GitHub.com/chidiwilliams/buzz/releases/)
<blockquote>
<p>An older version of Buzz available on the App Store. Get a Mac-native version of Buzz with a cleaner look, audio playback, drag-and-drop import, transcript editing, search, and much more.</p>
<a href="https://apps.apple.com/us/app/buzz-captions/id6446018936?mt=12&amp;itsct=apps_box_badge&amp;itscg=30200"><img src="https://toolbox.marketingtools.apple.com/api/badges/download-on-the-mac-app-store/black/en-us?size=250x83&amp;releaseDate=1679529600" alt="Download on the Mac App Store" /></a>
</blockquote>
![Buzz](https://raw.githubusercontent.com/chidiwilliams/buzz/refs/heads/main/buzz/assets/buzz-banner.jpg)
![Buzz](./buzz/assets/buzz-banner.jpg)
## Features
- Transcribe audio and video files or Youtube links
- Live realtime audio transcription from microphone
- Presentation window for easy accessibility during events and presentations
- Speech separation before transcription for better accuracy on noisy audio
- Speaker identification in transcribed media
- Multiple whisper backend support
- CUDA acceleration support for Nvidia GPUs
- Apple Silicon support for Macs
- Vulkan acceleration support for Whisper.cpp on most GPUs, including integrated GPUs
- Export transcripts to TXT, SRT, and VTT
- Advanced Transcription Viewer with search, playback controls, and speed adjustment
- Keyboard shortcuts for efficient navigation
- Watch folder for automatic transcription of new files
- Command-Line Interface for scripting and automation
## Installation
@ -32,12 +43,6 @@ Get the installation files from the [SourceForge](https://sourceforge.net/projec
App is not signed, you will get a warning when you install it. Select `More info` -> `Run anyway`.
**Alternatively, install with [winget](https://learn.microsoft.com/en-us/windows/package-manager/winget/)**
```shell
winget install ChidiWilliams.Buzz
```
### Linux
Buzz is available as a [Flatpak](https://flathub.org/apps/io.github.chidiwilliams.Buzz) or a [Snap](https://snapcraft.io/buzz).
@ -47,17 +52,22 @@ To install flatpak, run:
flatpak install flathub io.github.chidiwilliams.Buzz
```
[![Download on Flathub](https://flathub.org/api/badge?svg&locale=en)](https://flathub.org/en/apps/io.github.chidiwilliams.Buzz)
To install snap, run:
```shell
sudo apt-get install libportaudio2 libcanberra-gtk-module libcanberra-gtk3-module
sudo snap install buzz
sudo snap connect buzz:password-manager-service
```
[![Get it from the Snap Store](https://snapcraft.io/static/images/badges/en/snap-store-black.svg)](https://snapcraft.io/buzz)
### PyPI
Install [ffmpeg](https://www.ffmpeg.org/download.html)
Ensure you use Python 3.12 environment.
Install Buzz
```shell
@ -70,23 +80,27 @@ python -m buzz
To have GPU support for Nvidia GPUS on Windows, for PyPI installed version ensure, CUDA support for [torch](https://pytorch.org/get-started/locally/)
```
pip3 install -U torch==2.7.1+cu128 torchaudio==2.7.1+cu128 --index-url https://download.pytorch.org/whl/cu128
pip3 install nvidia-cublas-cu12==12.8.3.14 nvidia-cuda-cupti-cu12==12.8.57 nvidia-cuda-nvrtc-cu12==12.8.61 nvidia-cuda-runtime-cu12==12.8.57 nvidia-cudnn-cu12==9.7.1.26 nvidia-cufft-cu12==11.3.3.41 nvidia-curand-cu12==10.3.9.55 nvidia-cusolver-cu12==11.7.2.55 nvidia-cusparse-cu12==12.5.4.2 nvidia-cusparselt-cu12==0.6.3 nvidia-nvjitlink-cu12==12.8.61 nvidia-nvtx-cu12==12.8.55 --extra-index-url https://pypi.ngc.nvidia.com
pip3 install -U torch==2.8.0+cu129 torchaudio==2.8.0+cu129 --index-url https://download.pytorch.org/whl/cu129
pip3 install nvidia-cublas-cu12==12.9.1.4 nvidia-cuda-cupti-cu12==12.9.79 nvidia-cuda-runtime-cu12==12.9.79 --extra-index-url https://pypi.ngc.nvidia.com
```
### Latest development version
For info on how to get latest development version with latest features and bug fixes see [FAQ](https://chidiwilliams.github.io/buzz/docs/faq#9-where-can-i-get-latest-development-version).
### Support Buzz
You can help the Buzz by starring 🌟 the repo and sharing it with your friends.
### Screenshots
<div style="display: flex; flex-wrap: wrap;">
<img alt="File import" src="share/screenshots/buzz-1-import.png" style="max-width: 18%; margin-right: 1%;" />
<img alt="Main screen" src="share/screenshots/buzz-2-main_screen.png" style="max-width: 18%; margin-right: 1%; height:auto;" />
<img alt="Preferences" src="share/screenshots/buzz-3-preferences.png" style="max-width: 18%; margin-right: 1%; height:auto;" />
<img alt="Model preferences" src="share/screenshots/buzz-3.2-model-preferences.png" style="max-width: 18%; margin-right: 1%; height:auto;" />
<img alt="Transcript" src="share/screenshots/buzz-4-transcript.png" style="max-width: 18%; margin-right: 1%; height:auto;" />
<img alt="Live recording" src="share/screenshots/buzz-5-live_recording.png" style="max-width: 18%; margin-right: 1%; height:auto;" />
<img alt="Resize" src="share/screenshots/buzz-6-resize.png" style="max-width: 18%;" />
<img alt="File import" src="https://github.com/chidiwilliams/buzz/raw/main/share/screenshots/buzz-1-import.png" style="max-width: 18%; margin-right: 1%;" />
<img alt="Main screen" src="https://github.com/chidiwilliams/buzz/raw/main/share/screenshots/buzz-2-main_screen.png" style="max-width: 18%; margin-right: 1%; height:auto;" />
<img alt="Preferences" src="https://github.com/chidiwilliams/buzz/raw/main/share/screenshots/buzz-3-preferences.png" style="max-width: 18%; margin-right: 1%; height:auto;" />
<img alt="Model preferences" src="https://github.com/chidiwilliams/buzz/raw/main/share/screenshots/buzz-3.2-model-preferences.png" style="max-width: 18%; margin-right: 1%; height:auto;" />
<img alt="Transcript" src="https://github.com/chidiwilliams/buzz/raw/main/share/screenshots/buzz-4-transcript.png" style="max-width: 18%; margin-right: 1%; height:auto;" />
<img alt="Live recording" src="https://github.com/chidiwilliams/buzz/raw/main/share/screenshots/buzz-5-live_recording.png" style="max-width: 18%; margin-right: 1%; height:auto;" />
<img alt="Resize" src="https://github.com/chidiwilliams/buzz/raw/main/share/screenshots/buzz-6-resize.png" style="max-width: 18%;" />
</div>

View file

@ -1 +1 @@
VERSION = "1.4.0"
VERSION = "1.4.4"

View file

@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" height="48" viewBox="0 -960 960 960" width="48"><path d="M160-200v-60h640v60H160Zm320-136L280-536l42-42 128 128v-310h60v310l128-128 42 42-200 200Z" transform="rotate(180 480 -480)"/></svg>

After

Width:  |  Height:  |  Size: 229 B

View file

@ -25,8 +25,11 @@ from buzz.assets import APP_BASE_DIR
if getattr(sys, "frozen", False) is False and platform.system() != "Windows":
faulthandler.enable()
# Sets stderr to no-op TextIO when None (run as Windows GUI).
# Resolves https://github.com/chidiwilliams/buzz/issues/221
# Sets stdout/stderr to no-op TextIO when None (run as Windows GUI with --noconsole).
# stdout fix: torch.hub uses sys.stdout.write() for download progress and crashes if None.
# stderr fix: Resolves https://github.com/chidiwilliams/buzz/issues/221
if sys.stdout is None:
sys.stdout = TextIO()
if sys.stderr is None:
sys.stderr = TextIO()

View file

@ -70,9 +70,8 @@ def _setup_windows_dll_directories():
for lib_dir in lib_dirs:
try:
os.add_dll_directory(str(lib_dir))
logger.debug(f"Added DLL directory: {lib_dir}")
except (OSError, AttributeError) as e:
logger.debug(f"Could not add DLL directory {lib_dir}: {e}")
pass
def _preload_linux_libraries():
@ -101,17 +100,15 @@ def _preload_linux_libraries():
# Skip problematic libraries
if any(pattern in lib_file.name for pattern in skip_patterns):
logger.debug(f"Skipping library: {lib_file}")
continue
try:
# Use RTLD_GLOBAL so symbols are available to other libraries
ctypes.CDLL(str(lib_file), mode=ctypes.RTLD_GLOBAL)
loaded_libs.add(lib_file.name)
logger.debug(f"Preloaded library: {lib_file}")
except OSError as e:
# Some libraries may have missing dependencies, that's ok
logger.debug(f"Could not preload {lib_file}: {e}")
pass
def setup_cuda_libraries():

View file

@ -49,5 +49,4 @@ def close_app_db():
return
if db.isOpen():
logging.debug("Closing database connection: %s", db.connectionName())
db.close()

View file

@ -12,6 +12,7 @@ from uuid import UUID
# This must be done before importing demucs which uses torch.hub with urllib
try:
import certifi
os.environ.setdefault('REQUESTS_CA_BUNDLE', certifi.where())
os.environ.setdefault('SSL_CERT_FILE', certifi.where())
os.environ.setdefault('SSL_CERT_DIR', os.path.dirname(certifi.where()))
# Also update the default SSL context for urllib
@ -52,6 +53,7 @@ if sys.platform == "win32":
from demucs import api as demucsApi
from buzz.locale import _
from buzz.model_loader import ModelType
from buzz.transcriber.file_transcriber import FileTranscriber
from buzz.transcriber.openai_whisper_api_file_transcriber import (
@ -123,12 +125,22 @@ class FileTranscriberQueueWorker(QObject):
def separator_progress_callback(progress):
self.task_progress.emit(self.current_task, int(progress["segment_offset"] * 100) / int(progress["audio_length"] * 100))
separator = None
separated = None
try:
# Force CPU if specified, otherwise use CUDA if available
force_cpu = os.getenv("BUZZ_FORCE_CPU", "false").lower() == "true"
if force_cpu:
device = "cpu"
else:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
separator = demucsApi.Separator(
device=device,
progress=True,
callback=separator_progress_callback,
)
_, separated = separator.separate_audio_file(Path(self.current_task.file_path))
_origin, separated = separator.separate_audio_file(Path(self.current_task.file_path))
task_file_path = Path(self.current_task.file_path)
self.speech_path = task_file_path.with_name(f"{task_file_path.stem}_speech.mp3")
@ -137,6 +149,21 @@ class FileTranscriberQueueWorker(QObject):
self.current_task.file_path = str(self.speech_path)
except Exception as e:
logging.error(f"Error during speech extraction: {e}", exc_info=True)
self.task_error.emit(
self.current_task,
_("Speech extraction failed! Check your internet connection — a model may need to be downloaded."),
)
self.is_running = False
return
finally:
# Release memory used by speech extractor
del separator, separated
try:
import torch
if torch.cuda.is_available():
torch.cuda.empty_cache()
except Exception:
pass
logging.debug("Starting next transcription task")
self.task_progress.emit(self.current_task, 0)

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -7,8 +7,23 @@ import threading
import shutil
import subprocess
import sys
import ssl
import warnings
import platform
# Fix SSL certificate verification for bundled applications (macOS, Windows).
# This must be done before importing libraries that make HTTPS requests.
try:
import certifi
_certifi_ca_bundle = certifi.where()
os.environ.setdefault("REQUESTS_CA_BUNDLE", _certifi_ca_bundle)
os.environ.setdefault("SSL_CERT_FILE", _certifi_ca_bundle)
os.environ.setdefault("SSL_CERT_DIR", os.path.dirname(_certifi_ca_bundle))
# Also update the default SSL context for urllib
ssl._create_default_https_context = lambda: ssl.create_default_context(cafile=_certifi_ca_bundle)
except ImportError:
_certifi_ca_bundle = None
import requests
import whisper
import huggingface_hub
@ -22,6 +37,24 @@ from huggingface_hub.errors import LocalEntryNotFoundError
from buzz.locale import _
# Configure huggingface_hub to use certifi certificates directly.
# This is more reliable than environment variables for frozen apps.
if _certifi_ca_bundle is not None:
try:
from huggingface_hub import configure_http_backend
def _hf_session_factory() -> requests.Session:
session = requests.Session()
session.verify = _certifi_ca_bundle
return session
configure_http_backend(backend_factory=_hf_session_factory)
except ImportError:
# configure_http_backend not available in older huggingface_hub versions
pass
except Exception as e:
logging.debug(f"Failed to configure huggingface_hub HTTP backend: {e}")
# On Windows, creating symlinks requires special privileges (Developer Mode or
# SeCreateSymbolicLinkPrivilege). Monkey-patch huggingface_hub to use file
# copying instead of symlinks to avoid [WinError 1314] errors.

View file

@ -9,6 +9,9 @@ from PyQt6.QtCore import QObject, pyqtSignal
class RecordingAmplitudeListener(QObject):
stream: Optional[sounddevice.InputStream] = None
amplitude_changed = pyqtSignal(float)
average_amplitude_changed = pyqtSignal(float)
ACCUMULATION_SECONDS = 1
def __init__(
self,
@ -17,6 +20,9 @@ class RecordingAmplitudeListener(QObject):
):
super().__init__(parent)
self.input_device_index = input_device_index
self.buffer = np.ndarray([], dtype=np.float32)
self.accumulation_size = 0
self._active = True
def start_recording(self):
try:
@ -27,16 +33,24 @@ class RecordingAmplitudeListener(QObject):
callback=self.stream_callback,
)
self.stream.start()
except sounddevice.PortAudioError:
self.accumulation_size = int(self.stream.samplerate * self.ACCUMULATION_SECONDS)
except Exception as e:
self.stop_recording()
logging.exception("")
logging.exception("Failed to start audio stream on device %s: %s", self.input_device_index, e)
def stop_recording(self):
self._active = False
if self.stream is not None:
self.stream.stop()
self.stream.close()
def stream_callback(self, in_data: np.ndarray, frame_count, time_info, status):
if not self._active:
return
chunk = in_data.ravel()
amplitude = np.sqrt(np.mean(chunk**2)) # root-mean-square
self.amplitude_changed.emit(amplitude)
self.amplitude_changed.emit(float(np.sqrt(np.mean(chunk**2))))
self.buffer = np.append(self.buffer, chunk)
if self.buffer.size >= self.accumulation_size:
self.average_amplitude_changed.emit(float(np.sqrt(np.mean(self.buffer**2))))
self.buffer = np.ndarray([], dtype=np.float32)

View file

@ -17,7 +17,6 @@ class Settings:
RECORDING_TRANSCRIBER_TASK = "recording-transcriber/task"
RECORDING_TRANSCRIBER_MODEL = "recording-transcriber/model"
RECORDING_TRANSCRIBER_LANGUAGE = "recording-transcriber/language"
RECORDING_TRANSCRIBER_TEMPERATURE = "recording-transcriber/temperature"
RECORDING_TRANSCRIBER_INITIAL_PROMPT = "recording-transcriber/initial-prompt"
RECORDING_TRANSCRIBER_ENABLE_LLM_TRANSLATION = "recording-transcriber/enable-llm-translation"
RECORDING_TRANSCRIBER_LLM_MODEL = "recording-transcriber/llm-model"
@ -25,6 +24,13 @@ class Settings:
RECORDING_TRANSCRIBER_EXPORT_ENABLED = "recording-transcriber/export-enabled"
RECORDING_TRANSCRIBER_EXPORT_FOLDER = "recording-transcriber/export-folder"
RECORDING_TRANSCRIBER_MODE = "recording-transcriber/mode"
RECORDING_TRANSCRIBER_SILENCE_THRESHOLD = "recording-transcriber/silence-threshold"
RECORDING_TRANSCRIBER_LINE_SEPARATOR = "recording-transcriber/line-separator"
RECORDING_TRANSCRIBER_TRANSCRIPTION_STEP = "recording-transcriber/transcription-step"
RECORDING_TRANSCRIBER_EXPORT_FILE_TYPE = "recording-transcriber/export-file-type"
RECORDING_TRANSCRIBER_EXPORT_MAX_ENTRIES = "recording-transcriber/export-max-entries"
RECORDING_TRANSCRIBER_EXPORT_FILE_NAME = "recording-transcriber/export-file-name"
RECORDING_TRANSCRIBER_HIDE_UNCONFIRMED = "recording-transcriber/hide-unconfirmed"
PRESENTATION_WINDOW_TEXT_COLOR = "presentation-window/text-color"
PRESENTATION_WINDOW_BACKGROUND_COLOR = "presentation-window/background-color"
@ -34,7 +40,6 @@ class Settings:
FILE_TRANSCRIBER_TASK = "file-transcriber/task"
FILE_TRANSCRIBER_MODEL = "file-transcriber/model"
FILE_TRANSCRIBER_LANGUAGE = "file-transcriber/language"
FILE_TRANSCRIBER_TEMPERATURE = "file-transcriber/temperature"
FILE_TRANSCRIBER_INITIAL_PROMPT = "file-transcriber/initial-prompt"
FILE_TRANSCRIBER_ENABLE_LLM_TRANSLATION = "file-transcriber/enable-llm-translation"
FILE_TRANSCRIBER_LLM_MODEL = "file-transcriber/llm-model"
@ -77,6 +82,9 @@ class Settings:
FORCE_CPU = "force-cpu"
REDUCE_GPU_MEMORY = "reduce-gpu-memory"
LAST_UPDATE_CHECK = "update/last-check"
UPDATE_AVAILABLE_VERSION = "update/available-version"
def get_user_identifier(self) -> str:
user_id = self.value(self.Key.USER_IDENTIFIER, "")
if not user_id:

View file

@ -149,13 +149,22 @@ class FileTranscriber(QObject):
)
if self.transcription_task.source == FileTranscriptionTask.Source.FOLDER_WATCH:
shutil.move(
self.transcription_task.file_path,
os.path.join(
self.transcription_task.output_directory,
os.path.basename(self.transcription_task.file_path),
),
# Use original_file_path if available (before speech extraction changed file_path)
source_path = (
self.transcription_task.original_file_path
or self.transcription_task.file_path
)
if source_path and os.path.exists(source_path):
if self.transcription_task.delete_source_file:
os.remove(source_path)
else:
shutil.move(
source_path,
os.path.join(
self.transcription_task.output_directory,
os.path.basename(source_path),
),
)
def on_download_progress(self, data: dict):
if data["status"] == "downloading":
@ -170,7 +179,6 @@ class FileTranscriber(QObject):
...
# TODO: Move to transcription service
def write_output(
path: str,
segments: List[Segment],

View file

@ -183,17 +183,22 @@ class OpenAIWhisperAPIFileTranscriber(FileTranscriber):
return segments
@staticmethod
def get_value(segment, key):
def get_value(segment, key, default=None):
if hasattr(segment, key):
return getattr(segment, key)
return segment[key]
if isinstance(segment, dict):
return segment.get(key, default)
return default
def get_segments_for_file(self, file: str, offset_ms: int = 0):
with open(file, "rb") as file:
# gpt-4o models don't support verbose_json format
response_format = "json" if self.whisper_api_model.startswith("gpt-4o") else "verbose_json"
options = {
"model": self.whisper_api_model,
"file": file,
"response_format": "verbose_json",
"response_format": response_format,
"prompt": self.transcription_task.transcription_options.initial_prompt,
}
@ -219,7 +224,8 @@ class OpenAIWhisperAPIFileTranscriber(FileTranscriber):
if "segments" in transcript.model_extra:
segments = transcript.model_extra["segments"]
else:
segments = [{"words": words}]
# gpt-4o models return only text without segments/timestamps
segments = [{"text": transcript.text, "start": 0, "end": 0, "words": words}]
result_segments = []
if self.word_level_timings:
@ -274,9 +280,9 @@ class OpenAIWhisperAPIFileTranscriber(FileTranscriber):
else:
result_segments = [
Segment(
int(self.get_value(segment, "start") * 1000 + offset_ms),
int(self.get_value(segment,"end") * 1000 + offset_ms),
self.get_value(segment,"text"),
int(self.get_value(segment, "start", 0) * 1000 + offset_ms),
int(self.get_value(segment, "end", 0) * 1000 + offset_ms),
self.get_value(segment, "text", ""),
)
for segment in segments
]

View file

@ -11,6 +11,9 @@ import subprocess
from typing import Optional
from platformdirs import user_cache_dir
# Preload CUDA libraries before importing torch
from buzz import cuda_setup # noqa: F401
import torch
import numpy as np
import sounddevice
@ -23,7 +26,7 @@ from buzz.locale import _
from buzz.assets import APP_BASE_DIR
from buzz.model_loader import ModelType, map_language_to_mms
from buzz.settings.settings import Settings
from buzz.transcriber.transcriber import TranscriptionOptions, Task
from buzz.transcriber.transcriber import TranscriptionOptions, Task, DEFAULT_WHISPER_TEMPERATURE
from buzz.transformers_whisper import TransformersTranscriber
from buzz.settings.recording_transcriber_mode import RecordingTranscriberMode
@ -35,6 +38,9 @@ class RecordingTranscriber(QObject):
transcription = pyqtSignal(str)
finished = pyqtSignal()
error = pyqtSignal(str)
amplitude_changed = pyqtSignal(float)
average_amplitude_changed = pyqtSignal(float)
queue_size_changed = pyqtSignal(int)
is_running = False
SAMPLE_RATE = whisper_audio.SAMPLE_RATE
@ -56,10 +62,10 @@ class RecordingTranscriber(QObject):
self.input_device_index = input_device_index
self.sample_rate = sample_rate if sample_rate is not None else whisper_audio.SAMPLE_RATE
self.model_path = model_path
self.n_batch_samples = 5 * self.sample_rate # 5 seconds
self.n_batch_samples = int(5 * self.sample_rate) # 5 seconds
self.keep_sample_seconds = 0.15
if self.transcriber_mode == RecordingTranscriberMode.APPEND_AND_CORRECT:
self.n_batch_samples = 3 * self.sample_rate # 3 seconds
self.n_batch_samples = int(transcription_options.transcription_step * self.sample_rate)
self.keep_sample_seconds = 1.5
# pause queueing if more than 3 batches behind
self.max_queue_size = 3 * self.n_batch_samples
@ -71,8 +77,10 @@ class RecordingTranscriber(QObject):
key=Settings.Key.OPENAI_API_MODEL, default_value="whisper-1"
)
self.process = None
self._stderr_lines: list[bytes] = []
def start(self):
self.is_running = True
model = None
model_path = self.model_path
keep_samples = int(self.keep_sample_seconds * self.sample_rate)
@ -88,6 +96,12 @@ class RecordingTranscriber(QObject):
model = whisper.load_model(model_path, device=device)
elif self.transcription_options.model.model_type == ModelType.WHISPER_CPP:
self.start_local_whisper_server()
if self.openai_client is None:
if not self.is_running:
self.finished.emit()
else:
self.error.emit(_("Whisper server failed to start. Check logs for details."))
return
elif self.transcription_options.model.model_type == ModelType.FASTER_WHISPER:
model_root_dir = user_cache_dir("Buzz")
model_root_dir = os.path.join(model_root_dir, "models")
@ -120,14 +134,6 @@ class RecordingTranscriber(QObject):
cpu_threads=(os.cpu_count() or 8)//2,
)
# This was commented out as it was causing issues. On the other hand some users are reporting errors without
# this. It is possible issues were present in older model versions without some config files and now are fixed
#
# Fix for large-v3 https://github.com/guillaumekln/faster-whisper/issues/547#issuecomment-1797962599
# if self.transcription_options.model.whisper_model_size in {WhisperModelSize.LARGEV3, WhisperModelSize.LARGEV3TURBO}:
# model.feature_extractor.mel_filters = model.feature_extractor.get_mel_filters(
# model.feature_extractor.sampling_rate, model.feature_extractor.n_fft, n_mels=128
# )
elif self.transcription_options.model.model_type == ModelType.OPEN_AI_WHISPER_API:
custom_openai_base_url = self.settings.value(
key=Settings.Key.CUSTOM_OPENAI_BASE_URL, default_value=""
@ -152,7 +158,6 @@ class RecordingTranscriber(QObject):
self.input_device_index,
)
self.is_running = True
try:
with self.sounddevice.InputStream(
samplerate=self.sample_rate,
@ -164,11 +169,19 @@ class RecordingTranscriber(QObject):
while self.is_running:
if self.queue.size >= self.n_batch_samples:
self.mutex.acquire()
samples = self.queue[: self.n_batch_samples]
self.queue = self.queue[self.n_batch_samples - keep_samples:]
cut = self.find_silence_cut_point(
self.queue[:self.n_batch_samples], self.sample_rate
)
samples = self.queue[:cut]
if self.transcriber_mode == RecordingTranscriberMode.APPEND_AND_CORRECT:
self.queue = self.queue[cut - keep_samples:]
else:
self.queue = self.queue[cut:]
self.mutex.release()
amplitude = self.amplitude(samples)
self.average_amplitude_changed.emit(amplitude)
self.queue_size_changed.emit(self.queue.size)
logging.debug(
"Processing next frame, sample size = %s, queue size = %s, amplitude = %s",
@ -177,7 +190,7 @@ class RecordingTranscriber(QObject):
amplitude,
)
if amplitude < 0.025:
if amplitude < self.transcription_options.silence_threshold:
time.sleep(0.5)
continue
@ -193,8 +206,9 @@ class RecordingTranscriber(QObject):
language=self.transcription_options.language,
task=self.transcription_options.task.value,
initial_prompt=initial_prompt,
temperature=self.transcription_options.temperature,
no_speech_threshold=0.4
temperature=DEFAULT_WHISPER_TEMPERATURE,
no_speech_threshold=0.4,
fp16=False,
)
elif (
self.transcription_options.model.model_type
@ -208,7 +222,7 @@ class RecordingTranscriber(QObject):
else None,
task=self.transcription_options.task.value,
# Prevent crash on Windows https://github.com/SYSTRAN/faster-whisper/issues/71#issuecomment-1526263764
temperature=0 if platform.system() == "Windows" else self.transcription_options.temperature,
temperature=0 if platform.system() == "Windows" else DEFAULT_WHISPER_TEMPERATURE,
initial_prompt=self.transcription_options.initial_prompt,
word_timestamps=False,
without_timestamps=True,
@ -241,8 +255,7 @@ class RecordingTranscriber(QObject):
)
else: # OPEN_AI_WHISPER_API, also used for WHISPER_CPP
if self.openai_client is None:
self.transcription.emit(_("A connection error occurred"))
self.stop_recording()
self.error.emit(_("A connection error occurred"))
return
# scale samples to 16-bit PCM
@ -292,7 +305,7 @@ class RecordingTranscriber(QObject):
next_text: str = result.get("text")
# Update initial prompt between successive recording chunks
initial_prompt += next_text
initial_prompt = next_text
logging.debug(
"Received next result, length = %s, time taken = %s",
@ -305,17 +318,22 @@ class RecordingTranscriber(QObject):
except PortAudioError as exc:
self.error.emit(str(exc))
logging.exception("")
logging.exception("PortAudio error during recording")
return
except Exception as exc:
logging.exception("Unexpected error during recording")
self.error.emit(str(exc))
return
self.finished.emit()
# Cleanup
# Cleanup before emitting finished to avoid destroying QThread
# while this function is still on the call stack
if model:
del model
if torch.cuda.is_available():
torch.cuda.empty_cache()
self.finished.emit()
@staticmethod
def get_device_sample_rate(device_id: Optional[int]) -> int:
"""Returns the sample rate to be used for recording. It uses the default sample rate
@ -335,19 +353,64 @@ class RecordingTranscriber(QObject):
def stream_callback(self, in_data: np.ndarray, frame_count, time_info, status):
# Try to enqueue the next block. If the queue is already full, drop the block.
chunk: np.ndarray = in_data.ravel()
amplitude = self.amplitude(chunk)
self.amplitude_changed.emit(amplitude)
with self.mutex:
if self.queue.size < self.max_queue_size:
self.queue = np.append(self.queue, chunk)
@staticmethod
def find_silence_cut_point(samples: np.ndarray, sample_rate: int,
search_seconds: float = 1.5,
window_seconds: float = 0.02,
silence_ratio: float = 0.5) -> int:
"""Return index of the last quiet point in the final search_seconds of samples.
Scans backwards through short windows; returns the midpoint of the rightmost
window whose RMS is below silence_ratio * mean_rms of the search region.
Falls back to len(samples) if no quiet window is found.
"""
window = int(window_seconds * sample_rate)
search_start = max(0, len(samples) - int(search_seconds * sample_rate))
region = samples[search_start:]
n_windows = (len(region) - window) // window
if n_windows < 1:
return len(samples)
energies = np.array([
np.sqrt(np.mean(region[i * window:(i + 1) * window] ** 2))
for i in range(n_windows)
])
mean_energy = energies.mean()
threshold = silence_ratio * mean_energy
for i in range(n_windows - 1, -1, -1):
if energies[i] < threshold:
cut = search_start + i * window + window // 2
return cut
return len(samples)
@staticmethod
def amplitude(arr: np.ndarray):
return (abs(max(arr)) + abs(min(arr))) / 2
return float(np.sqrt(np.mean(arr**2)))
def _drain_stderr(self):
if self.process and self.process.stderr:
for line in self.process.stderr:
self._stderr_lines.append(line)
def stop_recording(self):
self.is_running = False
if self.process and self.process.poll() is None:
self.process.terminate()
self.process.wait(5000)
try:
self.process.wait(timeout=5)
except subprocess.TimeoutExpired:
self.process.kill()
logging.warning("Whisper server process had to be killed after timeout")
def start_local_whisper_server(self):
# Reduce verbose HTTP client logging from OpenAI/httpx
@ -357,6 +420,9 @@ class RecordingTranscriber(QObject):
self.transcription.emit(_("Starting Whisper.cpp..."))
if platform.system() == "Darwin" and platform.machine() == "arm64":
self.transcription.emit(_("First time use of a model may take up to several minutest to load."))
self.process = None
server_executable = "whisper-server.exe" if sys.platform == "win32" else "whisper-server"
@ -373,8 +439,6 @@ class RecordingTranscriber(QObject):
"--threads", str(os.getenv("BUZZ_WHISPERCPP_N_THREADS", (os.cpu_count() or 8) // 2)),
"--model", self.model_path,
"--no-timestamps",
# on Windows context causes duplications of last message
"--no-context",
# Protections against hallucinated repetition. Seems to be problem on macOS
# https://github.com/ggml-org/whisper.cpp/issues/1507
"--max-context", "64",
@ -408,20 +472,27 @@ class RecordingTranscriber(QObject):
except Exception as e:
error_msg = f"Failed to start whisper-server subprocess: {str(e)}"
logging.error(error_msg)
self.error.emit(error_msg)
return
# Wait for server to start and load model
time.sleep(10)
# Drain stderr in a background thread to prevent pipe buffer from filling
# up and blocking the subprocess (especially on Windows with compiled exe).
self._stderr_lines = []
stderr_thread = threading.Thread(target=self._drain_stderr, daemon=True)
stderr_thread.start()
# Wait for server to start and load model, checking periodically
for i in range(100): # 10 seconds total, in 0.1s increments
if not self.is_running or self.process.poll() is not None:
break
time.sleep(0.1)
if self.process is not None and self.process.poll() is None:
self.transcription.emit(_("Starting transcription..."))
logging.debug(f"Whisper server started successfully.")
logging.debug(f"Model: {self.model_path}")
else:
stderr_output = ""
if self.process.stderr is not None:
stderr_output = self.process.stderr.read().decode()
stderr_thread.join(timeout=2)
stderr_output = b"".join(self._stderr_lines).decode(errors="replace")
logging.error(f"Whisper server failed to start. Error: {stderr_output}")
self.transcription.emit(_("Whisper server failed to start. Check logs for details."))
@ -447,4 +518,7 @@ class RecordingTranscriber(QObject):
def __del__(self):
if self.process and self.process.poll() is None:
self.process.terminate()
self.process.wait(5000)
try:
self.process.wait(timeout=5)
except subprocess.TimeoutExpired:
self.process.kill()

View file

@ -153,6 +153,9 @@ class TranscriptionOptions:
enable_llm_translation: bool = False
llm_prompt: str = ""
llm_model: str = ""
silence_threshold: float = 0.0025
line_separator: str = "\n\n"
transcription_step: float = 3.5
def humanize_language(language: str) -> str:
@ -199,6 +202,8 @@ class FileTranscriptionTask:
output_directory: Optional[str] = None
source: Source = Source.FILE_IMPORT
file_path: Optional[str] = None
original_file_path: Optional[str] = None # Original path before speech extraction
delete_source_file: bool = False
url: Optional[str] = None
fraction_downloaded: float = 0.0
@ -229,6 +234,9 @@ def get_output_file_path(
export_file_name_template: str | None = None,
):
input_file_name = os.path.splitext(os.path.basename(file_path))[0]
# Remove "_speech" suffix from extracted speech files
if input_file_name.endswith("_speech"):
input_file_name = input_file_name[:-7]
date_time_now = datetime.datetime.now().strftime("%d-%b-%Y %H-%M-%S")
export_file_name_template = (

View file

@ -109,6 +109,11 @@ class WhisperCpp:
"-f", file_to_process,
]
# Add VAD if the model is available
vad_model_path = os.path.join(os.path.dirname(whisper_cli_path), "ggml-silero-v6.2.0.bin")
if os.path.exists(vad_model_path):
cmd.extend(["--vad", "--vad-model", vad_model_path])
# Add translate flag if needed
if task.transcription_options.task == Task.TRANSLATE:
cmd.extend(["--translate"])
@ -180,79 +185,165 @@ class WhisperCpp:
# Extract word-level timestamps from tokens array
# Combine tokens into words using similar logic as whisper_cpp.py
transcription = result.get("transcription", [])
# Languages that don't use spaces between words
# For these, each token is treated as a separate word
non_space_languages = {"zh", "ja", "th", "lo", "km", "my"}
is_non_space_language = language in non_space_languages
for segment_data in transcription:
tokens = segment_data.get("tokens", [])
# Accumulate tokens into words
word_buffer = b""
word_start = 0
word_end = 0
def append_word(buffer: bytes, start: int, end: int):
"""Try to decode and append a word segment, handling multi-byte UTF-8"""
if not buffer:
return True
# Try to decode as UTF-8
# https://github.com/ggerganov/whisper.cpp/issues/1798
try:
text = buffer.decode("utf-8").strip()
if text:
segments.append(
Segment(
start=start,
end=end,
text=text,
translation=""
if is_non_space_language:
# For languages without spaces (Chinese, Japanese, etc.),
# each complete UTF-8 character is treated as a separate word.
# Some characters may be split across multiple tokens as raw bytes.
char_buffer = b""
char_start = 0
char_end = 0
def flush_complete_chars(buffer: bytes, start: int, end: int):
"""Extract and output all complete UTF-8 characters from buffer.
Returns any remaining incomplete bytes."""
nonlocal segments
remaining = buffer
pos = 0
while pos < len(remaining):
# Try to decode one character at a time
for char_len in range(1, min(5, len(remaining) - pos + 1)):
try:
char = remaining[pos:pos + char_len].decode("utf-8")
# Successfully decoded a character
if char.strip():
segments.append(
Segment(
start=start,
end=end,
text=char,
translation=""
)
)
pos += char_len
break
except UnicodeDecodeError:
if char_len == 4 or pos + char_len >= len(remaining):
# Incomplete character at end - return as remaining
return remaining[pos:]
else:
# Couldn't decode, might be incomplete at end
return remaining[pos:]
return b""
for token_data in tokens:
token_text = token_data.get("text", "")
# Skip special tokens like [_TT_], [_BEG_]
if token_text.startswith("[_"):
continue
if not token_text:
continue
token_start = int(token_data.get("offsets", {}).get("from", 0))
token_end = int(token_data.get("offsets", {}).get("to", 0))
# Convert latin-1 string back to original bytes
token_bytes = token_text.encode("latin-1")
if not char_buffer:
char_start = token_start
char_buffer += token_bytes
char_end = token_end
# Try to flush complete characters
char_buffer = flush_complete_chars(char_buffer, char_start, char_end)
# If buffer was fully flushed, reset start time for next char
if not char_buffer:
char_start = token_end
# Flush any remaining buffer at end of segment
if char_buffer:
flush_complete_chars(char_buffer, char_start, char_end)
else:
# For space-separated languages, accumulate tokens into words
word_buffer = b""
word_start = 0
word_end = 0
def append_word(buffer: bytes, start: int, end: int):
"""Try to decode and append a word segment, handling multi-byte UTF-8"""
if not buffer:
return True
# Try to decode as UTF-8
# https://github.com/ggerganov/whisper.cpp/issues/1798
try:
text = buffer.decode("utf-8").strip()
if text:
segments.append(
Segment(
start=start,
end=end,
text=text,
translation=""
)
)
)
return True
except UnicodeDecodeError:
# Multi-byte character is split, continue accumulating
return False
for token_data in tokens:
# Token text is read as latin-1, need to convert to bytes to get original data
token_text = token_data.get("text", "")
# Skip special tokens like [_TT_], [_BEG_]
if token_text.startswith("[_"):
continue
if not token_text:
continue
token_start = int(token_data.get("offsets", {}).get("from", 0))
token_end = int(token_data.get("offsets", {}).get("to", 0))
# Convert latin-1 string back to original bytes
# (latin-1 preserves byte values as code points)
token_bytes = token_text.encode("latin-1")
# Check if token starts with space - indicates new word
if token_bytes.startswith(b" ") and word_buffer:
# Save previous word
append_word(word_buffer, word_start, word_end)
# Start new word
word_buffer = token_bytes
word_start = token_start
word_end = token_end
elif token_bytes.startswith(b", "):
# Handle comma - save word with comma, then start new word
word_buffer += b","
append_word(word_buffer, word_start, word_end)
word_buffer = token_bytes.lstrip(b",")
word_start = token_start
word_end = token_end
else:
# Accumulate token into current word
if not word_buffer:
return True
except UnicodeDecodeError:
# Multi-byte character is split, continue accumulating
return False
for token_data in tokens:
# Token text is read as latin-1, need to convert to bytes to get original data
token_text = token_data.get("text", "")
# Skip special tokens like [_TT_], [_BEG_]
if token_text.startswith("[_"):
continue
if not token_text:
continue
# Skip low probability tokens
token_p = token_data.get("p", 1.0)
if token_p < 0.01:
continue
token_start = int(token_data.get("offsets", {}).get("from", 0))
token_end = int(token_data.get("offsets", {}).get("to", 0))
# Convert latin-1 string back to original bytes
# (latin-1 preserves byte values as code points)
token_bytes = token_text.encode("latin-1")
# Check if token starts with space - indicates new word
if token_bytes.startswith(b" ") and word_buffer:
# Save previous word
append_word(word_buffer, word_start, word_end)
# Start new word
word_buffer = token_bytes
word_start = token_start
word_buffer += token_bytes
word_end = token_end
# Add the last word
append_word(word_buffer, word_start, word_end)
word_end = token_end
elif token_bytes.startswith(b", "):
# Handle comma - save word with comma, then start new word
word_buffer += b","
append_word(word_buffer, word_start, word_end)
word_buffer = token_bytes.lstrip(b",")
word_start = token_start
word_end = token_end
else:
# Accumulate token into current word
if not word_buffer:
word_start = token_start
word_buffer += token_bytes
word_end = token_end
# Add the last word
append_word(word_buffer, word_start, word_end)
else:
# Use segment-level timestamps
transcription = result.get("transcription", [])

View file

@ -5,6 +5,10 @@ import multiprocessing
import re
import os
import sys
# Preload CUDA libraries before importing torch - required for subprocess contexts
from buzz import cuda_setup # noqa: F401
import torch
import platform
import subprocess
@ -21,9 +25,10 @@ from buzz.conn import pipe_stderr
from buzz.model_loader import ModelType, WhisperModelSize, map_language_to_mms
from buzz.transformers_whisper import TransformersTranscriber
from buzz.transcriber.file_transcriber import FileTranscriber
from buzz.transcriber.transcriber import FileTranscriptionTask, Segment, Task
from buzz.transcriber.transcriber import FileTranscriptionTask, Segment, Task, DEFAULT_WHISPER_TEMPERATURE
from buzz.transcriber.whisper_cpp import WhisperCpp
import av
import faster_whisper
import whisper
import stable_whisper
@ -32,6 +37,22 @@ from stable_whisper import WhisperResult
PROGRESS_REGEX = re.compile(r"\d+(\.\d+)?%")
def check_file_has_audio_stream(file_path: str) -> None:
"""Check if a media file has at least one audio stream.
Raises:
ValueError: If the file has no audio streams.
"""
try:
with av.open(file_path) as container:
if len(container.streams.audio) == 0:
raise ValueError("No audio streams found")
except av.error.InvalidDataError as e:
raise ValueError(f"Invalid media file: {e}")
except av.error.FileNotFoundError:
raise ValueError("File not found")
class WhisperFileTranscriber(FileTranscriber):
"""WhisperFileTranscriber transcribes an audio file to text, writes the text to a file, and then opens the file
using the default program for opening txt files."""
@ -50,6 +71,7 @@ class WhisperFileTranscriber(FileTranscriber):
self.stopped = False
self.recv_pipe = None
self.send_pipe = None
self.error_message = None
def transcribe(self) -> List[Segment]:
time_started = datetime.datetime.now()
@ -115,7 +137,7 @@ class WhisperFileTranscriber(FileTranscriber):
logging.debug("Whisper process was terminated (exit code: %s), treating as cancellation", self.current_process.exitcode)
raise Exception("Transcription was canceled")
else:
raise Exception("Unknown error")
raise Exception(self.error_message or "Unknown error")
return self.segments
@ -123,10 +145,6 @@ class WhisperFileTranscriber(FileTranscriber):
def transcribe_whisper(
cls, stderr_conn: Connection, task: FileTranscriptionTask
) -> None:
# Preload CUDA libraries in the subprocess - must be done before importing torch
# This is needed because multiprocessing creates a fresh process without the main process's preloaded libraries
from buzz import cuda_setup # noqa: F401
# Patch subprocess on Windows to prevent console window flash
# This is needed because multiprocessing spawns a new process without the main process patches
if sys.platform == "win32":
@ -158,27 +176,36 @@ class WhisperFileTranscriber(FileTranscriber):
subprocess.run = _patched_run
subprocess.Popen = _PatchedPopen
with pipe_stderr(stderr_conn):
if task.transcription_options.model.model_type == ModelType.WHISPER_CPP:
segments = cls.transcribe_whisper_cpp(task)
elif task.transcription_options.model.model_type == ModelType.HUGGING_FACE:
sys.stderr.write("0%\n")
segments = cls.transcribe_hugging_face(task)
sys.stderr.write("100%\n")
elif (
task.transcription_options.model.model_type == ModelType.FASTER_WHISPER
):
segments = cls.transcribe_faster_whisper(task)
elif task.transcription_options.model.model_type == ModelType.WHISPER:
segments = cls.transcribe_openai_whisper(task)
else:
raise Exception(
f"Invalid model type: {task.transcription_options.model.model_type}"
)
try:
# Check if the file has audio streams before processing
check_file_has_audio_stream(task.file_path)
segments_json = json.dumps(segments, ensure_ascii=True, default=vars)
sys.stderr.write(f"segments = {segments_json}\n")
sys.stderr.write(WhisperFileTranscriber.READ_LINE_THREAD_STOP_TOKEN + "\n")
with pipe_stderr(stderr_conn):
if task.transcription_options.model.model_type == ModelType.WHISPER_CPP:
segments = cls.transcribe_whisper_cpp(task)
elif task.transcription_options.model.model_type == ModelType.HUGGING_FACE:
sys.stderr.write("0%\n")
segments = cls.transcribe_hugging_face(task)
sys.stderr.write("100%\n")
elif (
task.transcription_options.model.model_type == ModelType.FASTER_WHISPER
):
segments = cls.transcribe_faster_whisper(task)
elif task.transcription_options.model.model_type == ModelType.WHISPER:
segments = cls.transcribe_openai_whisper(task)
else:
raise Exception(
f"Invalid model type: {task.transcription_options.model.model_type}"
)
segments_json = json.dumps(segments, ensure_ascii=True, default=vars)
sys.stderr.write(f"segments = {segments_json}\n")
sys.stderr.write(WhisperFileTranscriber.READ_LINE_THREAD_STOP_TOKEN + "\n")
except Exception as e:
# Send error message back to the parent process
stderr_conn.send(f"error = {str(e)}\n")
stderr_conn.send(WhisperFileTranscriber.READ_LINE_THREAD_STOP_TOKEN + "\n")
raise
@classmethod
def transcribe_whisper_cpp(cls, task: FileTranscriptionTask) -> List[Segment]:
@ -265,7 +292,7 @@ class WhisperFileTranscriber(FileTranscriber):
language=task.transcription_options.language,
task=task.transcription_options.task.value,
# Prevent crash on Windows https://github.com/SYSTRAN/faster-whisper/issues/71#issuecomment-1526263764
temperature = 0 if platform.system() == "Windows" else task.transcription_options.temperature,
temperature = 0 if platform.system() == "Windows" else DEFAULT_WHISPER_TEMPERATURE,
initial_prompt=task.transcription_options.initial_prompt,
word_timestamps=task.transcription_options.word_level_timings,
no_speech_threshold=0.4,
@ -322,9 +349,10 @@ class WhisperFileTranscriber(FileTranscriber):
audio=whisper_audio.load_audio(task.file_path),
language=task.transcription_options.language,
task=task.transcription_options.task.value,
temperature=task.transcription_options.temperature,
temperature=DEFAULT_WHISPER_TEMPERATURE,
initial_prompt=task.transcription_options.initial_prompt,
no_speech_threshold=0.4,
fp16=False,
)
return [
Segment(
@ -344,6 +372,7 @@ class WhisperFileTranscriber(FileTranscriber):
temperature=task.transcription_options.temperature,
initial_prompt=task.transcription_options.initial_prompt,
verbose=False,
fp16=False,
)
segments = result.get("segments")
return [
@ -415,6 +444,8 @@ class WhisperFileTranscriber(FileTranscriber):
for segment in segments_dict
]
self.segments = segments
elif line.startswith("error = "):
self.error_message = line[8:]
else:
try:
match = PROGRESS_REGEX.search(line)

View file

@ -3,6 +3,10 @@ import sys
import logging
import platform
import numpy as np
# Preload CUDA libraries before importing torch
from buzz import cuda_setup # noqa: F401
import torch
import requests
from typing import Union
@ -225,7 +229,7 @@ class TransformersTranscriber:
model, processor, use_8bit = self._load_peft_model(device, torch_dtype)
else:
use_safetensors = True
if os.path.exists(self.model_id):
if os.path.isdir(self.model_id):
safetensors_files = [f for f in os.listdir(self.model_id) if f.endswith(".safetensors")]
use_safetensors = len(safetensors_files) > 0

View file

@ -1,17 +1,22 @@
import os
import re
import logging
import queue
from typing import Optional
from typing import Optional, List, Tuple
from openai import OpenAI, max_retries
from PyQt6.QtCore import QObject, pyqtSignal
from buzz.locale import _
from buzz.settings.settings import Settings
from buzz.store.keyring_store import get_password, Key
from buzz.transcriber.transcriber import TranscriptionOptions
from buzz.widgets.transcriber.advanced_settings_dialog import AdvancedSettingsDialog
BATCH_SIZE = 10
class Translator(QObject):
translation = pyqtSignal(str, int)
finished = pyqtSignal()
@ -51,6 +56,94 @@ class Translator(QObject):
max_retries=0
)
def _translate_single(self, transcript: str, transcript_id: int) -> Tuple[str, int]:
    """Send one transcript to the chat-completion API.

    Returns a ``(translation, transcript_id)`` pair. On any API failure the
    translation is an empty string and the failure is logged; the queue
    worker must never die because a single request failed.
    """
    completion = None
    try:
        completion = self.openai_client.chat.completions.create(
            model=self.transcription_options.llm_model,
            messages=[
                {"role": "system", "content": self.transcription_options.llm_prompt},
                {"role": "user", "content": transcript}
            ],
            timeout=60.0,
        )
    except Exception as e:
        # Swallow network/API errors on purpose and report them via the log.
        logging.error(f"Translation error! Server response: {e}")

    got_message = bool(
        completion and completion.choices and completion.choices[0].message
    )
    if not got_message:
        logging.error(f"Translation error! Server response: {completion}")
        # Translation error
        return "", transcript_id

    logging.debug(f"Received translation response: {completion}")
    return completion.choices[0].message.content, transcript_id
def _translate_batch(self, items: List[Tuple[str, int]]) -> List[Tuple[str, int]]:
    """Translate multiple transcripts in a single API call.

    ``items`` is a list of ``(transcript, transcript_id)`` pairs; the result
    is a list of ``(translation, transcript_id)`` pairs in the same order.
    Items the model dropped (or a failed request) yield empty translations.
    """
    # Number each transcript so the model can echo the same structure back.
    combined = "\n".join(
        f"[{index}] {text}" for index, (text, _) in enumerate(items, 1)
    )
    batch_prompt = (
        f"{self.transcription_options.llm_prompt}\n\n"
        f"You will receive {len(items)} numbered texts. "
        f"Process each one separately according to the instruction above "
        f"and return them in the exact same numbered format, e.g.:\n"
        f"[1] processed text\n[2] processed text"
    )

    completion = None
    try:
        completion = self.openai_client.chat.completions.create(
            model=self.transcription_options.llm_model,
            messages=[
                {"role": "system", "content": batch_prompt},
                {"role": "user", "content": combined}
            ],
            timeout=60.0,
        )
    except Exception as e:
        logging.error(f"Batch translation error! Server response: {e}")

    if not (completion and completion.choices and completion.choices[0].message):
        logging.error(f"Batch translation error! Server response: {completion}")
        # Translation error
        return [("", tid) for _, tid in items]

    response_text = completion.choices[0].message.content
    logging.debug(f"Received batch translation response: {response_text}")
    translations = self._parse_batch_response(response_text, len(items))

    # Re-attach ids in input order; pad with "" if the parse came up short.
    return [
        (translations[pos] if pos < len(translations) else "", tid)
        for pos, (_, tid) in enumerate(items)
    ]
@staticmethod
def _parse_batch_response(response: str, expected_count: int) -> List[str]:
"""Parse a numbered batch response like '[1] text\\n[2] text' into a list of strings."""
# Split on [N] markers — re.split with a group returns: [before, group1, after1, group2, after2, ...]
parts = re.split(r'\[(\d+)\]\s*', response)
translations = {}
for i in range(1, len(parts) - 1, 2):
num = int(parts[i])
text = parts[i + 1].strip()
translations[num] = text
return [
translations.get(i, "")
for i in range(1, expected_count + 1)
]
# NOTE(review): this hunk appears to contain interleaved OLD and NEW diff
# lines without +/- markers: the single-request implementation (the inline
# chat.completions.create call and next_translation handling) coexists with
# the batching rewrite (_translate_single/_translate_batch). Reconcile
# against the repository before relying on this text as-is.
def start(self):
logging.debug("Starting translation queue")
@ -62,30 +155,32 @@ class Translator(QObject):
logging.debug("Translation queue received stop signal")
break
transcript, transcript_id = item
# Collect a batch: start with the first item, then drain more
batch = [item]
stop_after_batch = False
while len(batch) < BATCH_SIZE:
try:
next_item = self.queue.get_nowait()
# A None item is the sentinel that asks the worker to stop.
if next_item is None:
stop_after_batch = True
break
batch.append(next_item)
except queue.Empty:
break
# NOTE(review): the block below looks like residual pre-batching code.
try:
completion = self.openai_client.chat.completions.create(
model=self.transcription_options.llm_model,
messages=[
{"role": "system", "content": self.transcription_options.llm_prompt},
{"role": "user", "content": transcript}
],
timeout=30.0,
)
except Exception as e:
completion = None
logging.error(f"Translation error! Server response: {e}")
if completion and completion.choices and completion.choices[0].message:
logging.debug(f"Received translation response: {completion}")
next_translation = completion.choices[0].message.content
# Single item: translate it directly; otherwise use one batched request.
if len(batch) == 1:
transcript, transcript_id = batch[0]
translation, tid = self._translate_single(transcript, transcript_id)
self.translation.emit(translation, tid)
else:
logging.error(f"Translation error! Server response: {completion}")
next_translation = "Translation error, see logs!"
logging.debug(f"Translating batch of {len(batch)} in single request")
results = self._translate_batch(batch)
for translation, tid in results:
self.translation.emit(translation, tid)
self.translation.emit(next_translation, transcript_id)
# Stop sentinel seen while draining the batch: finish after emitting.
if stop_after_batch:
logging.debug("Translation queue received stop signal")
break
logging.debug("Translation queue stopped")
self.finished.emit()

163
buzz/update_checker.py Normal file
View file

@ -0,0 +1,163 @@
import json
import logging
import platform
from datetime import datetime
from typing import Optional
from dataclasses import dataclass
from PyQt6.QtCore import QObject, pyqtSignal, QUrl
from PyQt6.QtNetwork import QNetworkAccessManager, QNetworkRequest, QNetworkReply
from buzz.__version__ import VERSION
from buzz.settings.settings import Settings
@dataclass
class UpdateInfo:
    """Details of an available release, emitted via UpdateChecker.update_available."""
    # Remote release version string, e.g. "1.2.3".
    version: str
    # Human-readable release notes from version_info.json.
    release_notes: str
    # Download URLs selected for the current platform (see _get_download_url).
    download_urls: list
class UpdateChecker(QObject):
    """Checks GitHub for a newer Buzz release.

    Fetches ``version_info.json`` from the latest GitHub release — on
    Windows and macOS only, and at most once every ``CHECK_INTERVAL_DAYS``
    days — and emits ``update_available`` with an :class:`UpdateInfo`
    when the remote version is newer than the running one.
    """

    # Carries an UpdateInfo payload when a newer release is found.
    update_available = pyqtSignal(object)

    VERSION_JSON_URL = "https://github.com/chidiwilliams/buzz/releases/latest/download/version_info.json"
    CHECK_INTERVAL_DAYS = 7

    def __init__(
        self,
        settings: Settings,
        network_manager: Optional[QNetworkAccessManager] = None,
        parent: Optional[QObject] = None
    ):
        super().__init__(parent)
        self.settings = settings
        # Allow injecting a manager (useful in tests); default to an owned one.
        if network_manager is None:
            network_manager = QNetworkAccessManager(self)
        self.network_manager = network_manager
        self.network_manager.finished.connect(self._on_reply_finished)

    def should_check_for_updates(self) -> bool:
        """Check if we are on Windows/macOS and if 7 days passed"""
        system = platform.system()
        if system not in ("Windows", "Darwin"):
            logging.debug("Skipping update check on linux")
            return False

        last_check = self.settings.value(
            Settings.Key.LAST_UPDATE_CHECK,
            "",
        )
        if last_check:
            try:
                last_check_date = datetime.fromisoformat(last_check)
                days_since_check = (datetime.now() - last_check_date).days
                if days_since_check < self.CHECK_INTERVAL_DAYS:
                    logging.debug(
                        f"Skipping update check, last checked {days_since_check} days ago"
                    )
                    return False
            except ValueError:
                # Invalid stored date format — treat as "never checked".
                pass
        return True

    def check_for_updates(self) -> None:
        """Start the network request (no-op when should_check_for_updates is False)."""
        if not self.should_check_for_updates():
            return
        logging.info("Checking for updates...")
        url = QUrl(self.VERSION_JSON_URL)
        request = QNetworkRequest(url)
        self.network_manager.get(request)

    def _on_reply_finished(self, reply: QNetworkReply) -> None:
        """Handles the network reply for version.json fetch"""
        # Record the attempt regardless of outcome so repeated failures do
        # not cause a check on every startup.
        self.settings.set_value(
            Settings.Key.LAST_UPDATE_CHECK,
            datetime.now().isoformat()
        )
        if reply.error() != QNetworkReply.NetworkError.NoError:
            error_msg = f"Failed to check for updates: {reply.errorString()}"
            logging.error(error_msg)
            reply.deleteLater()
            return

        # Read the payload and release the reply *before* parsing: the
        # previous version skipped deleteLater() when json.loads raised,
        # leaking the QNetworkReply object.
        raw = reply.readAll().data()
        reply.deleteLater()

        try:
            data = json.loads(raw.decode("utf-8"))

            remote_version = data.get("version", "")
            release_notes = data.get("release_notes", "")
            download_urls = data.get("download_urls", {})

            # Get the download url for current platform
            download_url = self._get_download_url(download_urls)

            if self._is_newer_version(remote_version):
                logging.info(f"Update available: {remote_version}")
                # Store the available version
                self.settings.set_value(
                    Settings.Key.UPDATE_AVAILABLE_VERSION,
                    remote_version
                )
                update_info = UpdateInfo(
                    version=remote_version,
                    release_notes=release_notes,
                    download_urls=download_url
                )
                self.update_available.emit(update_info)
            else:
                logging.info("No update available")
                self.settings.set_value(
                    Settings.Key.UPDATE_AVAILABLE_VERSION,
                    ""
                )
        except (json.JSONDecodeError, KeyError, UnicodeDecodeError) as e:
            # UnicodeDecodeError added: a malformed payload must not crash
            # the Qt event loop either.
            error_msg = f"Failed to parse version info: {e}"
            logging.error(error_msg)

    def _get_download_url(self, download_urls: dict) -> list:
        """Pick the download URL list matching the current OS/architecture."""
        system = platform.system()
        machine = platform.machine().lower()
        if system == "Windows":
            urls = download_urls.get("windows_x64", [])
        elif system == "Darwin":
            if machine in ("arm64", "aarch64"):
                urls = download_urls.get("macos_arm", [])
            else:
                urls = download_urls.get("macos_x86", [])
        else:
            # No packaged auto-update artifacts for other platforms.
            urls = []
        # Normalize a single-string entry to a one-element list.
        return urls if isinstance(urls, list) else [urls]

    def _is_newer_version(self, remote_version: str) -> bool:
        """Compare remote version with current version (dotted-integer scheme)."""
        try:
            current_parts = [int(x) for x in VERSION.split(".")]
            remote_parts = [int(x) for x in remote_version.split(".")]
            # Pad with zeros so "1.2" compares correctly against "1.2.0".
            while len(current_parts) < len(remote_parts):
                current_parts.append(0)
            while len(remote_parts) < len(current_parts):
                remote_parts.append(0)
            return remote_parts > current_parts
        except ValueError:
            # Non-numeric components (e.g. pre-release tags) are not supported.
            logging.error(f"Invalid version format: {VERSION} or {remote_version}")
            return False

View file

@ -34,6 +34,7 @@ class Application(QApplication):
if darkdetect.isDark():
self.styleHints().setColorScheme(Qt.ColorScheme.Dark)
self.setStyleSheet("QCheckBox::indicator:unchecked { border: 1px solid white; }")
if sys.platform.startswith("win"):
self.setStyle(QStyleFactory.create("Fusion"))

View file

@ -1,10 +1,12 @@
from typing import Optional
from PyQt6 import QtGui
from PyQt6.QtCore import Qt
from PyQt6.QtCore import Qt, QRect
from PyQt6.QtGui import QColor, QPainter
from PyQt6.QtWidgets import QWidget
from buzz.locale import _
class AudioMeterWidget(QWidget):
current_amplitude: float
@ -20,13 +22,17 @@ class AudioMeterWidget(QWidget):
def __init__(self, parent: Optional[QWidget] = None):
super().__init__(parent)
self.setMinimumWidth(10)
self.setFixedHeight(16)
self.setFixedHeight(56)
self.BARS_HEIGHT = 28
# Extra padding to fix layout
self.PADDING_TOP = 3
self.PADDING_TOP = 14
self.current_amplitude = 0.0
self.average_amplitude = 0.0
self.queue_size = 0
self.MINIMUM_AMPLITUDE = 0.00005 # minimum amplitude to show the first bar
self.AMPLITUDE_SCALE_FACTOR = 10 # scale the amplitudes such that 1/AMPLITUDE_SCALE_FACTOR will show all bars
@ -58,18 +64,39 @@ class AudioMeterWidget(QWidget):
center_x - ((i + 1) * (self.BAR_MARGIN + self.BAR_WIDTH)),
rect.top() + self.PADDING_TOP,
self.BAR_WIDTH,
rect.height() - self.PADDING_TOP,
self.BARS_HEIGHT - self.PADDING_TOP,
)
# draw to right
painter.drawRect(
center_x + (self.BAR_MARGIN + (i * (self.BAR_MARGIN + self.BAR_WIDTH))),
rect.top() + self.PADDING_TOP,
self.BAR_WIDTH,
rect.height() - self.PADDING_TOP,
self.BARS_HEIGHT - self.PADDING_TOP,
)
text_rect = QRect(rect.left(), self.BARS_HEIGHT, rect.width(), rect.height() - self.BARS_HEIGHT)
painter.setPen(self.BAR_ACTIVE_COLOR)
average_volume_label = _("Average volume")
queue_label = _("Queue")
painter.drawText(text_rect, Qt.AlignmentFlag.AlignCenter,
f"{average_volume_label}: {self.average_amplitude:.4f} {queue_label}: {self.queue_size}")
def reset_amplitude(self):
    """Clear current/average amplitude and queue size, then redraw immediately."""
    self.current_amplitude = self.average_amplitude = 0.0
    self.queue_size = 0
    self.repaint()
def update_amplitude(self, amplitude: float):
# Peak-hold with decay: keep the larger of the new sample and the
# previous value scaled by SMOOTHING_FACTOR.
self.current_amplitude = max(
amplitude, self.current_amplitude * self.SMOOTHING_FACTOR
)
# NOTE(review): both repaint() and update() appear here; this looks like
# interleaved old/new diff lines — likely only update() is intended.
self.repaint()
self.update()
def update_average_amplitude(self, amplitude: float):
# Store the externally computed average and schedule an async repaint.
self.average_amplitude = amplitude
self.update()
def update_queue_size(self, size: int):
# Store the audio-queue depth shown in the label and schedule a repaint.
self.queue_size = size
self.update()

View file

@ -129,3 +129,4 @@ ADD_ICON_PATH = get_path("assets/add_FILL0_wght700_GRAD0_opsz48.svg")
URL_ICON_PATH = get_path("assets/url.svg")
TRASH_ICON_PATH = get_path("assets/delete_FILL0_wght700_GRAD0_opsz48.svg")
CANCEL_ICON_PATH = get_path("assets/cancel_FILL0_wght700_GRAD0_opsz48.svg")
UPDATE_ICON_PATH = get_path("assets/update_FILL0_wght700_GRAD0_opsz48.svg")

View file

@ -1,6 +1,5 @@
import os
import logging
import keyring
from typing import Tuple, List, Optional
from uuid import UUID
@ -25,6 +24,8 @@ from buzz.db.service.transcription_service import TranscriptionService
from buzz.file_transcriber_queue_worker import FileTranscriberQueueWorker
from buzz.locale import _
from buzz.settings.settings import APP_NAME, Settings
from buzz.update_checker import UpdateChecker, UpdateInfo
from buzz.widgets.update_dialog import UpdateDialog
from buzz.settings.shortcuts import Shortcuts
from buzz.store.keyring_store import set_password, Key
from buzz.transcriber.transcriber import (
@ -42,6 +43,7 @@ from buzz.widgets.preferences_dialog.models.preferences import Preferences
from buzz.widgets.transcriber.file_transcriber_widget import FileTranscriberWidget
from buzz.widgets.transcription_task_folder_watcher import (
TranscriptionTaskFolderWatcher,
SUPPORTED_EXTENSIONS,
)
from buzz.widgets.transcription_tasks_table_widget import (
TranscriptionTasksTableWidget,
@ -70,6 +72,9 @@ class MainWindow(QMainWindow):
self.quit_on_complete = False
self.transcription_service = transcription_service
#update checker
self._update_info: Optional[UpdateInfo] = None
self.toolbar = MainWindowToolbar(shortcuts=self.shortcuts, parent=self)
self.toolbar.new_transcription_action_triggered.connect(
self.on_new_transcription_action_triggered
@ -87,6 +92,7 @@ class MainWindow(QMainWindow):
self.on_stop_transcription_action_triggered
)
self.addToolBar(self.toolbar)
self.toolbar.update_action_triggered.connect(self.on_update_action_triggered)
self.setUnifiedTitleAndToolBarOnMac(True)
self.preferences = self.load_preferences(settings=self.settings)
@ -101,6 +107,9 @@ class MainWindow(QMainWindow):
self.menu_bar.import_url_action_triggered.connect(
self.on_new_url_transcription_action_triggered
)
self.menu_bar.import_folder_action_triggered.connect(
self.on_import_folder_action_triggered
)
self.menu_bar.shortcuts_changed.connect(self.on_shortcuts_changed)
self.menu_bar.openai_api_key_changed.connect(
self.on_openai_access_token_changed
@ -153,6 +162,9 @@ class MainWindow(QMainWindow):
self.transcription_viewer_widget = None
#Initialize and run update checker
self._init_update_checker()
def on_preferences_changed(self, preferences: Preferences):
self.preferences = preferences
self.save_preferences(preferences)
@ -256,6 +268,20 @@ class MainWindow(QMainWindow):
if url is not None:
self.open_file_transcriber_widget(url=url)
def on_import_folder_action_triggered(self):
    """Ask the user for a folder and queue every supported media file in it."""
    folder = QFileDialog.getExistingDirectory(self, _("Select folder"))
    # Empty string means the dialog was cancelled.
    if not folder:
        return

    # Recursively collect files whose extension is supported.
    file_paths = [
        os.path.join(dirpath, filename)
        for dirpath, _dirs, filenames in os.walk(folder)
        for filename in filenames
        if os.path.splitext(filename)[1].lower() in SUPPORTED_EXTENSIONS
    ]
    if file_paths:
        self.open_file_transcriber_widget(file_paths)
def open_file_transcriber_widget(
self, file_paths: Optional[List[str]] = None, url: Optional[str] = None
):
@ -392,6 +418,7 @@ class MainWindow(QMainWindow):
basename = os.path.basename(task.file_path)
name = os.path.splitext(basename)[0] # Remove .wav extension
self.transcription_service.update_transcription_file_and_name(task.uid, task.file_path, name)
self.transcription_service.update_transcription_as_completed(task.uid, segments)
self.table_widget.refresh_row(task.uid)
@ -416,8 +443,6 @@ class MainWindow(QMainWindow):
self.save_geometry()
def closeEvent(self, event: QtGui.QCloseEvent) -> None:
logging.debug("Starting MainWindow closeEvent")
self.save_geometry()
self.settings.settings.sync()
@ -477,3 +502,27 @@ class MainWindow(QMainWindow):
self.setBaseSize(1240, 600)
self.resize(1240, 600)
self.settings.end_group()
def _init_update_checker(self):
"""Initializes and runs the update checker."""
# Parented to the window so Qt manages the checker's lifetime.
self.update_checker = UpdateChecker(settings=self.settings, parent=self)
self.update_checker.update_available.connect(self._on_update_available)
# Check for updates on startup
self.update_checker.check_for_updates()
def _on_update_available(self, update_info: UpdateInfo):
"""Called when an update is available."""
# Keep the info so on_update_action_triggered can show the dialog later.
self._update_info = update_info
self.toolbar.set_update_available(True)
def on_update_action_triggered(self):
    """Called when user clicks the update action in toolbar."""
    update_info = self._update_info
    # Nothing to show until an update notification has arrived.
    if update_info is None:
        return
    UpdateDialog(update_info=update_info, parent=self).exec()

View file

@ -16,6 +16,7 @@ from buzz.widgets.icon import (
EXPAND_ICON_PATH,
CANCEL_ICON_PATH,
TRASH_ICON_PATH,
UPDATE_ICON_PATH,
)
from buzz.widgets.recording_transcriber_widget import RecordingTranscriberWidget
from buzz.widgets.toolbar import ToolBar
@ -26,6 +27,7 @@ class MainWindowToolbar(ToolBar):
new_url_transcription_action_triggered: pyqtSignal
open_transcript_action_triggered: pyqtSignal
clear_history_action_triggered: pyqtSignal
update_action_triggered: pyqtSignal
ICON_LIGHT_THEME_BACKGROUND = "#555"
ICON_DARK_THEME_BACKGROUND = "#AAA"
@ -70,6 +72,13 @@ class MainWindowToolbar(ToolBar):
self.clear_history_action = Action(
Icon(TRASH_ICON_PATH, self), _("Clear History"), self
)
self.update_action = Action(
Icon(UPDATE_ICON_PATH, self), _("Update Available"), self
)
self.update_action_triggered = self.update_action.triggered
self.update_action.setVisible(False)
self.clear_history_action_triggered = self.clear_history_action.triggered
self.clear_history_action.setDisabled(True)
@ -86,6 +95,10 @@ class MainWindowToolbar(ToolBar):
self.clear_history_action,
]
)
self.addSeparator()
self.addAction(self.update_action)
self.setMovable(False)
self.setToolButtonStyle(Qt.ToolButtonStyle.ToolButtonIconOnly)
@ -93,12 +106,6 @@ class MainWindowToolbar(ToolBar):
self.record_action.setShortcut(
QKeySequence.fromString(self.shortcuts.get(Shortcut.OPEN_RECORD_WINDOW))
)
self.new_transcription_action.setShortcut(
QKeySequence.fromString(self.shortcuts.get(Shortcut.OPEN_IMPORT_WINDOW))
)
self.new_url_transcription_action.setShortcut(
QKeySequence.fromString(self.shortcuts.get(Shortcut.OPEN_IMPORT_URL_WINDOW))
)
self.stop_transcription_action.setShortcut(
QKeySequence.fromString(self.shortcuts.get(Shortcut.STOP_TRANSCRIPTION))
)
@ -120,3 +127,7 @@ class MainWindowToolbar(ToolBar):
def set_clear_history_action_enabled(self, enabled: bool):
self.clear_history_action.setEnabled(enabled)
def set_update_available(self, available: bool):
"""Shows or hides the update action in the toolbar."""
self.update_action.setVisible(available)

View file

@ -1,3 +1,4 @@
import platform
import webbrowser
from typing import Optional
@ -19,6 +20,7 @@ from buzz.widgets.preferences_dialog.preferences_dialog import (
class MenuBar(QMenuBar):
import_action_triggered = pyqtSignal()
import_url_action_triggered = pyqtSignal()
import_folder_action_triggered = pyqtSignal()
shortcuts_changed = pyqtSignal()
openai_api_key_changed = pyqtSignal(str)
preferences_changed = pyqtSignal(Preferences)
@ -41,12 +43,17 @@ class MenuBar(QMenuBar):
self.import_url_action = QAction(_("Import URL..."), self)
self.import_url_action.triggered.connect(self.import_url_action_triggered)
self.import_folder_action = QAction(_("Import Folder..."), self)
self.import_folder_action.triggered.connect(self.import_folder_action_triggered)
about_label = _("About")
about_action = QAction(f'{about_label} {APP_NAME}', self)
about_action.triggered.connect(self.on_about_action_triggered)
about_action.setMenuRole(QAction.MenuRole.AboutRole)
self.preferences_action = QAction(_("Preferences..."), self)
self.preferences_action.triggered.connect(self.on_preferences_action_triggered)
self.preferences_action.setMenuRole(QAction.MenuRole.PreferencesRole)
help_label = _("Help")
help_action = QAction(f'{help_label}', self)
@ -57,8 +64,10 @@ class MenuBar(QMenuBar):
file_menu = self.addMenu(_("File"))
file_menu.addAction(self.import_action)
file_menu.addAction(self.import_url_action)
file_menu.addAction(self.import_folder_action)
help_menu = self.addMenu(_("Help"))
help_menu_title = _("Help") + ("\u200B" if platform.system() == "Darwin" else "")
help_menu = self.addMenu(help_menu_title)
help_menu.addAction(about_action)
help_menu.addAction(help_action)
help_menu.addAction(self.preferences_action)

View file

@ -44,11 +44,16 @@ class FolderWatchPreferencesWidget(QWidget):
checkbox.setObjectName("EnableFolderWatchCheckbox")
checkbox.stateChanged.connect(self.on_enable_changed)
input_folder_browse_button = QPushButton(_("Browse"))
input_folder_browse_button.clicked.connect(self.on_click_browse_input_folder)
delete_checkbox = QCheckBox(_("Delete processed files"))
delete_checkbox.setChecked(config.delete_processed_files)
delete_checkbox.setObjectName("DeleteProcessedFilesCheckbox")
delete_checkbox.stateChanged.connect(self.on_delete_processed_files_changed)
output_folder_browse_button = QPushButton(_("Browse"))
output_folder_browse_button.clicked.connect(self.on_click_browse_output_folder)
self.input_folder_browse_button = QPushButton(_("Browse"))
self.input_folder_browse_button.clicked.connect(self.on_click_browse_input_folder)
self.output_folder_browse_button = QPushButton(_("Browse"))
self.output_folder_browse_button.clicked.connect(self.on_click_browse_output_folder)
input_folder_row = QHBoxLayout()
self.input_folder_line_edit = LineEdit(config.input_directory, self)
@ -57,7 +62,7 @@ class FolderWatchPreferencesWidget(QWidget):
self.input_folder_line_edit.setObjectName("InputFolderLineEdit")
input_folder_row.addWidget(self.input_folder_line_edit)
input_folder_row.addWidget(input_folder_browse_button)
input_folder_row.addWidget(self.input_folder_browse_button)
output_folder_row = QHBoxLayout()
self.output_folder_line_edit = LineEdit(config.output_directory, self)
@ -66,7 +71,7 @@ class FolderWatchPreferencesWidget(QWidget):
self.output_folder_line_edit.setObjectName("OutputFolderLineEdit")
output_folder_row.addWidget(self.output_folder_line_edit)
output_folder_row.addWidget(output_folder_browse_button)
output_folder_row.addWidget(self.output_folder_browse_button)
openai_access_token = get_password(Key.OPENAI_API_KEY)
(
@ -77,15 +82,17 @@ class FolderWatchPreferencesWidget(QWidget):
file_paths=[],
)
transcription_form_widget = FileTranscriptionFormWidget(
self.transcription_form_widget = FileTranscriptionFormWidget(
transcription_options=transcription_options,
file_transcription_options=file_transcription_options,
parent=self,
)
transcription_form_widget.transcription_options_changed.connect(
self.transcription_form_widget.transcription_options_changed.connect(
self.on_transcription_options_changed
)
self.delete_checkbox = delete_checkbox
layout = QVBoxLayout(self)
folders_form_layout = QFormLayout()
@ -93,14 +100,17 @@ class FolderWatchPreferencesWidget(QWidget):
folders_form_layout.addRow("", checkbox)
folders_form_layout.addRow(_("Input folder"), input_folder_row)
folders_form_layout.addRow(_("Output folder"), output_folder_row)
folders_form_layout.addWidget(transcription_form_widget)
folders_form_layout.addRow("", delete_checkbox)
folders_form_layout.addWidget(self.transcription_form_widget)
layout.addLayout(folders_form_layout)
layout.addWidget(transcription_form_widget)
layout.addWidget(self.transcription_form_widget)
layout.addStretch()
self.setLayout(layout)
self._set_settings_enabled(config.enabled)
def on_click_browse_input_folder(self):
folder = QFileDialog.getExistingDirectory(self, _("Select Input Folder"))
self.input_folder_line_edit.setText(folder)
@ -119,8 +129,22 @@ class FolderWatchPreferencesWidget(QWidget):
self.config.output_directory = folder
self.config_changed.emit(self.config)
def _set_settings_enabled(self, enabled: bool):
    """Enable or disable every folder-watch setting control at once."""
    controls = (
        self.input_folder_line_edit,
        self.input_folder_browse_button,
        self.output_folder_line_edit,
        self.output_folder_browse_button,
        self.delete_checkbox,
        self.transcription_form_widget,
    )
    for control in controls:
        control.setEnabled(enabled)
def on_enable_changed(self, state: int):
    """Toggle folder watching; state 2 is Qt's 'checked' value."""
    is_checked = state == 2
    self.config.enabled = is_checked
    # Grey out the dependent controls whenever the feature is off.
    self._set_settings_enabled(is_checked)
    self.config_changed.emit(self.config)
def on_delete_processed_files_changed(self, state: int):
    """Persist the 'delete processed files' checkbox; state 2 means checked."""
    is_checked = state == 2
    self.config.delete_processed_files = is_checked
    self.config_changed.emit(self.config)
def on_transcription_options_changed(

View file

@ -188,6 +188,14 @@ class GeneralPreferencesWidget(QWidget):
layout.addRow(_("Live recording mode"), self.recording_transcriber_mode)
export_note_label = QLabel(
_("Note: Live recording export settings will be moved to the Advanced Settings in the Live Recording screen in a future version."),
self,
)
export_note_label.setWordWrap(True)
export_note_label.setSizePolicy(QSizePolicy.Policy.Expanding, QSizePolicy.Policy.Preferred)
layout.addRow("", export_note_label)
self.reduce_gpu_memory_enabled = self.settings.value(
key=Settings.Key.REDUCE_GPU_MEMORY, default_value=False
)

View file

@ -7,7 +7,6 @@ from buzz.model_loader import TranscriptionModel
from buzz.transcriber.transcriber import (
Task,
OutputFormat,
DEFAULT_WHISPER_TEMPERATURE,
TranscriptionOptions,
FileTranscriptionOptions,
)
@ -20,7 +19,6 @@ class FileTranscriptionPreferences:
model: TranscriptionModel
word_level_timings: bool
extract_speech: bool
temperature: Tuple[float, ...]
initial_prompt: str
enable_llm_translation: bool
llm_prompt: str
@ -33,7 +31,6 @@ class FileTranscriptionPreferences:
settings.setValue("model", self.model)
settings.setValue("word_level_timings", self.word_level_timings)
settings.setValue("extract_speech", self.extract_speech)
settings.setValue("temperature", self.temperature)
settings.setValue("initial_prompt", self.initial_prompt)
settings.setValue("enable_llm_translation", self.enable_llm_translation)
settings.setValue("llm_model", self.llm_model)
@ -59,7 +56,6 @@ class FileTranscriptionPreferences:
extract_speech = False if extract_speech_value == "false" \
else bool(extract_speech_value)
temperature = settings.value("temperature", DEFAULT_WHISPER_TEMPERATURE)
initial_prompt = settings.value("initial_prompt", "")
enable_llm_translation_value = settings.value("enable_llm_translation", False)
enable_llm_translation = False if enable_llm_translation_value == "false" \
@ -75,7 +71,6 @@ class FileTranscriptionPreferences:
else TranscriptionModel.default(),
word_level_timings=word_level_timings,
extract_speech=extract_speech,
temperature=temperature,
initial_prompt=initial_prompt,
enable_llm_translation=enable_llm_translation,
llm_model=llm_model,
@ -94,7 +89,6 @@ class FileTranscriptionPreferences:
return FileTranscriptionPreferences(
task=transcription_options.task,
language=transcription_options.language,
temperature=transcription_options.temperature,
initial_prompt=transcription_options.initial_prompt,
enable_llm_translation=transcription_options.enable_llm_translation,
llm_model=transcription_options.llm_model,
@ -115,7 +109,6 @@ class FileTranscriptionPreferences:
TranscriptionOptions(
task=self.task,
language=self.language,
temperature=self.temperature,
initial_prompt=self.initial_prompt,
enable_llm_translation=self.enable_llm_translation,
llm_model=self.llm_model,

View file

@ -13,11 +13,13 @@ class FolderWatchPreferences:
input_directory: str
output_directory: str
file_transcription_options: FileTranscriptionPreferences
delete_processed_files: bool = False
def save(self, settings: QSettings):
settings.setValue("enabled", self.enabled)
settings.setValue("input_folder", self.input_directory)
settings.setValue("output_directory", self.output_directory)
settings.setValue("delete_processed_files", self.delete_processed_files)
settings.beginGroup("file_transcription_options")
self.file_transcription_options.save(settings)
settings.endGroup()
@ -29,6 +31,8 @@ class FolderWatchPreferences:
input_folder = settings.value("input_folder", defaultValue="", type=str)
output_folder = settings.value("output_directory", defaultValue="", type=str)
delete_value = settings.value("delete_processed_files", False)
delete_processed_files = False if delete_value == "false" else bool(delete_value)
settings.beginGroup("file_transcription_options")
file_transcription_options = FileTranscriptionPreferences.load(settings)
settings.endGroup()
@ -37,4 +41,5 @@ class FolderWatchPreferences:
input_directory=input_folder,
output_directory=output_folder,
file_transcription_options=file_transcription_options,
delete_processed_files=delete_processed_files,
)

View file

@ -7,6 +7,7 @@ from PyQt6.QtWidgets import QWidget, QFormLayout, QPushButton
from buzz.locale import _
from buzz.settings.shortcut import Shortcut
from buzz.settings.shortcuts import Shortcuts
from buzz.widgets.line_edit import LineEdit
from buzz.widgets.sequence_edit import SequenceEdit
@ -19,8 +20,10 @@ class ShortcutsEditorPreferencesWidget(QWidget):
self.shortcuts = shortcuts
self.layout = QFormLayout(self)
_field_height = LineEdit().sizeHint().height()
for shortcut in Shortcut:
sequence_edit = SequenceEdit(shortcuts.get(shortcut), self)
sequence_edit.setFixedHeight(_field_height)
sequence_edit.keySequenceChanged.connect(
self.get_key_sequence_changed(shortcut)
)

View file

@ -1,6 +1,9 @@
import csv
import io
import os
import re
import enum
import time
import requests
import logging
import datetime
@ -8,7 +11,7 @@ import sounddevice
from enum import auto
from typing import Optional, Tuple, Any
from PyQt6.QtCore import QThread, Qt, QThreadPool, QTimer
from PyQt6.QtCore import QThread, Qt, QThreadPool, QTimer, pyqtSignal
from PyQt6.QtGui import QTextCursor, QCloseEvent, QColor
from PyQt6.QtWidgets import (
QWidget,
@ -16,6 +19,7 @@ from PyQt6.QtWidgets import (
QFormLayout,
QHBoxLayout,
QMessageBox,
QApplication,
QPushButton,
QComboBox,
QLabel,
@ -38,7 +42,6 @@ from buzz.settings.recording_transcriber_mode import RecordingTranscriberMode
from buzz.transcriber.recording_transcriber import RecordingTranscriber
from buzz.transcriber.transcriber import (
TranscriptionOptions,
DEFAULT_WHISPER_TEMPERATURE,
Task,
)
from buzz.translator import Translator
@ -68,6 +71,8 @@ class RecordingTranscriberWidget(QWidget):
recording_amplitude_listener: Optional[RecordingAmplitudeListener] = None
device_sample_rate: Optional[int] = None
transcription_stopped = pyqtSignal()
class RecordingStatus(enum.Enum):
STOPPED = auto()
RECORDING = auto()
@ -133,10 +138,6 @@ class RecordingTranscriberWidget(QWidget):
initial_prompt=self.settings.value(
key=Settings.Key.RECORDING_TRANSCRIBER_INITIAL_PROMPT, default_value=""
),
temperature=self.settings.value(
key=Settings.Key.RECORDING_TRANSCRIBER_TEMPERATURE,
default_value=DEFAULT_WHISPER_TEMPERATURE,
),
word_level_timings=False,
enable_llm_translation=self.settings.value(
key=Settings.Key.RECORDING_TRANSCRIBER_ENABLE_LLM_TRANSLATION,
@ -148,6 +149,18 @@ class RecordingTranscriberWidget(QWidget):
llm_prompt=self.settings.value(
key=Settings.Key.RECORDING_TRANSCRIBER_LLM_PROMPT, default_value=""
),
silence_threshold=self.settings.value(
key=Settings.Key.RECORDING_TRANSCRIBER_SILENCE_THRESHOLD,
default_value=0.0025,
),
line_separator=self.settings.value(
key=Settings.Key.RECORDING_TRANSCRIBER_LINE_SEPARATOR,
default_value="\n\n",
),
transcription_step=self.settings.value(
key=Settings.Key.RECORDING_TRANSCRIBER_TRANSCRIPTION_STEP,
default_value=3.5,
),
)
self.audio_devices_combo_box = AudioDevicesComboBox(self)
@ -168,18 +181,27 @@ class RecordingTranscriberWidget(QWidget):
default_transcription_options=self.transcription_options,
model_types=model_types,
parent=self,
show_recording_settings=True,
)
self.transcription_options_group_box.transcription_options_changed.connect(
self.on_transcription_options_changed
)
self.transcription_options_group_box.advanced_settings_dialog.recording_mode_changed.connect(
self.on_recording_mode_changed
)
self.transcription_options_group_box.advanced_settings_dialog.hide_unconfirmed_changed.connect(
self.on_hide_unconfirmed_changed
)
recording_options_layout = QFormLayout()
recording_options_layout.addRow(_("Microphone:"), self.audio_devices_combo_box)
self.microphone_label = QLabel(_("Microphone:"))
recording_options_layout.addRow(self.microphone_label, self.audio_devices_combo_box)
self.audio_meter_widget = AudioMeterWidget(self)
record_button_layout = QHBoxLayout()
record_button_layout.addWidget(self.audio_meter_widget)
record_button_layout.setContentsMargins(0, 4, 0, 8)
record_button_layout.addWidget(self.audio_meter_widget, alignment=Qt.AlignmentFlag.AlignVCenter)
record_button_layout.addWidget(self.record_button)
layout.addWidget(self.transcription_options_group_box)
@ -192,12 +214,18 @@ class RecordingTranscriberWidget(QWidget):
self.translation_text_box.hide()
self.setLayout(layout)
self.resize(450, 500)
self.resize(700, 600)
self.reset_recording_amplitude_listener()
self._closing = False
self.transcript_export_file = None
self.translation_export_file = None
self.export_file_type = "txt"
self.export_max_entries = 0
self.hide_unconfirmed = self.settings.value(
Settings.Key.RECORDING_TRANSCRIBER_HIDE_UNCONFIRMED, True
)
self.export_enabled = self.settings.value(
key=Settings.Key.RECORDING_TRANSCRIBER_EXPORT_ENABLED,
default_value=False,
@ -209,6 +237,9 @@ class RecordingTranscriberWidget(QWidget):
self.presentation_options_bar = self.create_presentation_options_bar()
layout.insertWidget(3, self.presentation_options_bar)
self.presentation_options_bar.hide()
self.copy_actions_bar = self.create_copy_actions_bar()
layout.addWidget(self.copy_actions_bar) # Add at the bottom
self.copy_actions_bar.hide()
def create_presentation_options_bar(self) -> QWidget:
"""Crete the presentation options bar widget"""
@ -230,7 +261,7 @@ class RecordingTranscriberWidget(QWidget):
layout.addWidget(text_size_label)
self.text_size_spinbox = QSpinBox(bar)
self.text_size_spinbox.setRange(12, 72) #12pt to 72pt
self.text_size_spinbox.setRange(10, 100) #10pt to 100pt
saved_text_size = self.settings.value(
Settings.Key.PRESENTATION_WINDOW_TEXT_SIZE,
@ -286,6 +317,56 @@ class RecordingTranscriberWidget(QWidget):
return bar
def create_copy_actions_bar(self) -> QWidget:
    """Build the bottom action bar holding the copy-to-clipboard button."""
    container = QWidget(self)
    row = QHBoxLayout(container)
    row.setContentsMargins(5, 5, 5, 5)
    row.setSpacing(10)
    # Stretch first so the button ends up right-aligned.
    row.addStretch()
    copy_button = QPushButton(_("Copy"), container)
    copy_button.setToolTip(_("Copy transcription to clipboard"))
    copy_button.clicked.connect(self.on_copy_transcript_clicked)
    row.addWidget(copy_button)
    self.copy_transcript_button = copy_button
    return container
def on_copy_transcript_clicked(self):
    """Copy the current transcript text to the system clipboard.

    Gives feedback on the button itself: a temporary label change for the
    empty-transcript, clipboard-failure, and success cases. The previous
    version repeated the "set label + QTimer restore" pair four times;
    it is factored into a local helper here.
    """

    def flash_button(label: str, restore_after_ms: int) -> None:
        # Temporarily swap the button label, then restore the default "Copy".
        self.copy_transcript_button.setText(label)
        QTimer.singleShot(
            restore_after_ms,
            lambda: self.copy_transcript_button.setText(_("Copy")),
        )

    transcript_text = self.transcription_text_box.toPlainText().strip()
    if not transcript_text:
        flash_button(_("Nothing to copy!"), 1500)
        return

    app = QApplication.instance()
    if app is None:
        logging.warning("QApplication instance not available; clipboard disabled")
        flash_button(_("Copy failed"), 1500)
        return

    clipboard = app.clipboard()
    if clipboard is None:
        logging.warning("Clipboard not available")
        flash_button(_("Copy failed"), 1500)
        return

    try:
        clipboard.setText(transcript_text)
    except Exception as e:
        logging.warning("Clipboard error: %s", e)
        flash_button(_("Copy failed"), 1500)
        return

    flash_button(_("Copied!"), 2000)
def on_show_presentation_clicked(self):
"""Handle click on 'Show in new window' button"""
if self.presentation_window is None or not self.presentation_window.isVisible():
@ -385,7 +466,23 @@ class RecordingTranscriberWidget(QWidget):
date_time_now = datetime.datetime.now().strftime("%d-%b-%Y %H-%M-%S")
export_file_name_template = Settings().get_default_export_file_template()
custom_template = self.settings.value(
key=Settings.Key.RECORDING_TRANSCRIBER_EXPORT_FILE_NAME,
default_value="",
)
export_file_name_template = custom_template if custom_template else Settings().get_default_export_file_template()
self.export_file_type = self.settings.value(
key=Settings.Key.RECORDING_TRANSCRIBER_EXPORT_FILE_TYPE,
default_value="txt",
)
self.export_max_entries = self.settings.value(
Settings.Key.RECORDING_TRANSCRIBER_EXPORT_MAX_ENTRIES, 0, int
)
self.hide_unconfirmed = self.settings.value(
Settings.Key.RECORDING_TRANSCRIBER_HIDE_UNCONFIRMED, True
)
ext = ".csv" if self.export_file_type == "csv" else ".txt"
export_file_name = (
export_file_name_template.replace("{{ input_file_name }}", "live recording")
@ -394,14 +491,27 @@ class RecordingTranscriberWidget(QWidget):
.replace("{{ model_type }}", self.transcription_options.model.model_type.value)
.replace("{{ model_size }}", self.transcription_options.model.whisper_model_size or "")
.replace("{{ date_time }}", date_time_now)
+ ".txt"
+ ext
)
translated_ext = ".translated" + ext
if not os.path.isdir(export_folder):
self.export_enabled = False
self.transcript_export_file = os.path.join(export_folder, export_file_name)
self.translation_export_file = self.transcript_export_file.replace(".txt", ".translated.txt")
self.translation_export_file = self.transcript_export_file.replace(ext, translated_ext)
# Clear export files at the start of each recording session
for path in (self.transcript_export_file, self.translation_export_file):
if os.path.isfile(path):
self.write_to_export_file(path, "", mode="w")
def on_recording_mode_changed(self, mode: RecordingTranscriberMode):
    """Remember the recording mode selected in the advanced settings dialog."""
    self.transcriber_mode = mode
def on_hide_unconfirmed_changed(self, value: bool):
    """Track whether unconfirmed transcription fragments should be hidden from display."""
    self.hide_unconfirmed = value
def on_transcription_options_changed(
self, transcription_options: TranscriptionOptions
@ -454,16 +564,34 @@ class RecordingTranscriberWidget(QWidget):
self.recording_amplitude_listener.amplitude_changed.connect(
self.on_recording_amplitude_changed, Qt.ConnectionType.QueuedConnection
)
self.recording_amplitude_listener.average_amplitude_changed.connect(
self.audio_meter_widget.update_average_amplitude, Qt.ConnectionType.QueuedConnection
)
self.recording_amplitude_listener.start_recording()
def on_record_button_clicked(self):
if self.current_status == self.RecordingStatus.STOPPED:
# Stop amplitude listener and disconnect its signal before resetting
# to prevent queued amplitude events from overriding the reset
if self.recording_amplitude_listener is not None:
self.recording_amplitude_listener.amplitude_changed.disconnect(
self.on_recording_amplitude_changed
)
self.recording_amplitude_listener.average_amplitude_changed.disconnect(
self.audio_meter_widget.update_average_amplitude
)
self.recording_amplitude_listener.stop_recording()
self.recording_amplitude_listener = None
self.audio_meter_widget.reset_amplitude()
self.start_recording()
self.current_status = self.RecordingStatus.RECORDING
self.record_button.set_recording()
self.transcription_options_group_box.setEnabled(False)
self.audio_devices_combo_box.setEnabled(False)
self.microphone_label.setEnabled(False)
self.presentation_options_bar.show()
self.copy_actions_bar.hide()
else: # RecordingStatus.RECORDING
self.stop_recording()
self.set_recording_status_stopped()
@ -502,7 +630,6 @@ class RecordingTranscriberWidget(QWidget):
self.transcription_thread = QThread()
# TODO: make runnable
self.transcriber = RecordingTranscriber(
input_device_index=self.selected_device_id,
sample_rate=self.device_sample_rate,
@ -519,6 +646,19 @@ class RecordingTranscriberWidget(QWidget):
)
self.transcriber.transcription.connect(self.on_next_transcription)
self.transcriber.amplitude_changed.connect(
self.on_recording_amplitude_changed, Qt.ConnectionType.QueuedConnection
)
self.transcriber.average_amplitude_changed.connect(
self.audio_meter_widget.update_average_amplitude, Qt.ConnectionType.QueuedConnection
)
self.transcriber.queue_size_changed.connect(
self.audio_meter_widget.update_queue_size, Qt.ConnectionType.QueuedConnection
)
# Stop the separate amplitude listener to avoid two streams on the same device
if self.recording_amplitude_listener is not None:
self.recording_amplitude_listener.stop_recording()
self.transcriber.finished.connect(self.on_transcriber_finished)
self.transcriber.finished.connect(self.transcription_thread.quit)
@ -542,9 +682,15 @@ class RecordingTranscriberWidget(QWidget):
self.translation_thread.finished.connect(
self.translation_thread.deleteLater
)
self.translation_thread.finished.connect(
lambda: setattr(self, "translation_thread", None)
)
self.translator.finished.connect(self.translation_thread.quit)
self.translator.finished.connect(self.translator.deleteLater)
self.translator.finished.connect(
lambda: setattr(self, "translator", None)
)
self.translator.translation.connect(self.on_next_translation)
@ -573,13 +719,16 @@ class RecordingTranscriberWidget(QWidget):
self.current_status = self.RecordingStatus.STOPPED
self.transcription_options_group_box.setEnabled(True)
self.audio_devices_combo_box.setEnabled(True)
self.microphone_label.setEnabled(True)
self.presentation_options_bar.hide()
self.copy_actions_bar.show() #added this here
def on_download_model_error(self, error: str):
self.reset_model_download()
show_model_download_error_dialog(self, error)
self.stop_recording()
self.set_recording_status_stopped()
self.reset_recording_amplitude_listener()
self.record_button.setDisabled(False)
@staticmethod
@ -595,6 +744,102 @@ class RecordingTranscriberWidget(QWidget):
return text
@staticmethod
def write_to_export_file(file_path: str, content: str, mode: str = "a", retries: int = 5, delay: float = 0.2):
    """Write *content* to *file_path*, retrying transient PermissionErrors.

    Windows may briefly lock files opened by other programs; each
    PermissionError is retried up to *retries* times with *delay* seconds
    between attempts. Any other OSError aborts immediately with a warning.
    """
    last_attempt = retries - 1
    for attempt in range(retries):
        try:
            with open(file_path, mode, encoding='utf-8') as handle:
                handle.write(content)
        except PermissionError:
            if attempt == last_attempt:
                logging.warning("Export write failed after %d retries: %s", retries, file_path)
            else:
                time.sleep(delay)
        except OSError as err:
            logging.warning("Export write failed: %s", err)
            return
        else:
            return
@staticmethod
def write_csv_export(file_path: str, text: str, max_entries: int):
    """Append *text* as a new cell of the single-row CSV at *file_path*.

    The export file holds one CSV row whose cells are successive
    transcription entries. When *max_entries* is positive, only the most
    recent *max_entries* cells are kept. The final write is retried on
    PermissionError to cope with Windows file locking.
    """
    columns = []
    if os.path.isfile(file_path):
        try:
            with open(file_path, "r", encoding="utf-8-sig") as handle:
                raw = handle.read()
            if raw.strip():
                # The file holds a single row; only the first one matters.
                columns = next(csv.reader(io.StringIO(raw)), [])
        except OSError:
            # Best effort: an unreadable file is treated as empty.
            pass
    columns.append(text)
    if max_entries > 0:
        columns = columns[-max_entries:]
    serialized = io.StringIO()
    csv.writer(serialized).writerow(columns)
    payload = serialized.getvalue()
    for attempt in range(5):
        try:
            with open(file_path, "w", encoding='utf-8-sig') as handle:
                handle.write(payload)
            return
        except PermissionError:
            if attempt == 4:
                logging.warning("CSV export write failed after retries: %s", file_path)
            else:
                time.sleep(0.2)
        except OSError as err:
            logging.warning("CSV export write failed: %s", err)
            return
@staticmethod
def write_txt_export(file_path: str, text: str, mode: str, max_entries: int, line_separator: str):
    """Write *text* to the TXT export at *file_path*.

    Mode "a" appends the entry, "prepend" inserts it at the top, and any
    other mode is passed straight through to write_to_export_file. When
    *max_entries* is positive, the file is trimmed to that many
    separator-delimited entries, always keeping the newest ones.
    """
    entry = text + line_separator
    if mode == "a":
        RecordingTranscriberWidget.write_to_export_file(file_path, entry)
        if max_entries > 0 and os.path.isfile(file_path):
            stored = RecordingTranscriberWidget.read_export_file(file_path)
            entries = [chunk for chunk in stored.split(line_separator) if chunk]
            if len(entries) > max_entries:
                # Newest entries live at the tail of the file.
                trimmed = entries[-max_entries:]
                RecordingTranscriberWidget.write_to_export_file(
                    file_path, line_separator.join(trimmed) + line_separator, mode="w"
                )
    elif mode == "prepend":
        previous = ""
        if os.path.isfile(file_path):
            previous = RecordingTranscriberWidget.read_export_file(file_path)
        combined = entry + previous
        if max_entries > 0:
            entries = [chunk for chunk in combined.split(line_separator) if chunk]
            if len(entries) > max_entries:
                # Newest entries live at the head of the file.
                combined = line_separator.join(entries[:max_entries]) + line_separator
        RecordingTranscriberWidget.write_to_export_file(file_path, combined, mode="w")
    else:
        RecordingTranscriberWidget.write_to_export_file(file_path, text, mode=mode)
@staticmethod
def read_export_file(file_path: str, retries: int = 5, delay: float = 0.2) -> str:
    """Return the contents of *file_path*, retrying transient PermissionErrors.

    Windows can briefly lock export files opened by other programs, so a
    PermissionError is retried up to *retries* times with *delay* seconds
    between attempts. Any other OSError — or exhausting all retries —
    yields an empty string.
    """
    last_attempt = retries - 1
    for attempt in range(retries):
        try:
            with open(file_path, "r", encoding='utf-8') as handle:
                return handle.read()
        except PermissionError:
            if attempt == last_attempt:
                logging.warning("Export read failed after %d retries: %s", retries, file_path)
            else:
                time.sleep(delay)
        except OSError as err:
            logging.warning("Export read failed: %s", err)
            return ""
    return ""
# Sliding-window search for the longest common substring between two texts,
# ignoring differences at the start. (Originally AI/Copilot-generated — review carefully before modifying.)
@staticmethod
@ -631,16 +876,36 @@ class RecordingTranscriberWidget(QWidget):
def process_transcription_merge(self, text: str, texts, text_box, export_file):
texts.append(text)
# Possibly in future we want to tie this to some setting, to limit amount of data that needs
# to be processed and exported. Value should not be less than ~10, so we have enough data to
# work with.
# if len(texts) > 20:
# del texts[:len(texts) - 20]
# Remove possibly errorous parts from overlapping audio chunks
last_common_length = None
for i in range(len(texts) - 1):
common_part = self.find_common_part(texts[i], texts[i + 1])
if common_part:
common_length = len(common_part)
texts[i] = texts[i][:texts[i].rfind(common_part) + common_length]
texts[i + 1] = texts[i + 1][texts[i + 1].find(common_part):]
if i == len(texts) - 2:
last_common_length = common_length
elif i == len(texts) - 2:
last_common_length = None
# When hiding unconfirmed: trim the last text to only the part confirmed by overlap
# with the previous chunk. If no overlap found, drop the last text entirely.
display_texts = list(texts)
if self.hide_unconfirmed and len(display_texts) > 1:
if last_common_length is not None:
display_texts[-1] = display_texts[-1][:last_common_length]
else:
display_texts = display_texts[:-1]
merged_texts = ""
for text in texts:
for text in display_texts:
merged_texts = self.merge_text_no_overlap(merged_texts, text)
merged_texts = NO_SPACE_BETWEEN_SENTENCES.sub(r'\1 \2', merged_texts)
@ -649,8 +914,12 @@ class RecordingTranscriberWidget(QWidget):
text_box.moveCursor(QTextCursor.MoveOperation.End)
if self.export_enabled and export_file:
with open(export_file, "w") as f:
f.write(merged_texts)
if self.export_file_type == "csv":
# For APPEND_AND_CORRECT mode, rewrite the whole CSV with all merged text as a single entry
self.write_to_export_file(export_file, "", mode="w")
self.write_csv_export(export_file, merged_texts, 0)
else:
self.write_to_export_file(export_file, merged_texts, mode="w")
def on_next_transcription(self, text: str):
text = self.filter_text(text)
@ -664,28 +933,42 @@ class RecordingTranscriberWidget(QWidget):
if self.transcriber_mode == RecordingTranscriberMode.APPEND_BELOW:
self.transcription_text_box.moveCursor(QTextCursor.MoveOperation.End)
if len(self.transcription_text_box.toPlainText()) > 0:
self.transcription_text_box.insertPlainText("\n\n")
self.transcription_text_box.insertPlainText(self.transcription_options.line_separator)
self.transcription_text_box.insertPlainText(text)
self.transcription_text_box.moveCursor(QTextCursor.MoveOperation.End)
if self.export_enabled and self.transcript_export_file:
with open(self.transcript_export_file, "a") as f:
f.write(text + "\n\n")
if self.export_file_type == "csv":
self.write_csv_export(self.transcript_export_file, text, self.export_max_entries)
else:
self.write_txt_export(self.transcript_export_file, text, "a", self.export_max_entries, self.transcription_options.line_separator)
elif self.transcriber_mode == RecordingTranscriberMode.APPEND_ABOVE:
self.transcription_text_box.moveCursor(QTextCursor.MoveOperation.Start)
self.transcription_text_box.insertPlainText(text)
self.transcription_text_box.insertPlainText("\n\n")
self.transcription_text_box.insertPlainText(self.transcription_options.line_separator)
self.transcription_text_box.moveCursor(QTextCursor.MoveOperation.Start)
if self.export_enabled and self.transcript_export_file:
with open(self.transcript_export_file, "r") as f:
existing_content = f.read()
new_content = text + "\n\n" + existing_content
with open(self.transcript_export_file, "w") as f:
f.write(new_content)
if self.export_file_type == "csv":
# For APPEND_ABOVE, prepend in CSV means inserting at beginning of columns
existing_columns = []
if os.path.isfile(self.transcript_export_file):
raw = self.read_export_file(self.transcript_export_file)
if raw.strip():
reader = csv.reader(io.StringIO(raw))
for row in reader:
existing_columns = row
break
new_columns = [text] + existing_columns
if self.export_max_entries > 0:
new_columns = new_columns[:self.export_max_entries]
buf = io.StringIO()
writer = csv.writer(buf)
writer.writerow(new_columns)
self.write_to_export_file(self.transcript_export_file, buf.getvalue(), mode="w")
else:
self.write_txt_export(self.transcript_export_file, text, "prepend", self.export_max_entries, self.transcription_options.line_separator)
elif self.transcriber_mode == RecordingTranscriberMode.APPEND_AND_CORRECT:
self.process_transcription_merge(text, self.transcripts, self.transcription_text_box, self.transcript_export_file)
@ -715,28 +998,41 @@ class RecordingTranscriberWidget(QWidget):
if self.transcriber_mode == RecordingTranscriberMode.APPEND_BELOW:
self.translation_text_box.moveCursor(QTextCursor.MoveOperation.End)
if len(self.translation_text_box.toPlainText()) > 0:
self.translation_text_box.insertPlainText("\n\n")
self.translation_text_box.insertPlainText(self.transcription_options.line_separator)
self.translation_text_box.insertPlainText(self.strip_newlines(text))
self.translation_text_box.moveCursor(QTextCursor.MoveOperation.End)
if self.export_enabled:
with open(self.translation_export_file, "a") as f:
f.write(text + "\n\n")
if self.export_enabled and self.translation_export_file:
if self.export_file_type == "csv":
self.write_csv_export(self.translation_export_file, text, self.export_max_entries)
else:
self.write_txt_export(self.translation_export_file, text, "a", self.export_max_entries, self.transcription_options.line_separator)
elif self.transcriber_mode == RecordingTranscriberMode.APPEND_ABOVE:
self.translation_text_box.moveCursor(QTextCursor.MoveOperation.Start)
self.translation_text_box.insertPlainText(self.strip_newlines(text))
self.translation_text_box.insertPlainText("\n\n")
self.translation_text_box.insertPlainText(self.transcription_options.line_separator)
self.translation_text_box.moveCursor(QTextCursor.MoveOperation.Start)
if self.export_enabled:
with open(self.translation_export_file, "r") as f:
existing_content = f.read()
new_content = text + "\n\n" + existing_content
with open(self.translation_export_file, "w") as f:
f.write(new_content)
if self.export_enabled and self.translation_export_file:
if self.export_file_type == "csv":
existing_columns = []
if os.path.isfile(self.translation_export_file):
raw = self.read_export_file(self.translation_export_file)
if raw.strip():
reader = csv.reader(io.StringIO(raw))
for row in reader:
existing_columns = row
break
new_columns = [text] + existing_columns
if self.export_max_entries > 0:
new_columns = new_columns[:self.export_max_entries]
buf = io.StringIO()
writer = csv.writer(buf)
writer.writerow(new_columns)
self.write_to_export_file(self.translation_export_file, buf.getvalue(), mode="w")
else:
self.write_txt_export(self.translation_export_file, text, "prepend", self.export_max_entries, self.transcription_options.line_separator)
elif self.transcriber_mode == RecordingTranscriberMode.APPEND_AND_CORRECT:
self.process_transcription_merge(text, self.translations, self.translation_text_box, self.translation_export_file)
@ -769,10 +1065,14 @@ class RecordingTranscriberWidget(QWidget):
def on_transcriber_finished(self):
    """Handle the transcriber finishing: restore UI state and notify observers."""
    self.reset_record_button()
    # Restart amplitude listener now that the transcription stream is closed
    self.reset_recording_amplitude_listener()
    # Emit the widget's public signal so other components can react to the stop.
    self.transcription_stopped.emit()
def on_transcriber_error(self, error: str):
self.reset_record_button()
self.set_recording_status_stopped()
self.reset_recording_amplitude_listener()
QMessageBox.critical(
self,
"",
@ -789,6 +1089,7 @@ class RecordingTranscriberWidget(QWidget):
self.model_loader.cancel()
self.reset_model_download()
self.set_recording_status_stopped()
self.reset_recording_amplitude_listener()
self.record_button.setDisabled(False)
def reset_model_download(self):
@ -812,17 +1113,51 @@ class RecordingTranscriberWidget(QWidget):
self.audio_meter_widget.update_amplitude(amplitude)
def closeEvent(self, event: QCloseEvent) -> None:
    """Close the widget, deferring while a recording is still in progress.

    Three paths:
    - re-entry after a deferred close (``self._closing`` set): finish closing;
    - actively recording: ignore this event, stop recording, and re-trigger
      close once the transcription thread has finished;
    - idle: close immediately.
    """
    if self._closing:
        # Second call after deferred close — proceed normally
        self._do_close()
        super().closeEvent(event)
        return
    if self.current_status == self.RecordingStatus.RECORDING:
        # Defer the close until the transcription thread finishes to avoid
        # blocking the GUI thread with a synchronous wait.
        event.ignore()
        self._closing = True
        if self.model_loader is not None:
            self.model_loader.cancel()
        self.stop_recording()
        # Connect to QThread.finished — the transcriber C++ object may already
        # be scheduled for deletion via deleteLater() by this point.
        thread = self.transcription_thread
        if thread is not None:
            try:
                if thread.isRunning():
                    thread.finished.connect(self._on_close_transcriber_finished)
                else:
                    # Thread already done — complete the close right away.
                    self._on_close_transcriber_finished()
            except RuntimeError:
                # Underlying C++ QThread was already deleted; treat as finished.
                self._on_close_transcriber_finished()
        else:
            self._on_close_transcriber_finished()
        return
    # Not recording and not a re-entry: ordinary immediate close.
    self._do_close()
    super().closeEvent(event)
def _on_close_transcriber_finished(self):
    """Complete a deferred close once the transcription thread has stopped."""
    self.transcription_thread = None
    # Re-enter closeEvent; with _closing set it will proceed to _do_close().
    self.close()
def _do_close(self):
#Close presentation window if open
if self.presentation_window:
self.presentation_window.close()
self.presentation_window = None
self.fullscreen_button.setEnabled(False)
if self.model_loader is not None:
self.model_loader.cancel()
self.stop_recording()
if self.recording_amplitude_listener is not None:
self.recording_amplitude_listener.stop_recording()
self.recording_amplitude_listener.deleteLater()
@ -832,11 +1167,8 @@ class RecordingTranscriberWidget(QWidget):
self.translator.stop()
if self.translation_thread is not None:
# Just request quit — do not block the GUI thread waiting for it
self.translation_thread.quit()
# Only wait if thread is actually running
if self.translation_thread.isRunning():
if not self.translation_thread.wait(45_000):
logging.warning("Translation thread did not finish within timeout")
self.settings.set_value(
Settings.Key.RECORDING_TRANSCRIBER_LANGUAGE,
@ -845,10 +1177,6 @@ class RecordingTranscriberWidget(QWidget):
self.settings.set_value(
Settings.Key.RECORDING_TRANSCRIBER_TASK, self.transcription_options.task
)
self.settings.set_value(
Settings.Key.RECORDING_TRANSCRIBER_TEMPERATURE,
self.transcription_options.temperature,
)
self.settings.set_value(
Settings.Key.RECORDING_TRANSCRIBER_INITIAL_PROMPT,
self.transcription_options.initial_prompt,
@ -868,5 +1196,15 @@ class RecordingTranscriberWidget(QWidget):
Settings.Key.RECORDING_TRANSCRIBER_LLM_PROMPT,
self.transcription_options.llm_prompt,
)
return super().closeEvent(event)
self.settings.set_value(
Settings.Key.RECORDING_TRANSCRIBER_SILENCE_THRESHOLD,
self.transcription_options.silence_threshold,
)
self.settings.set_value(
Settings.Key.RECORDING_TRANSCRIBER_LINE_SEPARATOR,
self.transcription_options.line_separator,
)
self.settings.set_value(
Settings.Key.RECORDING_TRANSCRIBER_TRANSCRIPTION_STEP,
self.transcription_options.transcription_step,
)

View file

@ -7,23 +7,34 @@ from PyQt6.QtWidgets import (
QPlainTextEdit,
QFormLayout,
QLabel,
QDoubleSpinBox,
QLineEdit,
QComboBox,
QHBoxLayout,
QPushButton,
QSpinBox,
QFileDialog,
)
from buzz.locale import _
from buzz.model_loader import ModelType
from buzz.transcriber.transcriber import TranscriptionOptions
from buzz.settings.settings import Settings
from buzz.settings.recording_transcriber_mode import RecordingTranscriberMode
from buzz.widgets.line_edit import LineEdit
from buzz.widgets.transcriber.initial_prompt_text_edit import InitialPromptTextEdit
from buzz.widgets.transcriber.temperature_validator import TemperatureValidator
class AdvancedSettingsDialog(QDialog):
transcription_options: TranscriptionOptions
transcription_options_changed = pyqtSignal(TranscriptionOptions)
recording_mode_changed = pyqtSignal(RecordingTranscriberMode)
hide_unconfirmed_changed = pyqtSignal(bool)
def __init__(
self, transcription_options: TranscriptionOptions, parent: QWidget | None = None
self,
transcription_options: TranscriptionOptions,
parent: QWidget | None = None,
show_recording_settings: bool = False,
):
super().__init__(parent)
@ -31,29 +42,15 @@ class AdvancedSettingsDialog(QDialog):
self.settings = Settings()
self.setWindowTitle(_("Advanced Settings"))
self.setMinimumWidth(800)
layout = QFormLayout(self)
layout.setFieldGrowthPolicy(QFormLayout.FieldGrowthPolicy.ExpandingFieldsGrow)
transcription_settings_title= _("Speech recognition settings")
transcription_settings_title_label = QLabel(f"<h4>{transcription_settings_title}</h4>", self)
layout.addRow("", transcription_settings_title_label)
default_temperature_text = ", ".join(
[str(temp) for temp in transcription_options.temperature]
)
self.temperature_line_edit = LineEdit(default_temperature_text, self)
self.temperature_line_edit.setPlaceholderText(
_('Comma-separated, e.g. "0.0, 0.2, 0.4, 0.6, 0.8, 1.0"')
)
self.temperature_line_edit.setMinimumWidth(250)
self.temperature_line_edit.textChanged.connect(self.on_temperature_changed)
self.temperature_line_edit.setValidator(TemperatureValidator(self))
self.temperature_line_edit.setEnabled(
transcription_options.model.model_type == ModelType.WHISPER
)
layout.addRow(_("Temperature:"), self.temperature_line_edit)
self.initial_prompt_text_edit = InitialPromptTextEdit(
transcription_options.initial_prompt,
transcription_options.model.model_type,
@ -74,22 +71,160 @@ class AdvancedSettingsDialog(QDialog):
self.enable_llm_translation_checkbox.stateChanged.connect(self.on_enable_llm_translation_changed)
layout.addRow("", self.enable_llm_translation_checkbox)
self.llm_model_line_edit = LineEdit(self.transcription_options.llm_model, self)
self.llm_model_line_edit.textChanged.connect(
self.on_llm_model_changed
)
llm_model = self.transcription_options.llm_model or "gpt-4.1-mini"
self.llm_model_line_edit = LineEdit(llm_model, self)
self.llm_model_line_edit.textChanged.connect(self.on_llm_model_changed)
self.llm_model_line_edit.setMinimumWidth(170)
self.llm_model_line_edit.setEnabled(self.transcription_options.enable_llm_translation)
self.llm_model_line_edit.setPlaceholderText("gpt-4.1-mini")
layout.addRow(_("AI model:"), self.llm_model_line_edit)
self.llm_model_label = QLabel(_("AI model:"))
self.llm_model_label.setEnabled(self.transcription_options.enable_llm_translation)
layout.addRow(self.llm_model_label, self.llm_model_line_edit)
self.llm_prompt_text_edit = QPlainTextEdit(self.transcription_options.llm_prompt)
default_llm_prompt = self.transcription_options.llm_prompt or _(
"Please translate each text sent to you from English to Spanish. Translation will be used in an automated system, please do not add any comments or notes, just the translation."
)
self.llm_prompt_text_edit = QPlainTextEdit(default_llm_prompt)
self.llm_prompt_text_edit.setEnabled(self.transcription_options.enable_llm_translation)
self.llm_prompt_text_edit.setPlaceholderText(_("Enter instructions for AI on how to translate, for example 'Please translate each text sent to you from English to Spanish.'"))
self.llm_prompt_text_edit.setMinimumWidth(170)
self.llm_prompt_text_edit.setFixedHeight(115)
self.llm_prompt_text_edit.setFixedHeight(80)
self.llm_prompt_text_edit.textChanged.connect(self.on_llm_prompt_changed)
layout.addRow(_("Instructions for AI:"), self.llm_prompt_text_edit)
self.llm_prompt_label = QLabel(_("Instructions for AI:"))
self.llm_prompt_label.setEnabled(self.transcription_options.enable_llm_translation)
layout.addRow(self.llm_prompt_label, self.llm_prompt_text_edit)
if show_recording_settings:
recording_settings_title = _("Recording settings")
recording_settings_title_label = QLabel(f"<h4>{recording_settings_title}</h4>", self)
layout.addRow("", recording_settings_title_label)
self.silence_threshold_spin_box = QDoubleSpinBox(self)
self.silence_threshold_spin_box.setRange(0.0, 1.0)
self.silence_threshold_spin_box.setSingleStep(0.0005)
self.silence_threshold_spin_box.setDecimals(4)
self.silence_threshold_spin_box.setValue(transcription_options.silence_threshold)
self.silence_threshold_spin_box.valueChanged.connect(self.on_silence_threshold_changed)
self.silence_threshold_spin_box.setFixedWidth(90)
layout.addRow(_("Silence threshold:"), self.silence_threshold_spin_box)
# Live recording mode
self.recording_mode_combo = QComboBox(self)
for mode in RecordingTranscriberMode:
self.recording_mode_combo.addItem(mode.value)
self.recording_mode_combo.setCurrentIndex(
self.settings.value(Settings.Key.RECORDING_TRANSCRIBER_MODE, 0)
)
self.recording_mode_combo.currentIndexChanged.connect(self.on_recording_mode_changed)
self.recording_mode_combo.setFixedWidth(250)
layout.addRow(_("Live recording mode") + ":", self.recording_mode_combo)
self.line_separator_line_edit = QLineEdit(self)
line_sep_display = repr(transcription_options.line_separator)[1:-1] or r"\n\n"
self.line_separator_line_edit.setText(line_sep_display)
self.line_separator_line_edit.textChanged.connect(self.on_line_separator_changed)
self.line_separator_label = QLabel(_("Line separator:"))
layout.addRow(self.line_separator_label, self.line_separator_line_edit)
self.transcription_step_spin_box = QDoubleSpinBox(self)
self.transcription_step_spin_box.setRange(2.0, 5.0)
self.transcription_step_spin_box.setSingleStep(0.1)
self.transcription_step_spin_box.setDecimals(1)
self.transcription_step_spin_box.setValue(transcription_options.transcription_step)
self.transcription_step_spin_box.valueChanged.connect(self.on_transcription_step_changed)
self.transcription_step_spin_box.setFixedWidth(80)
self.transcription_step_label = QLabel(_("Transcription step:"))
layout.addRow(self.transcription_step_label, self.transcription_step_spin_box)
hide_unconfirmed = self.settings.value(
Settings.Key.RECORDING_TRANSCRIBER_HIDE_UNCONFIRMED, True
)
self.hide_unconfirmed_checkbox = QCheckBox(_("Hide unconfirmed"))
self.hide_unconfirmed_checkbox.setChecked(hide_unconfirmed)
self.hide_unconfirmed_checkbox.stateChanged.connect(self.on_hide_unconfirmed_changed)
self.hide_unconfirmed_label = QLabel("")
layout.addRow(self.hide_unconfirmed_label, self.hide_unconfirmed_checkbox)
self._update_recording_mode_visibility(
RecordingTranscriberMode(self.recording_mode_combo.currentText())
)
# Export enabled checkbox
self._export_enabled = self.settings.value(
Settings.Key.RECORDING_TRANSCRIBER_EXPORT_ENABLED, False
)
self.export_enabled_checkbox = QCheckBox(_("Enable live recording export"))
self.export_enabled_checkbox.setChecked(self._export_enabled)
self.export_enabled_checkbox.stateChanged.connect(self.on_export_enabled_changed)
layout.addRow("", self.export_enabled_checkbox)
# Export folder
export_folder = self.settings.value(
Settings.Key.RECORDING_TRANSCRIBER_EXPORT_FOLDER, ""
)
self.export_folder_line_edit = LineEdit(export_folder, self)
self.export_folder_line_edit.setEnabled(self._export_enabled)
self.export_folder_line_edit.textChanged.connect(self.on_export_folder_changed)
self.export_folder_browse_button = QPushButton(_("Browse"), self)
self.export_folder_browse_button.setEnabled(self._export_enabled)
self.export_folder_browse_button.clicked.connect(self.on_browse_export_folder)
export_folder_row = QHBoxLayout()
export_folder_row.addWidget(self.export_folder_line_edit)
export_folder_row.addWidget(self.export_folder_browse_button)
self.export_folder_label = QLabel(_("Export folder:"))
self.export_folder_label.setEnabled(self._export_enabled)
layout.addRow(self.export_folder_label, export_folder_row)
# Export file name template
export_file_name = self.settings.value(
Settings.Key.RECORDING_TRANSCRIBER_EXPORT_FILE_NAME, ""
)
self.export_file_name_line_edit = LineEdit(export_file_name, self)
self.export_file_name_line_edit.setEnabled(self._export_enabled)
self.export_file_name_line_edit.textChanged.connect(self.on_export_file_name_changed)
self.export_file_name_label = QLabel(_("Export file name:"))
self.export_file_name_label.setEnabled(self._export_enabled)
layout.addRow(self.export_file_name_label, self.export_file_name_line_edit)
# Export file type
self.export_file_type_combo = QComboBox(self)
self.export_file_type_combo.addItem(_("Text file (.txt)"), "txt")
self.export_file_type_combo.addItem(_("CSV (.csv)"), "csv")
current_type = self.settings.value(
Settings.Key.RECORDING_TRANSCRIBER_EXPORT_FILE_TYPE, "txt"
)
type_index = self.export_file_type_combo.findData(current_type)
if type_index >= 0:
self.export_file_type_combo.setCurrentIndex(type_index)
self.export_file_type_combo.setEnabled(self._export_enabled)
self.export_file_type_combo.currentIndexChanged.connect(self.on_export_file_type_changed)
self.export_file_type_combo.setFixedWidth(200)
self.export_file_type_label = QLabel(_("Export file type:"))
self.export_file_type_label.setEnabled(self._export_enabled)
layout.addRow(self.export_file_type_label, self.export_file_type_combo)
# Max entries
max_entries = self.settings.value(
Settings.Key.RECORDING_TRANSCRIBER_EXPORT_MAX_ENTRIES, 0, int
)
self.export_max_entries_spin = QSpinBox(self)
self.export_max_entries_spin.setRange(0, 99)
self.export_max_entries_spin.setValue(max_entries)
self.export_max_entries_spin.setEnabled(self._export_enabled)
self.export_max_entries_spin.valueChanged.connect(self.on_export_max_entries_changed)
self.export_max_entries_spin.setFixedWidth(90)
self.export_max_entries_label = QLabel(_("Limit export entries\n(0 = export all):"))
self.export_max_entries_label.setEnabled(self._export_enabled)
layout.addRow(self.export_max_entries_label, self.export_max_entries_spin)
_field_height = self.llm_model_line_edit.sizeHint().height()
for widget in (
self.line_separator_line_edit,
self.silence_threshold_spin_box,
self.recording_mode_combo,
self.transcription_step_spin_box,
self.export_file_type_combo,
self.export_max_entries_spin,
):
widget.setFixedHeight(_field_height)
button_box = QDialogButtonBox(
QDialogButtonBox.StandardButton(QDialogButtonBox.StandardButton.Ok), self
@ -100,15 +235,6 @@ class AdvancedSettingsDialog(QDialog):
layout.addWidget(button_box)
self.setLayout(layout)
self.resize(self.sizeHint())
def on_temperature_changed(self, text: str):
    """Parse the comma-separated temperature list and propagate it.

    Input that does not parse cleanly (non-numeric entries, empty
    items) is ignored so the user can keep typing; the options are
    only updated and re-emitted on a fully valid list.
    """
    try:
        parsed = tuple(float(part.strip()) for part in text.split(","))
    except ValueError:
        return
    self.transcription_options.temperature = parsed
    self.transcription_options_changed.emit(self.transcription_options)
def on_initial_prompt_changed(self):
self.transcription_options.initial_prompt = (
@ -120,8 +246,11 @@ class AdvancedSettingsDialog(QDialog):
self.transcription_options.enable_llm_translation = state == 2
self.transcription_options_changed.emit(self.transcription_options)
self.llm_model_line_edit.setEnabled(self.transcription_options.enable_llm_translation)
self.llm_prompt_text_edit.setEnabled(self.transcription_options.enable_llm_translation)
enabled = self.transcription_options.enable_llm_translation
self.llm_model_label.setEnabled(enabled)
self.llm_model_line_edit.setEnabled(enabled)
self.llm_prompt_label.setEnabled(enabled)
self.llm_prompt_text_edit.setEnabled(enabled)
def on_llm_model_changed(self, text: str):
self.transcription_options.llm_model = text
@ -132,3 +261,72 @@ class AdvancedSettingsDialog(QDialog):
self.llm_prompt_text_edit.toPlainText()
)
self.transcription_options_changed.emit(self.transcription_options)
def on_silence_threshold_changed(self, value: float):
    """Store the new silence threshold and re-emit the options."""
    self.transcription_options.silence_threshold = value
    self.transcription_options_changed.emit(self.transcription_options)
def on_line_separator_changed(self, text: str):
    """Interpret backslash escapes in the separator field and store it.

    The field shows escapes literally (e.g. ``\\n\\n``); decode them to
    the real characters before storing. Encoding with latin-1 +
    ``backslashreplace`` (instead of the previous plain ``encode()``,
    i.e. UTF-8) keeps non-ASCII separator characters intact, because
    ``unicode_escape`` decoding treats the byte string as latin-1.
    Malformed escapes (e.g. a truncated ``\\u12``) are ignored so the
    user can keep typing.
    """
    try:
        separator = text.encode("latin-1", "backslashreplace").decode(
            "unicode_escape"
        )
    except UnicodeDecodeError:
        return
    self.transcription_options.line_separator = separator
    self.transcription_options_changed.emit(self.transcription_options)
def on_recording_mode_changed(self, index: int):
    """Persist the selected live-recording mode, adjust which
    mode-specific controls are visible, and notify listeners."""
    self.settings.set_value(Settings.Key.RECORDING_TRANSCRIBER_MODE, index)
    # Combo items were added in enum declaration order, so the index
    # maps directly onto the enum members.
    selected_mode = list(RecordingTranscriberMode)[index]
    self._update_recording_mode_visibility(selected_mode)
    self.recording_mode_changed.emit(selected_mode)
def _update_recording_mode_visibility(self, mode: RecordingTranscriberMode):
    """Toggle mode-specific controls: append-and-correct mode shows the
    transcription-step and hide-unconfirmed controls, every other mode
    shows the line-separator field instead."""
    append_and_correct = mode == RecordingTranscriberMode.APPEND_AND_CORRECT
    for widget in (self.line_separator_label, self.line_separator_line_edit):
        widget.setVisible(not append_and_correct)
    for widget in (
        self.transcription_step_label,
        self.transcription_step_spin_box,
        self.hide_unconfirmed_label,
        self.hide_unconfirmed_checkbox,
    ):
        widget.setVisible(append_and_correct)
def on_transcription_step_changed(self, value: float):
    """Store the transcription step (rounded to 0.1 s to match the spin
    box precision) and re-emit the options."""
    self.transcription_options.transcription_step = round(value, 1)
    self.transcription_options_changed.emit(self.transcription_options)
def on_hide_unconfirmed_changed(self, state: int):
    """Persist the hide-unconfirmed preference and notify listeners."""
    checked = state == 2  # 2 == Qt.CheckState.Checked
    self.settings.set_value(
        Settings.Key.RECORDING_TRANSCRIBER_HIDE_UNCONFIRMED, checked
    )
    self.hide_unconfirmed_changed.emit(checked)
def on_export_enabled_changed(self, state: int):
    """Persist the export-enabled flag and enable/disable every
    export-related control accordingly."""
    enabled = state == 2  # 2 == Qt.CheckState.Checked
    self._export_enabled = enabled
    self.settings.set_value(
        Settings.Key.RECORDING_TRANSCRIBER_EXPORT_ENABLED, enabled
    )
    export_controls = (
        self.export_folder_label,
        self.export_folder_line_edit,
        self.export_folder_browse_button,
        self.export_file_name_label,
        self.export_file_name_line_edit,
        self.export_file_type_label,
        self.export_file_type_combo,
        self.export_max_entries_label,
        self.export_max_entries_spin,
    )
    for control in export_controls:
        control.setEnabled(enabled)
def on_export_folder_changed(self, text: str):
    """Persist the live-recording export folder path."""
    self.settings.set_value(
        Settings.Key.RECORDING_TRANSCRIBER_EXPORT_FOLDER, text
    )
def on_browse_export_folder(self):
    """Open a directory picker and fill the folder field if confirmed.

    Setting the line edit's text fires its textChanged signal, which
    persists the selection via on_export_folder_changed.
    """
    selected_folder = QFileDialog.getExistingDirectory(
        self, _("Select Export Folder")
    )
    if selected_folder:
        self.export_folder_line_edit.setText(selected_folder)
def on_export_file_name_changed(self, text: str):
    """Persist the live-recording export file name template."""
    self.settings.set_value(
        Settings.Key.RECORDING_TRANSCRIBER_EXPORT_FILE_NAME, text
    )
def on_export_file_type_changed(self, index: int):
    """Persist the export file type ('txt'/'csv') stored as the combo
    item's user data."""
    selected_type = self.export_file_type_combo.itemData(index)
    self.settings.set_value(
        Settings.Key.RECORDING_TRANSCRIBER_EXPORT_FILE_TYPE, selected_type
    )
def on_export_max_entries_changed(self, value: int):
    """Persist the export entry limit (0 means export everything)."""
    self.settings.set_value(
        Settings.Key.RECORDING_TRANSCRIBER_EXPORT_MAX_ENTRIES, value
    )

View file

@ -10,4 +10,4 @@ class InitialPromptTextEdit(QPlainTextEdit):
self.setPlaceholderText(_("Enter prompt..."))
self.setEnabled(model_type.supports_initial_prompt)
self.setMinimumWidth(350)
self.setFixedHeight(115)
self.setFixedHeight(80)

View file

@ -2,7 +2,7 @@ from typing import Optional
import os
from PyQt6.QtCore import pyqtSignal, Qt
from PyQt6.QtWidgets import QComboBox, QWidget
from PyQt6.QtWidgets import QComboBox, QWidget, QFrame
from PyQt6.QtGui import QStandardItem, QStandardItemModel
from buzz.locale import _
@ -51,3 +51,9 @@ class LanguagesComboBox(QComboBox):
def on_index_changed(self, index: int):
    """Emit the language code of the newly selected row."""
    language_code = self.languages[index][0]
    self.languageChanged.emit(language_code)
def showPopup(self):
    """Show the dropdown, capping the popup frame's height at 400 px so
    very long language lists don't overflow the screen."""
    super().showPopup()
    popup_frame = self.findChild(QFrame)
    if popup_frame is not None and popup_frame.height() > 400:
        popup_frame.setFixedHeight(400)

View file

@ -1,21 +0,0 @@
from typing import Optional, Tuple
from PyQt6.QtCore import QObject
from PyQt6.QtGui import QValidator
class TemperatureValidator(QValidator):
    """Validator for a comma-separated list of float temperatures
    (e.g. "0.0, 0.2, 0.4")."""

    def __init__(self, parent: Optional[QObject] = None) -> None:
        # The default was previously `...` (Ellipsis), which is not a
        # valid QObject parent and would be forwarded to
        # QValidator.__init__ when the argument was omitted; None is the
        # correct "no parent" default.
        super().__init__(parent)

    def validate(
        self, text: str, cursor_position: int
    ) -> Tuple["QValidator.State", str, int]:
        """Classify *text*.

        Returns Acceptable when every comma-separated item parses as a
        float, Intermediate when the last item is empty (the user just
        typed a comma and is still editing), and Invalid otherwise.
        """
        try:
            temp_strings = [temp.strip() for temp in text.split(",")]
            if temp_strings[-1] == "":
                return QValidator.State.Intermediate, text, cursor_position
            # Parse solely for validation; a failure raises ValueError.
            [float(temp) for temp in temp_strings]
            return QValidator.State.Acceptable, text, cursor_position
        except ValueError:
            return QValidator.State.Invalid, text, cursor_position

View file

@ -33,6 +33,7 @@ class TranscriptionOptionsGroupBox(QGroupBox):
default_transcription_options: TranscriptionOptions = TranscriptionOptions(),
model_types: Optional[List[ModelType]] = None,
parent: Optional[QWidget] = None,
show_recording_settings: bool = False,
):
super().__init__(title="", parent=parent)
self.settings = Settings()
@ -49,7 +50,9 @@ class TranscriptionOptionsGroupBox(QGroupBox):
self.model_type_combo_box.changed.connect(self.on_model_type_changed)
self.advanced_settings_dialog = AdvancedSettingsDialog(
transcription_options=self.transcription_options, parent=self
transcription_options=self.transcription_options,
parent=self,
show_recording_settings=show_recording_settings,
)
self.advanced_settings_dialog.transcription_options_changed.connect(
self.on_transcription_options_changed

View file

@ -11,6 +11,12 @@ from buzz.widgets.preferences_dialog.models.folder_watch_preferences import (
FolderWatchPreferences,
)
# Supported media file extensions (audio and video).
# NOTE: entries must be lowercase — callers compare against
# os.path.splitext(...)[1].lower().
SUPPORTED_EXTENSIONS = {
    ".mp3", ".wav", ".m4a", ".ogg", ".opus", ".flac",  # audio
    ".mp4", ".webm", ".ogm", ".mov", ".mkv", ".avi", ".wmv",  # video
}
class TranscriptionTaskFolderWatcher(QFileSystemWatcher):
preferences: FolderWatchPreferences
@ -34,9 +40,14 @@ class TranscriptionTaskFolderWatcher(QFileSystemWatcher):
if len(self.directories()) > 0:
self.removePaths(self.directories())
if preferences.enabled:
self.addPath(preferences.input_directory)
# Add the input directory and all subdirectories to the watcher
for dirpath, dirnames, _ in os.walk(preferences.input_directory):
# Skip hidden directories
dirnames[:] = [d for d in dirnames if not d.startswith(".")]
self.addPath(dirpath)
logging.debug(
'Watching for media files in "%s"', preferences.input_directory
'Watching for media files in "%s" and subdirectories',
preferences.input_directory,
)
def find_tasks(self):
@ -49,8 +60,18 @@ class TranscriptionTaskFolderWatcher(QFileSystemWatcher):
for dirpath, dirnames, filenames in os.walk(input_directory):
for filename in filenames:
file_path = os.path.join(dirpath, filename)
file_ext = os.path.splitext(filename)[1].lower()
# Check for temp conversion files (e.g., .ogg.wav)
name_without_ext = os.path.splitext(filename)[0]
secondary_ext = os.path.splitext(name_without_ext)[1].lower()
is_temp_conversion_file = secondary_ext in SUPPORTED_EXTENSIONS
if (
filename.startswith(".") # hidden files
or file_ext not in SUPPORTED_EXTENSIONS # non-media files
or is_temp_conversion_file # temp conversion files like .ogg.wav
or "_speech.mp3" in filename # extracted speech output files
or file_path in tasks # file already in tasks
or file_path in self.paths_emitted # file already emitted
):
@ -70,16 +91,34 @@ class TranscriptionTaskFolderWatcher(QFileSystemWatcher):
ModelDownloader(model=transcription_options.model).run()
model_path = transcription_options.model.get_local_model_path()
# Preserve subdirectory structure in output directory
relative_path = os.path.relpath(dirpath, input_directory)
if relative_path == ".":
output_directory = self.preferences.output_directory
else:
output_directory = os.path.join(
self.preferences.output_directory, relative_path
)
# Create output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)
task = FileTranscriptionTask(
file_path=file_path,
original_file_path=file_path,
transcription_options=transcription_options,
file_transcription_options=file_transcription_options,
model_path=model_path,
output_directory=self.preferences.output_directory,
output_directory=output_directory,
source=FileTranscriptionTask.Source.FOLDER_WATCH,
delete_source_file=self.preferences.delete_processed_files,
)
self.task_found.emit(task)
self.paths_emitted.add(file_path)
# Don't traverse into subdirectories
break
# Filter out hidden directories and add new subdirectories to the watcher
dirnames[:] = [d for d in dirnames if not d.startswith(".")]
for dirname in dirnames:
subdir_path = os.path.join(dirpath, dirname)
if subdir_path not in self.directories():
self.addPath(subdir_path)

View file

@ -10,6 +10,7 @@ from typing import Optional
# This must be done before importing libraries that download from Hugging Face
try:
import certifi
os.environ.setdefault('REQUESTS_CA_BUNDLE', certifi.where())
os.environ.setdefault('SSL_CERT_FILE', certifi.where())
os.environ.setdefault('SSL_CERT_DIR', os.path.dirname(certifi.where()))
# Also update the default SSL context for urllib
@ -44,23 +45,63 @@ from buzz.settings.settings import Settings
from buzz.widgets.line_edit import LineEdit
from buzz.transcriber.transcriber import Segment
from ctc_forced_aligner.ctc_forced_aligner import (
generate_emissions,
get_alignments,
get_spans,
load_alignment_model,
postprocess_results,
preprocess_text,
)
from whisper_diarization.helpers import (
get_realigned_ws_mapping_with_punctuation,
get_sentences_speaker_mapping,
get_words_speaker_mapping,
langs_to_iso,
punct_model_langs,
)
from deepmultilingualpunctuation.deepmultilingualpunctuation import PunctuationModel
from whisper_diarization.diarization import MSDDDiarizer
def process_in_batches(
    items,
    process_func,
    batch_size=200,
    chunk_size=230,
    smaller_batch_size=100,
    exception_types=(AssertionError,),
    **process_func_kwargs
):
    """Run *process_func* over *items* in batches, falling back to
    smaller batches when a batch raises.

    Generic batch driver for processing functions with chunk-size
    limitations: each batch is processed with
    ``chunk_size=min(chunk_size, len(batch))``; if one of
    *exception_types* is raised, the failing batch is retried in pieces
    of *smaller_batch_size*.

    Args:
        items: Sequence of items to process.
        process_func: Callable ``(batch, chunk_size=..., **kwargs)`` that
            returns a list of results for the batch.
        batch_size: Size of the initial batches (default 200).
        chunk_size: Upper bound for the chunk size passed to
            *process_func* (default 230).
        smaller_batch_size: Batch size used when retrying after a
            failure (default 100).
        exception_types: Exception classes that trigger the retry path
            (default ``(AssertionError,)``).
        **process_func_kwargs: Extra keyword arguments forwarded to
            *process_func*.

    Returns:
        The concatenated results from every batch.

    Example:
        >>> def my_predict(batch, chunk_size):
        ...     return [f"processed_{item}" for item in batch]
        >>> process_in_batches(items=["a", "b"], process_func=my_predict,
        ...                    batch_size=2)
        ['processed_a', 'processed_b']
    """
    def run_batch(batch):
        # Clamp the chunk size to the batch length, as the wrapped
        # predictors reject chunk sizes larger than their input.
        return process_func(
            batch,
            chunk_size=min(chunk_size, len(batch)),
            **process_func_kwargs,
        )

    collected = []
    for start in range(0, len(items), batch_size):
        batch = items[start:start + batch_size]
        try:
            collected.extend(run_batch(batch))
        except exception_types as error:
            # Retry the failing batch in smaller pieces.
            logging.warning(
                "Batch processing failed, trying smaller chunks: %s", error
            )
            for offset in range(0, len(batch), smaller_batch_size):
                piece = batch[offset:offset + smaller_batch_size]
                collected.extend(run_batch(piece))
    return collected
SENTENCE_END = re.compile(r'.*[.!?。!?]')
@ -109,6 +150,32 @@ class IdentificationWorker(QObject):
}
def run(self):
try:
from ctc_forced_aligner.ctc_forced_aligner import (
generate_emissions,
get_alignments,
get_spans,
load_alignment_model,
postprocess_results,
preprocess_text,
)
from whisper_diarization.helpers import (
get_realigned_ws_mapping_with_punctuation,
get_sentences_speaker_mapping,
get_words_speaker_mapping,
langs_to_iso,
punct_model_langs,
)
from deepmultilingualpunctuation.deepmultilingualpunctuation import PunctuationModel
from whisper_diarization.diarization import MSDDDiarizer
except ImportError as e:
logging.exception("Failed to import speaker identification libraries: %s", e)
self.error.emit(
_("Speaker identification is not available: failed to load required libraries.")
+ f"\n\n{e}"
)
return
diarizer_model = None
alignment_model = None
@ -130,7 +197,8 @@ class IdentificationWorker(QObject):
transcription_id=self.transcription.id_as_uuid
)
full_transcript = "".join(segment.text for segment in segments)
full_transcript = " ".join(segment.text for segment in segments)
full_transcript = re.sub(r' {2,}', ' ', full_transcript)
if self._is_cancelled:
logging.debug("Speaker identification worker: Cancelled at step 2")
@ -191,13 +259,15 @@ class IdentificationWorker(QObject):
return
self.progress_update.emit(_("4/8 Processing audio"))
logging.debug("Speaker identification worker: Generating emissions")
emissions, stride = generate_emissions(
alignment_model,
torch.from_numpy(audio_waveform)
.to(alignment_model.dtype)
.to(alignment_model.device),
batch_size=8,
batch_size=1 if device == "cpu" else 8,
)
logging.debug("Speaker identification worker: Emissions generated")
# Clean up alignment model
del alignment_model
@ -243,10 +313,14 @@ class IdentificationWorker(QObject):
logging.debug("Speaker identification worker: Creating diarizer model")
diarizer_model = MSDDDiarizer(device)
logging.debug("Speaker identification worker: Running diarization")
logging.debug("Speaker identification worker: Running diarization (this may take a while on CPU)")
speaker_ts = diarizer_model.diarize(torch.from_numpy(audio_waveform).unsqueeze(0))
logging.debug("Speaker identification worker: Diarization complete")
if self._is_cancelled:
logging.debug("Speaker identification worker: Cancelled after diarization")
return
# Clean up diarizer model immediately after use
del diarizer_model
diarizer_model = None
@ -267,7 +341,14 @@ class IdentificationWorker(QObject):
words_list = list(map(lambda x: x["word"], wsm))
labled_words = punct_model.predict(words_list, chunk_size=230)
# Process in batches to avoid chunk size errors
def predict_wrapper(batch, chunk_size, **kwargs):
return punct_model.predict(batch, chunk_size=chunk_size)
labled_words = process_in_batches(
items=words_list,
process_func=predict_wrapper
)
ending_puncts = ".?!。!?"
model_puncts = ".,;:!?。!?"
@ -378,6 +459,11 @@ class SpeakerIdentificationWidget(QWidget):
self.step_1_button.setMinimumWidth(200)
self.step_1_button.clicked.connect(self.on_identify_button_clicked)
self.cancel_button = QPushButton(_("Cancel"))
self.cancel_button.setMinimumWidth(200)
self.cancel_button.setVisible(False)
self.cancel_button.clicked.connect(self.on_cancel_button_clicked)
# Progress container with label and bar
progress_container = QVBoxLayout()
@ -398,7 +484,10 @@ class SpeakerIdentificationWidget(QWidget):
self.step_1_row.addLayout(progress_container)
self.step_1_row.addWidget(self.step_1_button, alignment=Qt.AlignmentFlag.AlignTop)
button_container = QVBoxLayout()
button_container.addWidget(self.step_1_button)
button_container.addWidget(self.cancel_button)
self.step_1_row.addLayout(button_container)
step_1_layout.addLayout(self.step_1_row)
@ -463,6 +552,8 @@ class SpeakerIdentificationWidget(QWidget):
def on_identify_button_clicked(self):
self.step_1_button.setEnabled(False)
self.step_1_button.setVisible(False)
self.cancel_button.setVisible(True)
# Clean up any existing thread before starting a new one
self._cleanup_thread()
@ -482,18 +573,36 @@ class SpeakerIdentificationWidget(QWidget):
self.thread.start()
def on_cancel_button_clicked(self):
    """Handle cancel button click: stop the worker thread and reset the UI.

    The cancel button is disabled first so repeated clicks are ignored
    while the thread is being torn down; _cleanup_thread() may block
    briefly while the worker winds down.
    """
    logging.debug("Speaker identification: Cancel requested by user")
    self.cancel_button.setEnabled(False)
    self.progress_label.setText(_("Cancelling..."))
    self._cleanup_thread()
    self._reset_buttons()
    self.progress_label.setText(_("Cancelled"))
    self.progress_bar.setValue(0)
def _reset_buttons(self):
    """Restore the identify/cancel buttons to their idle state: the
    identify button visible and clickable, the cancel button hidden
    (but re-enabled for the next run)."""
    self.cancel_button.setVisible(False)
    self.cancel_button.setEnabled(True)
    self.step_1_button.setVisible(True)
    self.step_1_button.setEnabled(True)
def _on_thread_finished(self, result):
    """Handle thread completion and cleanup.

    Asks the worker thread to quit and waits up to 5 seconds for it to
    stop, then restores the buttons and forwards *result* to
    on_identification_finished.
    """
    logging.debug("Speaker identification: Thread finished")
    if self.thread is not None:
        self.thread.quit()
        self.thread.wait(5000)  # milliseconds; bounded so the UI can't hang forever
    self._reset_buttons()
    self.on_identification_finished(result)
def on_identification_error(self, error_message):
    """Handle an identification error: log it and reset the UI.

    Args:
        error_message: Human-readable description emitted by the worker.
    """
    # Lazy %-style args instead of an eager f-string, per logging
    # convention; the rendered message is unchanged.
    logging.error("Speaker identification error: %s", error_message)
    self._reset_buttons()
    self.progress_bar.setValue(0)
def on_progress_update(self, progress):

View file

@ -36,6 +36,9 @@ from buzz.widgets.preferences_dialog.models.file_transcription_preferences impor
SENTENCE_END = re.compile(r'.*[.!?。!?]')
# Languages that don't use spaces between words
NON_SPACE_LANGUAGES = {"zh", "ja", "th", "lo", "km", "my"}
class TranscriptionWorker(QObject):
finished = pyqtSignal(list)
@ -51,16 +54,23 @@ class TranscriptionWorker(QObject):
transcription_id=self.transcription.id_as_uuid
)
# Check if the language uses spaces between words
language = self.transcription.language or ""
is_non_space_language = language in NON_SPACE_LANGUAGES
# For non-space languages, don't add spaces between words
separator = "" if is_non_space_language else " "
segments = []
words = []
text = ""
for buzz_segment in buzz_segments:
words.append({
'word': buzz_segment.text + " ",
'word': buzz_segment.text + separator,
'start': buzz_segment.start_time / 100,
'end': buzz_segment.end_time / 100,
})
text += buzz_segment.text + " "
text += buzz_segment.text + separator
if SENTENCE_END.match(buzz_segment.text):
segments.append({
@ -70,6 +80,13 @@ class TranscriptionWorker(QObject):
words = []
text = ""
# Add any remaining words that weren't terminated by sentence-ending punctuation
if words:
segments.append({
'text': text,
'words': words
})
return {
'language': self.transcription.language,
'segments': segments
@ -153,6 +170,38 @@ class TranscriptionResizerWidget(QWidget):
layout = QFormLayout(self)
# Extend segment endings
extend_label = QLabel(_("Extend end time"), self)
font = extend_label.font()
font.setWeight(QFont.Weight.Bold)
extend_label.setFont(font)
layout.addRow(extend_label)
extend_group_box = QGroupBox(self)
extend_layout = QVBoxLayout(extend_group_box)
self.extend_row = QHBoxLayout()
self.extend_amount_label = QLabel(_("Extend endings by up to (seconds)"), self)
self.extend_amount_input = LineEdit("0.2", self)
self.extend_amount_input.setMaximumWidth(60)
self.extend_button = QPushButton(_("Extend endings"))
self.extend_button.clicked.connect(self.on_extend_button_clicked)
self.extend_row.addWidget(self.extend_amount_label)
self.extend_row.addWidget(self.extend_amount_input)
self.extend_row.addWidget(self.extend_button)
extend_layout.addLayout(self.extend_row)
layout.addRow(extend_group_box)
# Spacer
spacer1 = QSpacerItem(0, 10, QSizePolicy.Policy.Minimum, QSizePolicy.Policy.Fixed)
layout.addItem(spacer1)
# Resize longer subtitles
resize_label = QLabel(_("Resize Options"), self)
font = resize_label.font()
@ -182,12 +231,14 @@ class TranscriptionResizerWidget(QWidget):
resize_layout.addLayout(self.resize_row)
resize_group_box.setEnabled(self.transcription.word_level_timings != 1)
if self.transcription.word_level_timings == 1:
resize_group_box.setToolTip(_("Available only if word level timings were disabled during transcription"))
layout.addRow(resize_group_box)
# Spacer
spacer = QSpacerItem(0, 10, QSizePolicy.Policy.Minimum, QSizePolicy.Policy.Fixed)
layout.addItem(spacer)
spacer2 = QSpacerItem(0, 10, QSizePolicy.Policy.Minimum, QSizePolicy.Policy.Fixed)
layout.addItem(spacer2)
# Merge words into subtitles
merge_options_label = QLabel(_("Merge Options"), self)
@ -237,6 +288,8 @@ class TranscriptionResizerWidget(QWidget):
merge_options_layout.addLayout(self.merge_options_row)
merge_options_group_box.setEnabled(self.transcription.word_level_timings == 1)
if self.transcription.word_level_timings != 1:
merge_options_group_box.setToolTip(_("Available only if word level timings were enabled during transcription"))
layout.addRow(merge_options_group_box)
@ -292,6 +345,44 @@ class TranscriptionResizerWidget(QWidget):
if self.transcriptions_updated_signal:
self.transcriptions_updated_signal.emit(new_transcript_id)
def on_extend_button_clicked(self):
    """Create a copy of the transcription with every segment's end time
    extended by the amount entered in the UI, clamped so no segment
    runs past the start of the next one."""
    try:
        extend_seconds = float(self.extend_amount_input.text())
    except ValueError:
        extend_seconds = 0.2  # fall back to the default shown in the field

    # Convert seconds to the internal integer time unit
    # (assumed milliseconds here — matches the original conversion).
    extend_amount = int(extend_seconds * 1000)

    segments = self.transcription_service.get_transcription_segments(
        transcription_id=self.transcription.id_as_uuid
    )

    extended_segments = []
    last_index = len(segments) - 1
    for index, segment in enumerate(segments):
        candidate_end = segment.end_time + extend_amount
        if index < last_index:
            # Never overlap the following segment.
            candidate_end = min(candidate_end, segments[index + 1].start_time)
        extended_segments.append(
            Segment(
                start=segment.start_time,
                end=candidate_end,
                text=segment.text,
            )
        )

    new_transcript_id = self.transcription_service.copy_transcription(
        self.transcription.id_as_uuid
    )
    self.transcription_service.update_transcription_as_completed(
        new_transcript_id, extended_segments
    )
    if self.transcriptions_updated_signal:
        self.transcriptions_updated_signal.emit(new_transcript_id)
def on_merge_button_clicked(self):
self.new_transcript_id = self.transcription_service.copy_transcription(
self.transcription.id_as_uuid

View file

@ -0,0 +1,262 @@
import logging
import os
import platform
import subprocess
import tempfile
from typing import Optional
from PyQt6.QtCore import Qt, QUrl
from PyQt6.QtWidgets import QApplication
from PyQt6.QtGui import QIcon
from PyQt6.QtNetwork import QNetworkAccessManager, QNetworkRequest, QNetworkReply
from PyQt6.QtWidgets import (
QDialog,
QVBoxLayout,
QHBoxLayout,
QLabel,
QPushButton,
QProgressBar,
QMessageBox,
QWidget,
QTextEdit,
)
from buzz.__version__ import VERSION
from buzz.locale import _
from buzz.update_checker import UpdateInfo
from buzz.widgets.icon import BUZZ_ICON_PATH
class UpdateDialog(QDialog):
    """Dialog shown when a new version of Buzz is available.

    Displays the current and new version (plus release notes when present),
    downloads the platform installer file(s) listed in
    ``update_info.download_urls`` into a temporary directory, launches the
    first downloaded file, and quits the application so the installer can
    replace files. On platforms without automatic install support the
    downloaded file is kept and its location is shown to the user instead
    of quitting.
    """

    def __init__(
        self,
        update_info: UpdateInfo,
        network_manager: Optional[QNetworkAccessManager] = None,
        parent: Optional[QWidget] = None
    ):
        super().__init__(parent)
        self.update_info = update_info
        # Allow injection of a network manager (e.g. for tests); create a
        # default one owned by this dialog otherwise.
        if network_manager is None:
            network_manager = QNetworkAccessManager(self)
        self.network_manager = network_manager
        # In-flight reply for the file currently being downloaded.
        self._download_reply: Optional[QNetworkReply] = None
        # Paths of files already saved to the temporary directory.
        self._temp_file_paths: list = []
        # URLs still waiting to be downloaded (FIFO).
        self._pending_urls: list = []
        self._temp_dir: Optional[str] = None
        self._setup_ui()

    def _setup_ui(self):
        """Build the static dialog layout (header, versions, notes, progress)."""
        self.setWindowTitle(_("Update Available"))
        self.setWindowIcon(QIcon(BUZZ_ICON_PATH))
        self.setMinimumWidth(450)

        layout = QVBoxLayout(self)
        layout.setSpacing(16)

        # Header
        header_label = QLabel(
            _("A new version of Buzz is available!")
        )
        header_label.setStyleSheet("font-size: 16px; font-weight: bold;")
        layout.addWidget(header_label)

        # Version info: current version on the left, new version on the right.
        version_layout = QHBoxLayout()
        current_version_label = QLabel(_("Current version:"))
        current_version_value = QLabel(f"<b>{VERSION}</b>")
        new_version_label = QLabel(_("New version:"))
        new_version_value = QLabel(f"<b>{self.update_info.version}</b>")
        version_layout.addWidget(current_version_label)
        version_layout.addWidget(current_version_value)
        version_layout.addStretch()
        version_layout.addWidget(new_version_label)
        version_layout.addWidget(new_version_value)
        layout.addLayout(version_layout)

        # Release notes (optional, rendered as markdown in a read-only box).
        if self.update_info.release_notes:
            notes_label = QLabel(_("Release Notes:"))
            notes_label.setStyleSheet("font-weight: bold;")
            layout.addWidget(notes_label)
            notes_text = QTextEdit()
            notes_text.setReadOnly(True)
            notes_text.setMarkdown(self.update_info.release_notes)
            notes_text.setMaximumHeight(150)
            layout.addWidget(notes_text)

        # Progress bar (hidden until a download starts).
        self.progress_bar = QProgressBar()
        self.progress_bar.setVisible(False)
        self.progress_bar.setTextVisible(True)
        layout.addWidget(self.progress_bar)

        # Status label
        self.status_label = QLabel("")
        self.status_label.setAlignment(Qt.AlignmentFlag.AlignCenter)
        layout.addWidget(self.status_label)

        # Buttons
        button_layout = QVBoxLayout()
        self.download_button = QPushButton(_("Download and Install"))
        self.download_button.clicked.connect(self._on_download_clicked)
        self.download_button.setDefault(True)
        button_layout.addStretch()
        button_layout.addWidget(self.download_button)
        layout.addLayout(button_layout)

    def _on_download_clicked(self):
        """Starts downloading the installer files into a fresh temp directory."""
        if not self.update_info.download_urls:
            QMessageBox.warning(
                self,
                _("Error"),
                _("No download URL available for your platform.")
            )
            return
        self.download_button.setEnabled(False)
        self.progress_bar.setVisible(True)
        self.progress_bar.setValue(0)
        self._temp_file_paths = []
        self._pending_urls = list(self.update_info.download_urls)
        self._temp_dir = tempfile.mkdtemp()
        self._download_next_file()

    def _download_next_file(self):
        """Download the next file in the queue, or finish if the queue is empty."""
        if not self._pending_urls:
            self._all_downloads_finished()
            return
        url_str = self._pending_urls[0]
        # 1-based index of the file being downloaded, for the status message.
        file_index = len(self.update_info.download_urls) - len(self._pending_urls) + 1
        total_files = len(self.update_info.download_urls)
        self.status_label.setText(
            _("Downloading file {} of {}...").format(file_index, total_files)
        )
        url = QUrl(url_str)
        request = QNetworkRequest(url)
        self._download_reply = self.network_manager.get(request)
        self._download_reply.downloadProgress.connect(self._on_download_progress)
        self._download_reply.finished.connect(self._on_download_finished)

    def _on_download_progress(self, bytes_received: int, bytes_total: int):
        """Update the progress bar and status text during a download.

        bytes_total may be 0 or -1 when the server does not report a
        content length, in which case the progress display is left alone.
        """
        if bytes_total > 0:
            progress = int((bytes_received / bytes_total) * 100)
            self.progress_bar.setValue(progress)
            mb_received = bytes_received / (1024 * 1024)
            mb_total = bytes_total / (1024 * 1024)
            file_index = len(self.update_info.download_urls) - len(self._pending_urls) + 1
            total_files = len(self.update_info.download_urls)
            self.status_label.setText(
                _("Downloading file {} of {} ({:.1f} MB / {:.1f} MB)...").format(
                    file_index, total_files, mb_received, mb_total
                )
            )

    def _on_download_finished(self):
        """Handles download completion for one file.

        On network error the UI is reset; on success the payload is saved
        under its original filename (preserved from the URL so installers
        keep recognizable names) and the next queued download starts.
        """
        if self._download_reply is None:
            return
        if self._download_reply.error() != QNetworkReply.NetworkError.NoError:
            error_msg = self._download_reply.errorString()
            logging.error(f"Download failed: {error_msg}")
            QMessageBox.critical(
                self,
                _("Download Failed"),
                _("Failed to download the update: {}").format(error_msg)
            )
            self._reset_ui()
            self._download_reply.deleteLater()
            self._download_reply = None
            return
        data = self._download_reply.readAll().data()
        self._download_reply.deleteLater()
        self._download_reply = None
        url_str = self._pending_urls.pop(0)
        # Extract original filename from URL to preserve it
        original_filename = QUrl(url_str).fileName()
        if not original_filename:
            original_filename = f"download_{len(self._temp_file_paths)}"
        try:
            temp_path = os.path.join(self._temp_dir, original_filename)
            with open(temp_path, "wb") as f:
                f.write(data)
            self._temp_file_paths.append(temp_path)
            logging.info(f"File saved to: {temp_path}")
        except Exception as e:
            logging.error(f"Failed to save file: {e}")
            QMessageBox.critical(
                self,
                _("Error"),
                _("Failed to save the installer: {}").format(str(e))
            )
            self._reset_ui()
            return
        self._download_next_file()

    def _all_downloads_finished(self):
        """All files downloaded; show completion and run the installer."""
        self.progress_bar.setValue(100)
        self.status_label.setText(_("Download complete!"))
        self._run_installer()

    def _run_installer(self):
        """Run the first downloaded installer and quit Buzz on success.

        The app is only closed after an installer was actually launched;
        on platforms without auto-install support (e.g. Linux) the user is
        told where the downloaded file is and the app keeps running.
        """
        if not self._temp_file_paths:
            return
        installer_path = self._temp_file_paths[0]
        system = platform.system()
        launched = False
        try:
            if system == "Windows":
                # Launch via the shell file association instead of
                # Popen(list, shell=True), which mixes a list argv with a
                # shell invocation.
                os.startfile(installer_path)
                launched = True
            elif system == "Darwin":
                # Open the DMG file with the default handler.
                subprocess.Popen(["open", installer_path])
                launched = True
            else:
                # No automatic install on this platform; keep the file and
                # point the user at it rather than silently quitting.
                logging.warning(f"Automatic install not supported on {system}")
                QMessageBox.information(
                    self,
                    _("Download complete!"),
                    _("The update was downloaded to: {}").format(installer_path)
                )
        except Exception as e:
            logging.error(f"Failed to run installer: {e}")
            QMessageBox.critical(
                self,
                _("Error"),
                _("Failed to run the installer: {}").format(str(e))
            )
            return
        if launched:
            # Close the app so the installer can replace files
            self.accept()
            QApplication.quit()

    def _reset_ui(self):
        """Reset the UI to initial state after an error"""
        self.download_button.setEnabled(True)
        self.progress_bar.setVisible(False)
        self.status_label.setText("")

View file

@ -11,9 +11,7 @@ The models are stored:
- Mac OS: `~/Library/Caches/Buzz`
- Windows: `%USERPROFILE%\AppData\Local\Buzz\Buzz\Cache`
Paste the location in your file manager to access the models.
Since Version `1.3.4`, to get to the logs folder go to `Help -> About Buzz` and click on `Show logs` button.
Paste the location in your file manager to access the models or go to `Help -> Preferences -> Models` and click on `Show file location` button after downloading some model.
### 2. What can I try if the transcription runs too slowly?
@ -39,7 +37,7 @@ When choosing among large models consider the following. "Large" is the first re
In addition to choosing an appropriate model size you also can choose whisper type.
- **Whisper** is initial OpenAI implementation, it is accurate but slow and requires a lot of RAM.
- Faster **Whisper** is an optimized implementation, it is orders of magnitude faster than regular Whisper and requires less RAM. Use this option if you have an Nvidia GPU with at least 6GB of VRAM.
- **Faster Whisper** is an optimized implementation, it is orders of magnitude faster than regular Whisper and requires less RAM. Use this option if you have an Nvidia GPU with at least 6GB of VRAM.
- **Whisper.cpp** is an optimized C++ implementation; it is quite fast and efficient and will use any brand of GPU. Whisper.cpp is capable of running real-time transcription even on a modern laptop with an integrated GPU. It can also run on CPU only. Use this option if you do not have an Nvidia GPU.
- **HuggingFace** option is a `Transformers` implementation and is good in that it supports wide range of custom models that may be optimized for a particular language. This option also supports [MMS](https://ai.meta.com/blog/multilingual-model-speech-recognition/) family of models from Meta AI that support over 1000 of worlds languages as well as [PEFT](https://github.com/huggingface/peft) adjustments to Whisper models.
@ -67,7 +65,7 @@ Yes, Buzz can be used without internet connection if you download the necessary
If a model download was incomplete or corrupted, Buzz may crash. Try to delete the downloaded model files in `Help -> Preferences -> Models` and re-download them.
If that does not help, check the log file for errors and [report the issue](https://github.com/chidiwilliams/buzz/issues) so we can fix it. The log file is located in `~/Library/Logs/Buzz` (Mac OS) or `%USERPROFILE%\AppData\Local\Buzz\Buzz\Logs` (Windows). On Linux run the Buzz from the command line to see the relevant messages.
If that does not help, check the log file for errors and [report the issue](https://github.com/chidiwilliams/buzz/issues) so we can fix it. If possible attach the log file to the issue. Since Version `1.3.4`, to get to the logs folder go to `Help -> About Buzz` and click on `Show logs` button.
### 9. Where can I get latest development version?

View file

@ -12,33 +12,22 @@ OpenAI's [Whisper](https://github.com/openai/whisper).
![GitHub release (latest by date)](https://img.shields.io/github/v/release/chidiwilliams/buzz)
[![Github all releases](https://img.shields.io/github/downloads/chidiwilliams/buzz/total.svg)](https://GitHub.com/chidiwilliams/buzz/releases/)
> Buzz is better on the App Store. Get a Mac-native version of Buzz with a cleaner look, audio playback, drag-and-drop import, transcript editing, search, and much more.
> <a href="https://apps.apple.com/us/app/buzz-captions/id6446018936?mt=12&amp;itsct=apps_box_badge&amp;itscg=30200"><img src="https://tools.applemediaservices.com/api/badges/download-on-the-mac-app-store/black/en-us?size=250x83&amp;releaseDate=1679529600" alt="Download on the Mac App Store" /></a>
## Features
- Import audio and video files and export transcripts to TXT, SRT, and
VTT ([Demo](https://www.loom.com/share/cf263b099ac3481082bb56d19b7c87fe))
- Transcription and translation from your computer's microphones to text (Resource-intensive and may not be
real-time, [Demo](https://www.loom.com/share/564b753eb4d44b55b985b8abd26b55f7))
- **Advanced Transcription Viewer** with search, playback controls, and speed adjustment
- Presentation window for easy accessibility during events and presentations
- [Realtime translation](https://chidiwilliams.github.io/buzz/docs/usage/translations) with OpenAI API compatible AI
- [Advanced Transcription Viewer](https://chidiwilliams.github.io/buzz/docs/usage/transcription_viewer) with search, playback controls, and speed adjustment
- **Smart Interface** with conditional visibility and state persistence
- **Professional Controls** including loop segments, follow audio, and keyboard shortcuts
- Supports [Whisper](https://github.com/openai/whisper#available-models-and-languages),
[Whisper.cpp](https://github.com/ggerganov/whisper.cpp), [Faster Whisper](https://github.com/guillaumekln/faster-whisper),
[Whisper.cpp](https://github.com/ggerganov/whisper.cpp) (with Vulkan GPU acceleration), [Faster Whisper](https://github.com/guillaumekln/faster-whisper),
[Whisper-compatible Hugging Face models](https://huggingface.co/models?other=whisper), and
the [OpenAI Whisper API](https://platform.openai.com/docs/api-reference/introduction)
- [Command-Line Interface](#command-line-interface)
- Available on Mac, Windows, and Linux
## Transcription Viewer
Buzz features a powerful transcription viewer that makes it easy to work with your transcriptions:
- **🔍 Smart Search**: Find text quickly with real-time search and navigation
- **🎵 Playback Controls**: Loop segments, follow audio, and adjust playback speed
- **⌨️ Keyboard Shortcuts**: Efficient navigation with Ctrl+F, Ctrl+L, and more
- **🎨 Clean Interface**: Conditional visibility keeps the interface uncluttered
- **💾 State Persistence**: Remembers your preferences between sessions
[Learn more about the Transcription Viewer →](https://chidiwilliams.github.io/buzz/docs/usage/transcription_viewer)
- Speech separation before transcription for better accuracy on noisy audio
- [Speaker identification](https://chidiwilliams.github.io/buzz/docs/usage/speaker_identification) in transcribed media
- Available on Mac, Windows, and Linux

View file

@ -3,8 +3,8 @@ title: Installation
sidebar_position: 2
---
To install Buzz, download the [latest version](https://github.com/chidiwilliams/buzz/releases/latest) for your operating
system. Buzz is available on **Mac** (Intel), **Windows**, and **Linux**.
To install Buzz, download the latest version for your operating
system. Buzz is available on **Mac** (Intel and Apple silicon), **Windows**, and **Linux**.
### macOS
@ -25,6 +25,8 @@ To install flatpak, run:
flatpak install flathub io.github.chidiwilliams.Buzz
```
[![Download on Flathub](https://flathub.org/api/badge?svg&locale=en)](https://flathub.org/en/apps/io.github.chidiwilliams.Buzz)
To install snap, run:
```shell
sudo apt-get install libportaudio2 libcanberra-gtk-module libcanberra-gtk3-module
@ -34,15 +36,15 @@ sudo snap connect buzz:password-manager-service
[![Get it from the Snap Store](https://snapcraft.io/static/images/badges/en/snap-store-black.svg)](https://snapcraft.io/buzz)
Alternatively, on Ubuntu 20.04 and later, install the dependencies:
```shell
sudo apt-get install libportaudio2
```
## PyPI
```shell
pip install buzz-captions
python -m buzz
```
On Linux install system dependencies you may be missing
```
sudo apt-get install --no-install-recommends libyaml-dev libtbb-dev libxkbcommon-x11-0 libxcb-icccm4 libxcb-image0 libxcb-keysyms1 libxcb-randr0 libxcb-render-util0 libxcb-xinerama0 libxcb-shape0 libxcb-cursor0 libportaudio2 gettext libpulse0 ffmpeg
```
On versions prior to Ubuntu 24.04 install `sudo apt-get install --no-install-recommends libegl1-mesa`

View file

@ -16,11 +16,11 @@ title: File Import
To reduce misspellings you can pass some commonly misspelled words in an `Initial prompt` that is available under `Advanced...` button. See this [guide on prompting](https://cookbook.openai.com/examples/whisper_prompting_guide#pass-names-in-the-prompt-to-prevent-misspellings).
| Field | Options | Default | Description |
| ------------------ | ------------------- | ------- |---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Export As | "TXT", "SRT", "VTT" | "TXT" | Export file format |
| Word-Level Timings | Off / On | Off | If checked, the transcription will generate a separate subtitle line for each word in the audio. Combine words into subtitles afterwards with the [resize option](https://chidiwilliams.github.io/buzz/docs/usage/edit_and_resize). |
| Extract speech | Off / On | Off | If checked, speech will be extracted to a separate audio tack to improve accuracy. Available since 1.3.0. |
| Field | Options | Default | Description |
| ------------------ | ------------------- | ------- |--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Export As | "TXT", "SRT", "VTT" | "TXT" | Export file format |
| Word-Level Timings | Off / On | Off | If checked, the transcription will generate a separate subtitle line for each word in the audio. Combine words into subtitles afterwards with the [resize option](https://chidiwilliams.github.io/buzz/docs/usage/edit_and_resize). |
| Extract speech     | Off / On            | Off     | If checked, speech will be extracted to a separate audio track to improve accuracy.                                                                                                                                                    |
(See the [Live Recording section](https://chidiwilliams.github.io/buzz/docs/usage/live_recording) for more information about the task, language, and quality settings.)

View file

@ -8,7 +8,7 @@ To start a live recording:
- Click Record.
> **Note:** Transcribing audio using the default Whisper model is resource-intensive. Consider using the Whisper.cpp.
> Since 1.3.0 it supports GPU acceleration, if the model fits in GPU memory. Use smaller models for real-time performance.
> It supports GPU acceleration, if the model fits in GPU memory. Use smaller models for real-time performance.
| Field | Options | Default | Description |
|------------|------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
@ -18,6 +18,19 @@ To start a live recording:
[![Live Recording on Buzz](https://cdn.loom.com/sessions/thumbnails/564b753eb4d44b55b985b8abd26b55f7-with-play.gif)](https://www.loom.com/share/564b753eb4d44b55b985b8abd26b55f7 "Live Recording on Buzz")
#### Advanced preferences
**Silence threshold** Sets the volume threshold for a transcription to be processed. If the average volume level is below this setting the sentence will not be transcribed. Available since 1.4.4.
**Line separator** Marking to add to the transcription and translation lines. Default value is two new lines (`\n\n`) that result in an empty space between translation or transcription lines. To have no empty line use `\n`. Available since 1.4.4.
**Transcription step** If live recording mode is set to `Append and correct`, you can also set a transcription step. Shorter steps will reduce latency but cause larger load on the system. Monitor the `Queue` while transcribing in this mode, if it grows too much, increase the transcription step, to reduce load. Available since 1.4.4.
**Hide unconfirmed** If live recording mode is set to `Append and correct`, you can also hide the unconfirmed part of the last transcript. This part may be incorrect as Buzz has seen it only in one overlapping transcription segment. Hiding it will increase latency, but the result will show only the confirmed transcripts. Available since 1.4.4.
#### Presentation Window
Buzz has an easy to use presentation window you can use to show live transcriptions during events and presentations. To open it start the recording and new options for the `Presentation window` will appear.
### Record audio playing from computer (macOS)
To record audio playing from an application on your computer, you may install an audio loopback driver (a program that

View file

@ -2,7 +2,7 @@
title: Translations
---
Default `Translation` task uses Whisper model ability to translate to English, however `Large-V3-Turbo` is not compatible with this standard. Since version `1.0.0` Buzz supports additional AI translations to any other language.
Default `Translation` task uses Whisper model ability to translate to English, however `Large-V3-Turbo` is not compatible with this standard. Buzz supports additional AI translations to any other language.
To use the translation feature you will need to configure an OpenAI API key and translation settings. Set the OpenAI API key in Preferences. Buzz also supports custom locally running translation AIs that support the OpenAI API. For more information on locally running AIs see [ollama](https://ollama.com/blog/openai-compatibility) or [LM Studio](https://lmstudio.ai/). For information on available custom APIs see this [discussion thread](https://github.com/chidiwilliams/buzz/discussions/827).

View file

@ -8,4 +8,6 @@ When transcript of some audio or video file is generated you can edit it and exp
Transcription view screen has an option to resize the transcripts. Click on the "Resize" button to see available options. Transcripts that have been generated **with word-level timings** setting enabled can be combined into subtitles specifying different options, like maximum length of a subtitle and if subtitles should be split on punctuation. Transcripts that have been generated **without word-level timings** setting enabled can only be recombined specifying the desired max length of a subtitle.
If audio file is still present on the system word-level timing merge will also analyze the audio for silences to improve subtitle accuracy. Subtitle generation from transcripts with word-level timings is available since version 1.3.0.
If audio file is still present on the system word-level timing merge will also analyze the audio for silences to improve subtitle accuracy.
The resize tool also has an option to extend end time of segments if you want the subtitles to be on the screen for longer. You can specify the amount of time in seconds to extend each subtitle segment. Buzz will add this amount of time to the end of each subtitle segment making sure that the end of a segment does not go over start of the next segment. This feature is available since 1.4.3.

View file

@ -6,4 +6,4 @@ When transcript of some audio or video file is generated you can identify speake
Transcription view screen has an option to identify speakers. Click on the "Identify speakers" button to see available options.
If audio file is still present on the system speaker identification will mark each speakers sentences with appropriate label. You can preview 10 seconds of some random sentence of the identified speaker and rename the automatically identified label to speakers real name. If "Merge speaker sentences" checkbox is selected when you save the speaker labels, all consecutive sentences of the same speaker will be merged into one segment. Speaker identification is available since version 1.4.0 on all platforms except Intel macOS.
If audio file is still present on the system speaker identification will mark each speakers sentences with appropriate label. You can preview 10 seconds of some random sentence of the identified speaker and rename the automatically identified label to speakers real name. If "Merge speaker sentences" checkbox is selected when you save the speaker labels, all consecutive sentences of the same speaker will be merged into one segment. Speaker identification is not available on Intel macOS.

View file

@ -9,7 +9,7 @@ The transcription viewer is organized into several key sections:
- **Top Toolbar**: Contains view mode, export, translate, resize, and search
- **Search Bar**: Find and navigate through transcript text
- **Transcription Segments**: Table view of all transcription segments with timestamps
- **Playback Controls**: Audio playback settings and speed controls (since version 1.3.0)
- **Playback Controls**: Audio playback settings and speed controls
- **Audio Player**: Standard media player with progress bar
- **Current Segment Display**: Shows the currently selected or playing segment
@ -37,25 +37,21 @@ The transcription viewer is organized into several key sections:
- **More information**: See [Edit and Resize](https://chidiwilliams.github.io/buzz/docs/usage/edit_and_resize) section
### Playback Controls Button
(since version 1.3.0)
- **Function**: Show/hide playback control panel
- **Shortcut**: `Ctrl+Alt+P` (Windows/Linux) or `Cmd+Alt+P` (macOS)
- **Behavior**: Toggle button that shows/hides the playback controls below
### Find Button
(since version 1.3.0)
- **Function**: Show/hide search functionality
- **Shortcut**: `Ctrl+F` (Windows/Linux) or `Cmd+F` (macOS)
- **Behavior**: Toggle button that shows/hides the search bar
### Scroll to Current Button
(since version 1.3.0)
- **Function**: Automatically scroll to the currently playing text
- **Shortcut**: `Ctrl+G` (Windows/Linux) or `Cmd+G` (macOS)
- **Usage**: Click to jump to the current audio position in the transcript
## Search Functionality
(since version 1.3.0)
### Search Bar
The search bar appears below the toolbar when activated and provides:
@ -80,7 +76,6 @@ The search bar appears below the toolbar when activated and provides:
- **Cross-view Search**: Works in all view modes (Timestamps, Text, Translation)
## Playback Controls
(since version 1.3.0)
### Loop Segment
- **Function**: Automatically loop playback of selected segments
@ -105,7 +100,6 @@ The search bar appears below the toolbar when activated and provides:
- **Button Sizing**: Speed control buttons match the size of search navigation buttons for visual consistency
## Keyboard Shortcuts
(since version 1.3.0)
### Audio Playback
- **`Ctrl+P` / `Cmd+P`**: Play/Pause audio

View file

@ -11,11 +11,6 @@ sidebar_position: 1
![GitHub release (latest by date)](https://img.shields.io/github/v/release/chidiwilliams/buzz)
[![Github all releases](https://img.shields.io/github/downloads/chidiwilliams/buzz/total.svg)](https://GitHub.com/chidiwilliams/buzz/releases/)
<blockquote>
<p>在 App Store 下载运行的性能更佳。 获得外观更整洁、音频播放、拖放导入、转录编辑、搜索等功能的原生Mac版本。</p>
<a href="https://apps.apple.com/cn/app/buzz-captions/id6446018936?mt=12&amp;itsct=apps_box_badge&amp;itscg=30200"><img src="https://toolbox.marketingtools.apple.com/api/badges/download-on-the-mac-app-store/black/zh-cn?size=250x83" alt="在 Mac App Store 下载" /></a>
</blockquote>
## 功能
- 导入音频和视频文件,并将转录内容导出为 TXT、SRT 和 VTT 格式([演示](https://www.loom.com/share/cf263b099ac3481082bb56d19b7c87fe)

View file

@ -3,7 +3,7 @@ title: 安装
sidebar_position: 2
---
要安装 Buzz请下载适用于您操作系统的[最新版本](https://github.com/chidiwilliams/buzz/releases/latest)。Buzz 支持 **Mac**Intel、**Windows** 和 **Linux** 系统。(对于 Apple Silicon 用户,请参阅 [App Store 版本](https://apps.apple.com/us/app/buzz-captions/id6446018936?mt=12&itsct=apps_box_badge&itscg=30200)。)
要安装 Buzz请下载适用于您操作系统的[最新版本](https://github.com/chidiwilliams/buzz/releases/latest)。Buzz 支持 **Mac**Intel、**Windows** 和 **Linux** 系统。
## macOSIntelmacOS 11.7 及更高版本)
@ -15,8 +15,7 @@ brew install --cask buzz
或者,下载并运行 `Buzz-x.y.z.dmg` 文件。
对于 Mac Silicon 用户(以及希望在 Mac Intel 上获得更好体验的用户),
请从 App Store 下载 [Buzz Captions](https://apps.apple.com/us/app/buzz-captions/id6446018936?mt=12&itsct=apps_box_badge&itscg=30200)。
对于 Mac Silicon 用户(以及希望在 Mac Intel 上获得更好体验的用户)。
## WindowsWindows 10 及更高版本)

View file

@ -82,6 +82,42 @@ class CustomBuildHook(BuildHookInterface):
# Build ctc_forced_aligner C++ extension in-place
print("Building ctc_forced_aligner C++ extension...")
ctc_aligner_dir = project_root / "ctc_forced_aligner"
# Apply local patches before building.
# Uses --check first to avoid touching the working tree unnecessarily,
# which is safer in a detached-HEAD submodule.
patches_dir = project_root / "patches"
for patch_file in sorted(patches_dir.glob("ctc_forced_aligner_*.patch")):
# Dry-run forward: succeeds only if patch is NOT yet applied.
check_forward = subprocess.run(
["git", "apply", "--check", "--ignore-whitespace", str(patch_file)],
cwd=ctc_aligner_dir,
capture_output=True,
text=True,
)
if check_forward.returncode == 0:
# Patch can be applied — do it for real.
subprocess.run(
["git", "apply", "--ignore-whitespace", str(patch_file)],
cwd=ctc_aligner_dir,
check=True,
capture_output=True,
text=True,
)
print(f"Applied patch: {patch_file.name}")
else:
# Dry-run failed — either already applied or genuinely broken.
check_reverse = subprocess.run(
["git", "apply", "--check", "--reverse", "--ignore-whitespace", str(patch_file)],
cwd=ctc_aligner_dir,
capture_output=True,
text=True,
)
if check_reverse.returncode == 0:
print(f"Patch already applied (skipping): {patch_file.name}")
else:
print(f"WARNING: could not apply patch {patch_file.name}: {check_forward.stderr}", file=sys.stderr)
result = subprocess.run(
[sys.executable, "setup.py", "build_ext", "--inplace"],
cwd=ctc_aligner_dir,
@ -116,27 +152,29 @@ class CustomBuildHook(BuildHookInterface):
else:
print(f"Warning: {whisper_cpp_dir} does not exist after build", file=sys.stderr)
# Force include all files in demucs directory
demucs_dir = project_root / "demucs_repo"
if demucs_dir.exists():
# Get all files in the demucs directory
demucs_files = glob.glob(str(demucs_dir / "**" / "*"), recursive=True)
# Force include demucs package at top level (demucs_repo/demucs -> demucs/)
demucs_pkg_dir = project_root / "demucs_repo" / "demucs"
if demucs_pkg_dir.exists():
# Get all files in the demucs package directory
demucs_files = glob.glob(str(demucs_pkg_dir / "**" / "*"), recursive=True)
# Filter only files (not directories)
demucs_files = [f for f in demucs_files if Path(f).is_file()]
# Add them to force_include
# Add them to force_include, mapping to top-level demucs/
if 'force_include' not in build_data:
build_data['force_include'] = {}
for file_path in demucs_files:
# Convert to relative path from project root
rel_path = Path(file_path).relative_to(project_root)
build_data['force_include'][str(rel_path)] = str(rel_path)
# Convert to relative path from demucs package dir
rel_from_pkg = Path(file_path).relative_to(demucs_pkg_dir)
# Target path is demucs/<relative_path>
target_path = Path("demucs") / rel_from_pkg
build_data['force_include'][str(file_path)] = str(target_path)
print(f"Force including {len(demucs_files)} files from demucs_repo/")
print(f"Force including {len(demucs_files)} files from demucs_repo/demucs/ -> demucs/")
else:
print(f"Warning: {demucs_dir} does not exist", file=sys.stderr)
print(f"Warning: {demucs_pkg_dir} does not exist", file=sys.stderr)
# Force include all .mo files from buzz/locale directory
locale_dir = project_root / "buzz" / "locale"

View file

@ -0,0 +1,16 @@
diff --git a/setup.py b/setup.py
index de84a25..386f662 100644
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,10 @@ ext_modules = [
Pybind11Extension(
"ctc_forced_aligner.ctc_forced_aligner",
["ctc_forced_aligner/forced_align_impl.cpp"],
- extra_compile_args=["/O2"] if sys.platform == "win32" else ["-O3"],
+ # /D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR prevents MSVC runtime mutex
+ # static-initializer crash on newer GitHub Actions Windows runners.
+ # See: https://github.com/actions/runner-images/issues/10004
+ extra_compile_args=["/O2", "/D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR"] if sys.platform == "win32" else ["-O3"],
)
]

View file

@ -1,11 +1,12 @@
[project]
name = "buzz-captions"
# Change also in Makefile and buzz/__version__.py
version = "1.4.0"
version = "1.4.4"
description = ""
authors = [{ name = "Chidi Williams", email = "williamschidi1@gmail.com" }]
requires-python = ">=3.12,<3.13"
readme = "README.md"
# License format change to remove warning in PyPI will cause snap not to build
license = { text = "MIT" }
dependencies = [
"sounddevice>=0.5.3,<0.6",
@ -19,7 +20,7 @@ dependencies = [
"dataclasses-json>=0.6.4,<0.7",
"numpy>=1.21.2,<2",
"requests>=2.31.0,<3",
"yt-dlp>=2025.11.12,<2026",
"yt-dlp>=2026.2.21",
"stable-ts>=2.19.1,<3",
"faster-whisper>=1.2.1,<2",
"openai-whisper==20250625",
@ -45,10 +46,7 @@ dependencies = [
"ctranslate2>=4.6.2,<5; sys_platform != 'darwin'",
# faster whisper need cudnn 9
"nvidia-cudnn-cu12>=9,<10; sys_platform != 'darwin'",
# CUDA runtime libraries for Windows (Linux gets them via torch dependencies)
"nvidia-cuda-runtime-cu12>=12.9,<13; sys_platform == 'win32'",
"nvidia-cublas-cu12>=12.9,<13; sys_platform == 'win32'",
"nvidia-cuda-nvrtc-cu12>=12.9,<13; sys_platform == 'win32'",
# CUDA runtime libraries are provided by torch dependencies, no need to specify explicitly
"darkdetect>=0.8.0,<0.9",
"dora-search>=0.1.12,<0.2",
"diffq>=0.2.4,<0.3",
@ -70,6 +68,7 @@ dependencies = [
"posthog>=3.23.0,<4",
# This version works, newer have issues on Windows
"onnxruntime==1.18.1",
"onnx>=1.20.0", # Required for nemo-toolkit, ensures ml-dtypes is installed
"vulkan>=1.3.275.1,<2",
"hf-xet>=1.1.5,<2",
"hatchling>=1.28.0",
@ -80,7 +79,7 @@ dependencies = [
"uroman>=1.3.1.1",
"lhotse==1.32.1",
"coverage==7.12.0",
"demucs",
# demucs is bundled directly in the wheel from demucs_repo/, not installed as a dependency
"certifi==2025.11.12",
"torchcodec>=0.9.0; sys_platform != 'darwin' or platform_machine != 'x86_64'",
"torch>=2.2.2",
@ -131,7 +130,6 @@ override-dependencies = [
]
[tool.uv.sources]
demucs = { path = "demucs_repo", editable = true }
torch = [
{ index = "PyPI", marker = "sys_platform == 'darwin'" },
{ index = "pytorch-cu129", marker = "sys_platform != 'darwin'" },
@ -173,18 +171,27 @@ include = [
"buzz",
"buzz/whisper_cpp/*",
"buzz/locale/*/LC_MESSAGES/buzz.mo",
"demucs_repo",
"whisper_diarization",
"deepmultilingualpunctuation",
"ctc_forced_aligner",
]
# Map demucs_repo/demucs to top-level demucs/ so 'import demucs' works
sources = {"demucs_repo/demucs" = "demucs"}
[tool.hatch.build.hooks.custom]
[build-system]
requires = ["hatchling", "cmake>=4.2.0,<5", "polib>=1.2.0,<2", "pybind11", "setuptools>=42"]
requires = ["hatchling", "cmake>=4.2.0,<5", "polib>=1.2.0,<2", "pybind11", "setuptools>=80.9.0"]
build-backend = "hatchling.build"
[tool.coverage.report]
exclude_also = [
"if sys.platform == \"win32\":",
"if platform.system\\(\\) == \"Windows\":",
"if platform.system\\(\\) == \"Linux\":",
"if platform.system\\(\\) == \"Darwin\":",
]
[tool.ruff]
exclude = [
"**/whisper.cpp",

View file

@ -2,7 +2,7 @@
# Buzz
[项目文档](https://chidiwilliams.github.io/buzz/zh/docs) | [苹果应用商店的 Buzz Captions 页面](https://apps.apple.com/us/app/buzz-captions/id6446018936?mt=12&itsct=apps_box_badge&itscg=30200)
[项目文档](https://chidiwilliams.github.io/buzz/zh/docs)
在个人电脑上离线转录和翻译音频。技术模型来源 OpenAI [Whisper](https://github.com/openai/whisper).
@ -12,13 +12,6 @@
![GitHub release (latest by date)](https://img.shields.io/github/v/release/chidiwilliams/buzz)
[![Github all releases](https://img.shields.io/github/downloads/chidiwilliams/buzz/total.svg)](https://GitHub.com/chidiwilliams/buzz/releases/)
<blockquote>
<p>在 App Store 下载运行的性能更佳。 获得外观更整洁、音频播放、拖放导入、转录编辑、搜索等功能的原生Mac版本。</p>
<a href="https://apps.apple.com/cn/app/buzz-captions/id6446018936?mt=12&amp;itsct=apps_box_badge&amp;itscg=30200"><img src="https://toolbox.marketingtools.apple.com/api/badges/download-on-the-mac-app-store/black/zh-cn?size=250x83" alt="Download on the Mac App Store" /></a>
</blockquote>
![Buzz](../buzz/assets/buzz-banner.jpg)
## 安装
**PyPI**:
@ -53,7 +46,6 @@ brew install --cask buzz
```shell
sudo apt-get install libportaudio2 libcanberra-gtk-module libcanberra-gtk3-module
sudo snap install buzz
sudo snap connect buzz:password-manager-service
```
### 最新开发者版本

View file

@ -27,6 +27,7 @@
<url type="bugtracker">https://github.com/chidiwilliams/buzz/issues</url>
<url type="homepage">https://github.com/chidiwilliams/buzz</url>
<url type="faq">https://chidiwilliams.github.io/buzz/docs</url>
<url type="vcs-browser">https://github.com/chidiwilliams/buzz</url>
<branding>
<color type="primary" scheme_preference="light">#f66151</color>
@ -64,8 +65,28 @@
<content_rating type="oars-1.1"/>
<releases>
<release version="1.4.0" date="2025-12-30">
<url type="details">https://github.com/chidiwilliams/buzz/releases/tag/v1.4.0</url>
<release version="1.4.4" date="2026-03-08">
<url type="details">https://github.com/chidiwilliams/buzz/releases/tag/v1.4.4</url>
<description>
<p>Bug fixes and minor improvements.</p>
<ul>
<li>Fixed YouTube link downloading</li>
<li>Added option to import folder</li>
<li>Extra settings for live recordings</li>
<li>Adjusted live recording batching process to avoid min-word cuts</li>
<li>Added an update checker for Windows and macOS</li>
<li>Added voice activity detection to whisper.cpp</li>
</ul>
</description>
</release>
<release version="1.4.3" date="2026-01-26">
<url type="details">https://github.com/chidiwilliams/buzz/releases/tag/v1.4.3</url>
<description>
<p>Fixed support for whisper.cpp on older CPUs and issues in speaker identification.</p>
</description>
</release>
<release version="1.4.2" date="2026-01-03">
<url type="details">https://github.com/chidiwilliams/buzz/releases/tag/v1.4.2</url>
<description>
<p>Adds speaker identification on transcriptions, video support in the transcription viewer, improvements to the transcription table, support for over 1000 of the world's languages via MMS models, and a separate window to show live transcripts on a projector.</p>
<p>Release details:</p>

View file

@ -50,8 +50,23 @@ parts:
prime:
- etc/asound.conf
buzz:
portaudio:
after: [ alsa-pulseaudio ]
plugin: autotools
source: https://files.portaudio.com/archives/pa_stable_v190700_20210406.tgz
build-packages:
- libasound2-dev
- libpulse-dev
autotools-configure-parameters:
- --enable-shared
- --disable-static
stage:
- usr/local/lib/libportaudio*
prime:
- usr/local/lib/libportaudio*
buzz:
after: [ alsa-pulseaudio, portaudio ]
plugin: uv
source: .
build-snaps:
@ -78,9 +93,8 @@ parts:
- libproxy1v5
# Audio
- ffmpeg
- libportaudio2
- libpulse0
- libasound2
- libasound2t64
- libasound2-dev
- libasound2-plugins
- libasound2-plugins-extra
@ -115,6 +129,12 @@ parts:
# Clean caches
uv cache clean
# Create launcher wrapper to ensure the snap's own portaudio and libasound are found
# before gnome content snap libraries (which desktop-launch prepends to LD_LIBRARY_PATH)
mkdir -p $CRAFT_PART_INSTALL/bin
printf '#!/bin/sh\nexport LD_LIBRARY_PATH="$SNAP/usr/local/lib:$SNAP/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH"\nexec "$SNAP/bin/python" -m buzz "$@"\n' > $CRAFT_PART_INSTALL/bin/buzz-launcher
chmod +x $CRAFT_PART_INSTALL/bin/buzz-launcher
# Copy source files
cp -r $CRAFT_PART_BUILD/buzz $CRAFT_PART_INSTALL/
cp -r $CRAFT_PART_BUILD/ctc_forced_aligner $CRAFT_PART_INSTALL/
@ -148,11 +168,11 @@ apps:
- gnome
command-chain:
- bin/gpu-2404-wrapper
command: snap/command-chain/desktop-launch $SNAP/bin/python -m buzz
command: snap/command-chain/desktop-launch $SNAP/bin/buzz-launcher
desktop: usr/share/applications/buzz.desktop
environment:
PATH: $SNAP/usr/bin:$SNAP/bin:$PATH
LD_LIBRARY_PATH: $SNAP/lib/python3.12/site-packages/nvidia/cudnn/lib:$SNAP/lib/python3.12/site-packages/PyQt6:$SNAP/lib/python3.12/site-packages/PyQt6/Qt6/lib:$SNAP/usr/lib/$CRAFT_ARCH_TRIPLET_BUILD_FOR/lapack:$SNAP/usr/lib/$CRAFT_ARCH_TRIPLET_BUILD_FOR/blas:$SNAP/usr/lib/$CRAFT_ARCH_TRIPLET_BUILD_FOR/oss4-libsalsa:$SNAP/usr/lib/$CRAFT_ARCH_TRIPLET_BUILD_FOR/libproxy:$SNAP:$LD_LIBRARY_PATH
LD_LIBRARY_PATH: $SNAP/usr/local/lib:$SNAP/lib/python3.12/site-packages/nvidia/cudnn/lib:$SNAP/lib/python3.12/site-packages/PyQt6:$SNAP/lib/python3.12/site-packages/PyQt6/Qt6/lib:$SNAP/usr/lib/$CRAFT_ARCH_TRIPLET_BUILD_FOR/lapack:$SNAP/usr/lib/$CRAFT_ARCH_TRIPLET_BUILD_FOR/blas:$SNAP/usr/lib/$CRAFT_ARCH_TRIPLET_BUILD_FOR/oss4-libsalsa:$SNAP/usr/lib/$CRAFT_ARCH_TRIPLET_BUILD_FOR/libproxy:$SNAP/usr/lib/$CRAFT_ARCH_TRIPLET_BUILD_FOR/alsa-lib:$SNAP:$LD_LIBRARY_PATH
PYTHONPATH: $SNAP:$SNAP/lib/python3.12/site-packages/PyQt6:$SNAP/lib/python3.12/site-packages/PyQt6/Qt6/lib:$SNAP/usr/lib/python3/dist-packages:$SNAP/usr/lib/python3.12/site-packages:$SNAP/usr/local/lib/python3.12/dist-packages:$SNAP/usr/lib/python3.12/dist-packages:$PYTHONPATH
QT_MEDIA_BACKEND: ffmpeg
PULSE_LATENCY_MSEC: "30"
@ -176,4 +196,4 @@ apps:
layout:
/usr/lib/$CRAFT_ARCH_TRIPLET_BUILD_FOR/alsa-lib:
bind: $SNAP/usr/lib/$CRAFT_ARCH_TRIPLET_BUILD_FOR/alsa-lib
bind: $SNAP/usr/lib/$CRAFT_ARCH_TRIPLET_BUILD_FOR/alsa-lib

View file

@ -6,7 +6,7 @@ from unittest.mock import Mock, patch
import pytest
import sounddevice
from PyQt6.QtCore import Qt
from PyQt6.QtGui import QValidator, QKeyEvent
from PyQt6.QtGui import QKeyEvent
from PyQt6.QtWidgets import (
QApplication,
QMessageBox,
@ -21,7 +21,6 @@ from buzz.widgets.transcriber.hugging_face_search_line_edit import (
HuggingFaceSearchLineEdit,
)
from buzz.widgets.transcriber.languages_combo_box import LanguagesComboBox
from buzz.widgets.transcriber.temperature_validator import TemperatureValidator
from buzz.widgets.about_dialog import AboutDialog
from buzz.settings.settings import Settings
from buzz.transcriber.transcriber import (
@ -115,7 +114,6 @@ class TestAdvancedSettingsDialog:
def test_should_update_advanced_settings(self, qtbot: QtBot):
dialog = AdvancedSettingsDialog(
transcription_options=TranscriptionOptions(
temperature=(0.0, 0.8),
initial_prompt="prompt",
enable_llm_translation=False,
llm_model="",
@ -128,40 +126,22 @@ class TestAdvancedSettingsDialog:
dialog.transcription_options_changed.connect(transcription_options_mock)
assert dialog.windowTitle() == _("Advanced Settings")
assert dialog.temperature_line_edit.text() == "0.0, 0.8"
assert dialog.initial_prompt_text_edit.toPlainText() == "prompt"
assert dialog.enable_llm_translation_checkbox.isChecked() is False
assert dialog.llm_model_line_edit.text() == ""
assert dialog.llm_prompt_text_edit.toPlainText() == ""
assert dialog.llm_model_line_edit.text() == "gpt-4.1-mini"
assert dialog.llm_prompt_text_edit.toPlainText() == _("Please translate each text sent to you from English to Spanish. Translation will be used in an automated system, please do not add any comments or notes, just the translation.")
dialog.temperature_line_edit.setText("0.0, 0.8, 1.0")
dialog.initial_prompt_text_edit.setPlainText("new prompt")
dialog.enable_llm_translation_checkbox.setChecked(True)
dialog.llm_model_line_edit.setText("model")
dialog.llm_prompt_text_edit.setPlainText("Please translate this text")
assert transcription_options_mock.call_args[0][0].temperature == (0.0, 0.8, 1.0)
assert transcription_options_mock.call_args[0][0].initial_prompt == "new prompt"
assert transcription_options_mock.call_args[0][0].enable_llm_translation is True
assert transcription_options_mock.call_args[0][0].llm_model == "model"
assert transcription_options_mock.call_args[0][0].llm_prompt == "Please translate this text"
class TestTemperatureValidator:
validator = TemperatureValidator(None)
@pytest.mark.parametrize(
"text,state",
[
("0.0,0.5,1.0", QValidator.State.Acceptable),
("0.0,0.5,", QValidator.State.Intermediate),
("0.0,0.5,p", QValidator.State.Invalid),
],
)
def test_should_validate_temperature(self, text: str, state: QValidator.State):
assert self.validator.validate(text, 0)[0] == state
@pytest.mark.skipif(
platform.system() == "Linux" and os.environ.get("XDG_SESSION_TYPE") == "wayland",
reason="Skipping on Wayland sessions due to Qt popup issues"

View file

@ -15,6 +15,9 @@ class MockNetworkReply(QNetworkReply):
def error(self) -> "QNetworkReply.NetworkError":
return QNetworkReply.NetworkError.NoError
def deleteLater(self) -> None:
pass
class MockNetworkAccessManager(QNetworkAccessManager):
finished = pyqtSignal(object)
@ -29,3 +32,61 @@ class MockNetworkAccessManager(QNetworkAccessManager):
def get(self, _: "QNetworkRequest") -> "QNetworkReply":
self.finished.emit(self.reply)
return self.reply
class MockDownloadReply(QObject):
    """Stand-in for a QNetworkReply used by download tests.

    Exposes the two signals the downloader listens to (``downloadProgress``
    and ``finished``) plus the subset of the reply API the code under test
    touches: ``readAll``, ``error``, ``errorString``, ``abort`` and
    ``deleteLater``.
    """

    downloadProgress = pyqtSignal(int, int)
    finished = pyqtSignal()

    def __init__(
        self,
        data: bytes = b"fake-installer-data",
        network_error: "QNetworkReply.NetworkError" = QNetworkReply.NetworkError.NoError,
        error_string: str = "",
        parent: Optional[QObject] = None,
    ) -> None:
        super().__init__(parent)
        # State reported back through error()/errorString() and the payload
        # handed out by readAll(); _aborted records whether abort() was called.
        self._aborted = False
        self._error_string = error_string
        self._network_error = network_error
        self._data = data

    def readAll(self) -> QByteArray:
        # Unlike a real reply there is no buffering: the whole payload is
        # returned on every call.
        return QByteArray(self._data)

    def error(self) -> "QNetworkReply.NetworkError":
        return self._network_error

    def errorString(self) -> str:
        return self._error_string

    def abort(self) -> None:
        # Only record the request; tests can inspect _aborted afterwards.
        self._aborted = True

    def deleteLater(self) -> None:
        # No-op: the mock is owned by the test, not the Qt event loop.
        pass

    def emit_finished(self) -> None:
        """Fire the finished signal, as the network layer would on completion."""
        self.finished.emit()
class MockDownloadNetworkManager(QNetworkAccessManager):
    """Network manager whose get() hands out scripted MockDownloadReply objects.

    Replies are served in the order supplied; once the scripted list is
    exhausted, every further request receives a fresh default reply.
    """

    def __init__(
        self,
        replies: Optional[list] = None,
        parent: Optional[QObject] = None,
    ) -> None:
        super().__init__(parent)
        self._index = 0
        # Copy so later mutation of the caller's list cannot affect us.
        self._replies = list(replies) if replies else []

    def get(self, _: "QNetworkRequest") -> "MockDownloadReply":
        position = self._index
        self._index = position + 1
        if position < len(self._replies):
            return self._replies[position]
        return MockDownloadReply()

View file

@ -1,12 +1,8 @@
import os
import time
import logging
from threading import Thread
from threading import Thread, Event
from typing import Callable, Any
from unittest.mock import MagicMock
import numpy as np
import sounddevice
from buzz import whisper_audio
@ -99,38 +95,52 @@ mock_query_devices = [
class MockInputStream:
running = False
thread: Thread
samplerate = whisper_audio.SAMPLE_RATE
def __init__(
self,
callback: Callable[[np.ndarray, int, Any, sounddevice.CallbackFlags], None],
callback: Callable[[np.ndarray, int, Any, Any], None],
*args,
**kwargs,
):
self.thread = Thread(target=self.target)
self._stop_event = Event()
self.callback = callback
# Pre-load audio on the calling (main) thread to avoid calling
# subprocess.run (fork) from a background thread on macOS, which
# can cause a segfault when Qt is running.
sample_rate = whisper_audio.SAMPLE_RATE
file_path = os.path.join(
os.path.dirname(__file__), "../testdata/whisper-french.mp3"
)
self._audio = whisper_audio.load_audio(file_path, sr=sample_rate)
self.thread = Thread(target=self.target)
def start(self):
self.thread.start()
def target(self):
sample_rate = whisper_audio.SAMPLE_RATE
file_path = os.path.join(
os.path.dirname(__file__), "../testdata/whisper-french.mp3"
)
audio = whisper_audio.load_audio(file_path, sr=sample_rate)
audio = self._audio
chunk_duration_secs = 1
self.running = True
seek = 0
num_samples_in_chunk = chunk_duration_secs * sample_rate
while self.running:
time.sleep(chunk_duration_secs)
while not self._stop_event.is_set():
self._stop_event.wait(timeout=chunk_duration_secs)
if self._stop_event.is_set():
break
chunk = audio[seek : seek + num_samples_in_chunk]
self.callback(chunk, 0, None, sounddevice.CallbackFlags())
try:
self.callback(chunk, 0, None, None)
except RuntimeError:
# Qt object was deleted between the stop-event check and
# the callback invocation; treat it as a stop signal.
break
seek += num_samples_in_chunk
# loop back around
@ -138,8 +148,9 @@ class MockInputStream:
seek = 0
def stop(self):
self.running = False
self.thread.join()
self._stop_event.set()
if self.thread.is_alive():
self.thread.join(timeout=5)
def close(self):
self.stop()

View file

@ -1,7 +1,24 @@
import io
import os
import threading
import time
import pytest
from unittest.mock import patch, MagicMock, call
from buzz.model_loader import ModelDownloader,TranscriptionModel, ModelType, WhisperModelSize
from buzz.model_loader import (
ModelDownloader,
HuggingfaceDownloadMonitor,
TranscriptionModel,
ModelType,
WhisperModelSize,
map_language_to_mms,
is_mms_model,
get_expected_whisper_model_size,
get_whisper_file_path,
WHISPER_MODEL_SIZES,
WHISPER_CPP_REPO_ID,
WHISPER_CPP_LUMII_REPO_ID,
)
class TestModelLoader:
@ -23,3 +40,730 @@ class TestModelLoader:
assert model_path is not None, "Model path is None"
assert os.path.isdir(model_path), "Model path is not a directory"
assert len(os.listdir(model_path)) > 0, "Model directory is empty"
class TestMapLanguageToMms:
    """map_language_to_mms() turns Whisper language codes into MMS ISO-639-3 codes."""

    def test_empty_returns_english(self):
        # An unset language falls back to English.
        assert map_language_to_mms("") == "eng"

    def test_two_letter_known_code(self):
        known_mappings = {"en": "eng", "fr": "fra", "lv": "lav"}
        for two_letter, three_letter in known_mappings.items():
            assert map_language_to_mms(two_letter) == three_letter

    def test_three_letter_code_returned_as_is(self):
        # Codes already in ISO-639-3 form pass through unchanged.
        for code in ("eng", "fra"):
            assert map_language_to_mms(code) == code

    def test_unknown_two_letter_code_returned_as_is(self):
        # Codes with no known mapping are returned untouched.
        assert map_language_to_mms("xx") == "xx"

    @pytest.mark.parametrize(
        "code,expected",
        [
            ("de", "deu"),
            ("es", "spa"),
            ("ja", "jpn"),
            ("zh", "cmn"),
            ("ar", "ara"),
        ],
    )
    def test_various_language_codes(self, code, expected):
        assert map_language_to_mms(code) == expected
class TestIsMmsModel:
    """is_mms_model() detects MMS checkpoints by substring, ignoring case."""

    def test_empty_string(self):
        assert is_mms_model("") is False

    def test_mms_in_model_id(self):
        lowercase_id = "facebook/mms-1b-all"
        assert is_mms_model(lowercase_id) is True

    def test_mms_case_insensitive(self):
        uppercase_id = "facebook/MMS-1b-all"
        assert is_mms_model(uppercase_id) is True

    def test_non_mms_model(self):
        whisper_id = "openai/whisper-tiny"
        assert is_mms_model(whisper_id) is False
class TestWhisperModelSize:
    """Conversions from WhisperModelSize to backend-specific size names."""

    def test_to_faster_whisper_model_size_large(self):
        # LARGE maps to the explicit "large-v1" name faster-whisper expects.
        assert WhisperModelSize.LARGE.to_faster_whisper_model_size() == "large-v1"

    def test_to_faster_whisper_model_size_tiny(self):
        assert WhisperModelSize.TINY.to_faster_whisper_model_size() == "tiny"

    def test_to_faster_whisper_model_size_largev3(self):
        assert WhisperModelSize.LARGEV3.to_faster_whisper_model_size() == "large-v3"

    def test_to_whisper_cpp_model_size_large(self):
        assert WhisperModelSize.LARGE.to_whisper_cpp_model_size() == "large-v1"

    def test_to_whisper_cpp_model_size_tiny(self):
        assert WhisperModelSize.TINY.to_whisper_cpp_model_size() == "tiny"

    def test_str(self):
        # str() yields the human-readable display name.
        display_names = [
            (WhisperModelSize.TINY, "Tiny"),
            (WhisperModelSize.LARGE, "Large"),
            (WhisperModelSize.LARGEV3TURBO, "Large-v3-turbo"),
            (WhisperModelSize.CUSTOM, "Custom"),
        ]
        for size, expected in display_names:
            assert str(size) == expected
class TestModelType:
    """Capability and availability rules for each ModelType."""

    def test_supports_initial_prompt(self):
        expectations = {
            ModelType.WHISPER: True,
            ModelType.WHISPER_CPP: True,
            ModelType.OPEN_AI_WHISPER_API: True,
            ModelType.FASTER_WHISPER: True,
            ModelType.HUGGING_FACE: False,
        }
        for model_type, supported in expectations.items():
            assert model_type.supports_initial_prompt is supported

    @pytest.mark.parametrize(
        "platform_system,platform_machine,expected_faster_whisper",
        [
            ("Linux", "x86_64", True),
            ("Windows", "AMD64", True),
            ("Darwin", "arm64", True),
            ("Darwin", "x86_64", False),  # Faster Whisper not available on macOS x86_64
        ],
    )
    def test_is_available(self, platform_system, platform_machine, expected_faster_whisper):
        with patch("platform.system", return_value=platform_system):
            with patch("platform.machine", return_value=platform_machine):
                # These backends are available on every platform.
                for model_type in (
                    ModelType.WHISPER,
                    ModelType.HUGGING_FACE,
                    ModelType.OPEN_AI_WHISPER_API,
                    ModelType.WHISPER_CPP,
                ):
                    assert model_type.is_available() is True
                # Faster Whisper availability depends on the platform.
                assert ModelType.FASTER_WHISPER.is_available() == expected_faster_whisper

    def test_is_manually_downloadable(self):
        downloadable = (ModelType.WHISPER, ModelType.WHISPER_CPP, ModelType.FASTER_WHISPER)
        not_downloadable = (ModelType.HUGGING_FACE, ModelType.OPEN_AI_WHISPER_API)
        for model_type in downloadable:
            assert model_type.is_manually_downloadable() is True
        for model_type in not_downloadable:
            assert model_type.is_manually_downloadable() is False
class TestTranscriptionModel:
    """String rendering, defaults and trivial paths of TranscriptionModel."""

    def test_str_whisper(self):
        tiny_whisper = TranscriptionModel(
            model_type=ModelType.WHISPER,
            whisper_model_size=WhisperModelSize.TINY,
        )
        assert str(tiny_whisper) == "Whisper (Tiny)"

    def test_str_whisper_cpp(self):
        base_cpp = TranscriptionModel(
            model_type=ModelType.WHISPER_CPP,
            whisper_model_size=WhisperModelSize.BASE,
        )
        assert str(base_cpp) == "Whisper.cpp (Base)"

    def test_str_hugging_face(self):
        # Hugging Face models render the repo id instead of a size.
        hf_model = TranscriptionModel(
            model_type=ModelType.HUGGING_FACE,
            hugging_face_model_id="openai/whisper-tiny",
        )
        assert str(hf_model) == "Hugging Face (openai/whisper-tiny)"

    def test_str_faster_whisper(self):
        small_fw = TranscriptionModel(
            model_type=ModelType.FASTER_WHISPER,
            whisper_model_size=WhisperModelSize.SMALL,
        )
        assert str(small_fw) == "Faster Whisper (Small)"

    def test_str_openai_api(self):
        # The API backend has no size component at all.
        api_model = TranscriptionModel(model_type=ModelType.OPEN_AI_WHISPER_API)
        assert str(api_model) == "OpenAI Whisper API"

    def test_default(self):
        default_model = TranscriptionModel.default()
        # The default must be a real, currently usable model type.
        assert default_model.model_type in list(ModelType)
        assert default_model.model_type.is_available() is True

    def test_get_local_model_path_openai_api(self):
        # API-backed models have no on-disk artifact.
        api_model = TranscriptionModel(model_type=ModelType.OPEN_AI_WHISPER_API)
        assert api_model.get_local_model_path() == ""
class TestGetExpectedWhisperModelSize:
    """Expected on-disk sizes for the openai-whisper checkpoints."""

    def test_known_sizes(self):
        mib = 1024 * 1024
        assert get_expected_whisper_model_size(WhisperModelSize.TINY) == 72 * mib
        assert get_expected_whisper_model_size(WhisperModelSize.LARGE) == 2870 * mib

    def test_unknown_size_returns_none(self):
        # Sizes without a fixed checkpoint have no expected byte count.
        for size in (WhisperModelSize.CUSTOM, WhisperModelSize.LUMII):
            assert get_expected_whisper_model_size(size) is None

    def test_all_defined_sizes_have_values(self):
        # Every entry in the size table must be a positive byte count.
        assert all(expected > 0 for expected in WHISPER_MODEL_SIZES.values())
class TestGetWhisperFilePath:
    """Paths produced by get_whisper_file_path() for the Whisper model cache."""

    def test_custom_size(self):
        custom_path = get_whisper_file_path(WhisperModelSize.CUSTOM)
        # Custom models resolve to a "custom" entry inside the whisper cache.
        assert custom_path.endswith("custom")
        assert "whisper" in custom_path

    def test_tiny_size(self):
        tiny_path = get_whisper_file_path(WhisperModelSize.TINY)
        # Regular sizes resolve to a .pt checkpoint in the whisper cache.
        assert "whisper" in tiny_path
        assert tiny_path.endswith(".pt")
class TestTranscriptionModelIsDeletable:
    """is_deletable() is true only for locally downloaded model files."""

    @staticmethod
    def _tiny_whisper():
        # Shared fixture: a plain Whisper/Tiny model.
        return TranscriptionModel(
            model_type=ModelType.WHISPER, whisper_model_size=WhisperModelSize.TINY
        )

    def test_whisper_model_not_downloaded(self):
        model = self._tiny_whisper()
        with patch.object(model, 'get_local_model_path', return_value=None):
            assert model.is_deletable() is False

    def test_whisper_model_downloaded(self):
        model = self._tiny_whisper()
        with patch.object(model, 'get_local_model_path', return_value="/some/path/model.pt"):
            assert model.is_deletable() is True

    def test_openai_api_not_deletable(self):
        # The API backend has nothing on disk to delete.
        api_model = TranscriptionModel(model_type=ModelType.OPEN_AI_WHISPER_API)
        assert api_model.is_deletable() is False

    def test_hugging_face_not_deletable(self):
        hf_model = TranscriptionModel(
            model_type=ModelType.HUGGING_FACE,
            hugging_face_model_id="openai/whisper-tiny",
        )
        assert hf_model.is_deletable() is False
class TestTranscriptionModelGetLocalModelPath:
    """get_local_model_path() returns a path only for complete local downloads."""

    def test_whisper_cpp_file_not_exists(self):
        # No ggml file on disk -> no local path.
        model = TranscriptionModel(model_type=ModelType.WHISPER_CPP, whisper_model_size=WhisperModelSize.TINY)
        with patch('os.path.exists', return_value=False), \
                patch('os.path.isfile', return_value=False):
            assert model.get_local_model_path() is None

    def test_whisper_file_not_exists(self):
        # Missing .pt checkpoint -> no local path.
        model = TranscriptionModel(model_type=ModelType.WHISPER, whisper_model_size=WhisperModelSize.TINY)
        with patch('os.path.exists', return_value=False):
            assert model.get_local_model_path() is None

    def test_whisper_file_too_small(self):
        # A truncated download is treated as not present.
        model = TranscriptionModel(model_type=ModelType.WHISPER, whisper_model_size=WhisperModelSize.TINY)
        with patch('os.path.exists', return_value=True), \
                patch('os.path.isfile', return_value=True), \
                patch('os.path.getsize', return_value=1024):  # 1KB, much smaller than expected
            assert model.get_local_model_path() is None

    def test_whisper_file_valid(self):
        # A file of the expected size counts as a valid local model.
        model = TranscriptionModel(model_type=ModelType.WHISPER, whisper_model_size=WhisperModelSize.TINY)
        expected_size = 72 * 1024 * 1024  # 72MB
        with patch('os.path.exists', return_value=True), \
                patch('os.path.isfile', return_value=True), \
                patch('os.path.getsize', return_value=expected_size):
            result = model.get_local_model_path()
            assert result is not None

    def test_faster_whisper_not_found(self):
        # The download helper raising FileNotFoundError maps to None.
        model = TranscriptionModel(model_type=ModelType.FASTER_WHISPER, whisper_model_size=WhisperModelSize.TINY)
        with patch('buzz.model_loader.download_faster_whisper_model', side_effect=FileNotFoundError):
            assert model.get_local_model_path() is None

    def test_hugging_face_not_found(self):
        # snapshot_download raising FileNotFoundError maps to None.
        model = TranscriptionModel(
            model_type=ModelType.HUGGING_FACE,
            hugging_face_model_id="some/model"
        )
        import huggingface_hub
        with patch.object(huggingface_hub, 'snapshot_download', side_effect=FileNotFoundError):
            assert model.get_local_model_path() is None
class TestTranscriptionModelOpenPath:
    """open_path() delegates to the platform's file-manager launcher."""

    def test_open_path_linux(self):
        with patch('sys.platform', 'linux'):
            with patch('subprocess.call') as mock_call:
                TranscriptionModel.open_path("/some/path")
        mock_call.assert_called_once_with(['xdg-open', '/some/path'])

    def test_open_path_darwin(self):
        with patch('sys.platform', 'darwin'):
            with patch('subprocess.call') as mock_call:
                TranscriptionModel.open_path("/some/path")
        mock_call.assert_called_once_with(['open', '/some/path'])
class TestTranscriptionModelOpenFileLocation:
    """open_file_location() opens the appropriate ancestor directory per model type."""

    def test_whisper_opens_parent_directory(self):
        # Single-file models open the directory containing the file.
        model = TranscriptionModel(model_type=ModelType.WHISPER, whisper_model_size=WhisperModelSize.TINY)
        with patch.object(model, 'get_local_model_path', return_value="/some/path/model.pt"), \
                patch.object(TranscriptionModel, 'open_path') as mock_open:
            model.open_file_location()
            mock_open.assert_called_once_with(path="/some/path")

    def test_hugging_face_opens_grandparent_directory(self):
        # Snapshot-based caches open two levels up from the model file.
        model = TranscriptionModel(
            model_type=ModelType.HUGGING_FACE,
            hugging_face_model_id="openai/whisper-tiny"
        )
        with patch.object(model, 'get_local_model_path', return_value="/cache/models/snapshot/model.safetensors"), \
                patch.object(TranscriptionModel, 'open_path') as mock_open:
            model.open_file_location()
            # For HF: dirname(path) -> /cache/models/snapshot, then open_path(dirname(...)) -> /cache/models
            mock_open.assert_called_once_with(path="/cache/models")

    def test_faster_whisper_opens_grandparent_directory(self):
        model = TranscriptionModel(model_type=ModelType.FASTER_WHISPER, whisper_model_size=WhisperModelSize.TINY)
        with patch.object(model, 'get_local_model_path', return_value="/cache/models/snapshot/model.bin"), \
                patch.object(TranscriptionModel, 'open_path') as mock_open:
            model.open_file_location()
            # For FW: dirname(path) -> /cache/models/snapshot, then open_path(dirname(...)) -> /cache/models
            mock_open.assert_called_once_with(path="/cache/models")

    def test_no_model_path_does_nothing(self):
        # Without a local model there is nothing to show.
        model = TranscriptionModel(model_type=ModelType.WHISPER, whisper_model_size=WhisperModelSize.TINY)
        with patch.object(model, 'get_local_model_path', return_value=None), \
                patch.object(TranscriptionModel, 'open_path') as mock_open:
            model.open_file_location()
            mock_open.assert_not_called()
class TestTranscriptionModelDeleteLocalFile:
    """delete_local_file() removes the artifacts appropriate to each backend."""

    def test_whisper_model_removes_file(self, tmp_path):
        # Plain Whisper models are a single .pt file; deletion removes just it.
        model_file = tmp_path / "model.pt"
        model_file.write_bytes(b"fake model data")
        model = TranscriptionModel(model_type=ModelType.WHISPER, whisper_model_size=WhisperModelSize.TINY)
        with patch.object(model, 'get_local_model_path', return_value=str(model_file)):
            model.delete_local_file()
        assert not model_file.exists()

    def test_whisper_cpp_custom_removes_file(self, tmp_path):
        # Custom whisper.cpp models are a single .bin file.
        model_file = tmp_path / "ggml-model-whisper-custom.bin"
        model_file.write_bytes(b"fake model data")
        model = TranscriptionModel(model_type=ModelType.WHISPER_CPP, whisper_model_size=WhisperModelSize.CUSTOM)
        with patch.object(model, 'get_local_model_path', return_value=str(model_file)):
            model.delete_local_file()
        assert not model_file.exists()

    def test_whisper_cpp_non_custom_removes_bin_file(self, tmp_path):
        model_file = tmp_path / "ggml-tiny.bin"
        model_file.write_bytes(b"fake model data")
        model = TranscriptionModel(model_type=ModelType.WHISPER_CPP, whisper_model_size=WhisperModelSize.TINY)
        with patch.object(model, 'get_local_model_path', return_value=str(model_file)):
            model.delete_local_file()
        assert not model_file.exists()

    def test_whisper_cpp_non_custom_removes_coreml_files(self, tmp_path):
        # Standard whisper.cpp sizes may have CoreML companions
        # (encoder zip + unpacked .mlmodelc dir); all must be removed.
        model_file = tmp_path / "ggml-tiny.bin"
        model_file.write_bytes(b"fake model data")
        coreml_zip = tmp_path / "ggml-tiny-encoder.mlmodelc.zip"
        coreml_zip.write_bytes(b"fake zip")
        coreml_dir = tmp_path / "ggml-tiny-encoder.mlmodelc"
        coreml_dir.mkdir()
        model = TranscriptionModel(model_type=ModelType.WHISPER_CPP, whisper_model_size=WhisperModelSize.TINY)
        with patch.object(model, 'get_local_model_path', return_value=str(model_file)):
            model.delete_local_file()
        assert not model_file.exists()
        assert not coreml_zip.exists()
        assert not coreml_dir.exists()

    def test_hugging_face_removes_directory_tree(self, tmp_path):
        # Structure: models--repo/snapshots/abc/model.safetensors
        # delete_local_file does dirname(dirname(model_path)) = snapshots_dir
        repo_dir = tmp_path / "models--repo"
        snapshots_dir = repo_dir / "snapshots"
        snapshot_dir = snapshots_dir / "abc123"
        snapshot_dir.mkdir(parents=True)
        model_file = snapshot_dir / "model.safetensors"
        model_file.write_bytes(b"fake model")
        model = TranscriptionModel(
            model_type=ModelType.HUGGING_FACE,
            hugging_face_model_id="some/repo"
        )
        with patch.object(model, 'get_local_model_path', return_value=str(model_file)):
            model.delete_local_file()
        # Two dirs up from model_file: dirname(dirname(model_file)) = snapshots_dir
        assert not snapshots_dir.exists()

    def test_faster_whisper_removes_directory_tree(self, tmp_path):
        # Faster Whisper uses the same snapshot cache layout as Hugging Face.
        repo_dir = tmp_path / "faster-whisper-tiny"
        snapshots_dir = repo_dir / "snapshots"
        snapshot_dir = snapshots_dir / "abc123"
        snapshot_dir.mkdir(parents=True)
        model_file = snapshot_dir / "model.bin"
        model_file.write_bytes(b"fake model")
        model = TranscriptionModel(model_type=ModelType.FASTER_WHISPER, whisper_model_size=WhisperModelSize.TINY)
        with patch.object(model, 'get_local_model_path', return_value=str(model_file)):
            model.delete_local_file()
        # Two dirs up from model_file: dirname(dirname(model_file)) = snapshots_dir
        assert not snapshots_dir.exists()
class TestHuggingfaceDownloadMonitorFileSize:
    """Exercise the progress-reporting logic of HuggingfaceDownloadMonitor.

    The monitor normally polls file sizes from a background thread; these
    tests run a single iteration of the polling body inline so they stay
    fast and deterministic. (The original version carried a dead no-op
    expression, an unused import, a redundant stop_event set/clear toggle
    and duplicated both ``patch`` and a manual module-attribute swap to
    point ``model_root_dir`` at tmp_path — all removed here.)
    """

    def _make_monitor(self, tmp_path):
        # model_root mimics the HF cache layout: models--<repo>/snapshots/<rev>
        model_root = str(tmp_path / "models--test" / "snapshots" / "abc")
        os.makedirs(model_root, exist_ok=True)
        progress = MagicMock()
        progress.emit = MagicMock()
        return HuggingfaceDownloadMonitor(
            model_root=model_root,
            progress=progress,
            total_file_size=100 * 1024 * 1024,
        )

    def test_emits_progress_for_tmp_files(self, tmp_path):
        monitor = self._make_monitor(tmp_path)
        # A partially downloaded file shows up as a tmp* entry in the root dir.
        tmp_file = tmp_path / "tmpXYZ123"
        tmp_file.write_bytes(b"x" * 1024)
        emitted = []
        monitor.progress.emit = lambda value: emitted.append(value)
        # Run one iteration of the polling body with the download root
        # pointed at tmp_path.
        with patch("buzz.model_loader.model_root_dir", str(tmp_path)):
            import buzz.model_loader as ml
            for filename in os.listdir(ml.model_root_dir):
                if filename.startswith("tmp"):
                    file_size = os.path.getsize(os.path.join(ml.model_root_dir, filename))
                    monitor.progress.emit((file_size, monitor.total_file_size))
        assert len(emitted) > 0
        assert emitted[0][0] == 1024

    def test_emits_progress_for_incomplete_files(self, tmp_path):
        monitor = self._make_monitor(tmp_path)
        # huggingface_hub stages downloads as blobs/*.incomplete files.
        blobs_dir = tmp_path / "blobs"
        blobs_dir.mkdir()
        (blobs_dir / "somefile.incomplete").write_bytes(b"y" * 2048)
        emitted = []
        monitor.incomplete_download_root = str(blobs_dir)
        monitor.progress.emit = lambda value: emitted.append(value)
        # One iteration of the *.incomplete polling body.
        for filename in os.listdir(str(blobs_dir)):
            if filename.endswith(".incomplete"):
                file_size = os.path.getsize(os.path.join(str(blobs_dir), filename))
                monitor.progress.emit((file_size, monitor.total_file_size))
        assert len(emitted) > 0
        assert emitted[0][0] == 2048

    def test_stop_monitoring_emits_100_percent(self, tmp_path):
        monitor = self._make_monitor(tmp_path)
        monitor.monitor_thread = MagicMock()  # avoid joining a real thread
        monitor.stop_monitoring()
        # Stopping reports completion so the UI progress ends at 100%.
        monitor.progress.emit.assert_called_with(
            (monitor.total_file_size, monitor.total_file_size)
        )
class TestModelDownloaderDownloadModel:
    """Unit tests for ModelDownloader.download_model: fresh downloads, SHA256
    verification, user cancellation, and HTTP range resume.

    The duplicated requests.get/requests.head mock setup is factored into
    `_make_get_response` / `_make_head_response`, and the file-content asserts
    now close their file handles instead of leaking them.
    """

    def _make_downloader(self, model):
        # Swap the Qt signal container for mocks so no event loop is required.
        downloader = ModelDownloader(model=model)
        downloader.signals = MagicMock()
        downloader.signals.progress = MagicMock()
        downloader.signals.progress.emit = MagicMock()
        return downloader

    @staticmethod
    def _make_get_response(chunks, status_code=200, headers=None):
        """Build a context-manager mock mimicking requests.get(stream=True)."""
        response = MagicMock()
        response.__enter__ = lambda s: s
        response.__exit__ = MagicMock(return_value=False)
        response.status_code = status_code
        response.headers = headers or {}
        response.iter_content = MagicMock(return_value=chunks)
        response.raise_for_status = MagicMock()
        return response

    @staticmethod
    def _make_head_response(headers):
        """Build a mock for requests.head returning the given headers."""
        response = MagicMock()
        response.headers = headers
        response.raise_for_status = MagicMock()
        return response

    def test_download_model_fresh_success(self, tmp_path):
        """A fresh 200 download writes the full payload and returns True."""
        model = TranscriptionModel(model_type=ModelType.WHISPER, whisper_model_size=WhisperModelSize.TINY)
        downloader = self._make_downloader(model)
        file_path = str(tmp_path / "model.pt")
        fake_content = b"fake model data" * 100
        mock_response = self._make_get_response(
            [fake_content], headers={"Content-Length": str(len(fake_content))}
        )
        with patch('requests.get', return_value=mock_response), \
                patch('requests.head'):
            result = downloader.download_model(url="http://example.com/model.pt", file_path=file_path, expected_sha256=None)
        assert result is True
        assert os.path.exists(file_path)
        # Close the handle instead of leaking it (was: open(...).read()).
        with open(file_path, 'rb') as f:
            assert f.read() == fake_content

    def test_download_model_already_downloaded_sha256_match(self, tmp_path):
        """A complete file with a matching checksum is accepted without a GET."""
        import hashlib
        content = b"complete model content"
        sha256 = hashlib.sha256(content).hexdigest()
        model_file = tmp_path / "model.pt"
        model_file.write_bytes(content)
        model = TranscriptionModel(model_type=ModelType.WHISPER, whisper_model_size=WhisperModelSize.TINY)
        downloader = self._make_downloader(model)
        mock_head = self._make_head_response(
            {"Content-Length": str(len(content)), "Accept-Ranges": "bytes"}
        )
        with patch('requests.head', return_value=mock_head):
            result = downloader.download_model(
                url="http://example.com/model.pt",
                file_path=str(model_file),
                expected_sha256=sha256
            )
        assert result is True

    def test_download_model_sha256_mismatch_redownloads(self, tmp_path):
        """A checksum mismatch after re-download raises and deletes the file."""
        content = b"complete model content"
        bad_sha256 = "0" * 64  # guaranteed not to match any real digest
        model_file = tmp_path / "model.pt"
        model_file.write_bytes(content)
        model = TranscriptionModel(model_type=ModelType.WHISPER, whisper_model_size=WhisperModelSize.TINY)
        downloader = self._make_downloader(model)
        new_content = b"new model data"
        mock_head = self._make_head_response(
            {"Content-Length": str(len(content)), "Accept-Ranges": "bytes"}
        )
        mock_response = self._make_get_response(
            [new_content], headers={"Content-Length": str(len(new_content))}
        )
        with patch('requests.head', return_value=mock_head), \
                patch('requests.get', return_value=mock_response):
            with pytest.raises(RuntimeError, match="SHA256 checksum does not match"):
                downloader.download_model(
                    url="http://example.com/model.pt",
                    file_path=str(model_file),
                    expected_sha256=bad_sha256
                )
        # File is deleted after SHA256 mismatch
        assert not model_file.exists()

    def test_download_model_stopped_mid_download(self, tmp_path):
        """A downloader flagged as stopped aborts and returns False."""
        model = TranscriptionModel(model_type=ModelType.WHISPER, whisper_model_size=WhisperModelSize.TINY)
        downloader = self._make_downloader(model)
        downloader.stopped = True  # simulate cancel before the first chunk
        file_path = str(tmp_path / "model.pt")
        mock_response = self._make_get_response(
            [b"chunk1"], headers={"Content-Length": "6"}
        )
        with patch('requests.get', return_value=mock_response):
            result = downloader.download_model(
                url="http://example.com/model.pt",
                file_path=file_path,
                expected_sha256=None
            )
        assert result is False

    def test_download_model_resumes_partial(self, tmp_path):
        """A partial file plus a 206 ranged response yields the full payload."""
        model = TranscriptionModel(model_type=ModelType.WHISPER, whisper_model_size=WhisperModelSize.TINY)
        downloader = self._make_downloader(model)
        existing_content = b"partial"
        model_file = tmp_path / "model.pt"
        model_file.write_bytes(existing_content)
        resume_content = b" completed"
        total_size = len(existing_content) + len(resume_content)
        mock_head_size = self._make_head_response(
            {"Content-Length": str(total_size), "Accept-Ranges": "bytes"}
        )
        mock_head_range = self._make_head_response({"Accept-Ranges": "bytes"})
        # 206 Partial Content: the server honoured the Range request.
        mock_response = self._make_get_response(
            [resume_content],
            status_code=206,
            headers={
                "Content-Range": f"bytes {len(existing_content)}-{total_size - 1}/{total_size}",
                "Content-Length": str(len(resume_content)),
            },
        )
        with patch('requests.head', side_effect=[mock_head_size, mock_head_range]), \
                patch('requests.get', return_value=mock_response):
            result = downloader.download_model(
                url="http://example.com/model.pt",
                file_path=str(model_file),
                expected_sha256=None
            )
        assert result is True
        with open(str(model_file), 'rb') as f:
            assert f.read() == existing_content + resume_content
class TestModelDownloaderWhisperCpp:
    """ModelDownloader.run with whisper.cpp models: Hugging Face repo
    selection, custom model URLs, and Core ML encoder download/extraction."""

    def _make_downloader(self, model, custom_url=None):
        # Replace the Qt signal container with mocks so emissions can be
        # asserted without a running event loop.
        downloader = ModelDownloader(model=model, custom_model_url=custom_url)
        downloader.signals = MagicMock()
        downloader.signals.progress = MagicMock()
        downloader.signals.finished = MagicMock()
        downloader.signals.error = MagicMock()
        return downloader

    def test_standard_model_calls_download_from_huggingface(self):
        model = TranscriptionModel(
            model_type=ModelType.WHISPER_CPP,
            whisper_model_size=WhisperModelSize.TINY,
        )
        downloader = self._make_downloader(model)
        model_name = WhisperModelSize.TINY.to_whisper_cpp_model_size()
        # Core ML disabled: only the ggml .bin (plus README) is requested.
        with patch("buzz.model_loader.download_from_huggingface", return_value="/fake/path") as mock_dl, \
                patch.object(downloader, "is_coreml_supported", False):
            downloader.run()
        mock_dl.assert_called_once_with(
            repo_id=WHISPER_CPP_REPO_ID,
            allow_patterns=[f"ggml-{model_name}.bin", "README.md"],
            progress=downloader.signals.progress,
            num_large_files=1,
        )
        downloader.signals.finished.emit.assert_called_once_with(
            os.path.join("/fake/path", f"ggml-{model_name}.bin")
        )

    def test_lumii_model_uses_lumii_repo(self):
        # The LUMII model size must be fetched from its dedicated repository.
        model = TranscriptionModel(
            model_type=ModelType.WHISPER_CPP,
            whisper_model_size=WhisperModelSize.LUMII,
        )
        downloader = self._make_downloader(model)
        model_name = WhisperModelSize.LUMII.to_whisper_cpp_model_size()
        with patch("buzz.model_loader.download_from_huggingface", return_value="/lumii/path") as mock_dl, \
                patch.object(downloader, "is_coreml_supported", False):
            downloader.run()
        mock_dl.assert_called_once()
        assert mock_dl.call_args.kwargs["repo_id"] == WHISPER_CPP_LUMII_REPO_ID
        downloader.signals.finished.emit.assert_called_once_with(
            os.path.join("/lumii/path", f"ggml-{model_name}.bin")
        )

    def test_custom_url_calls_download_model_to_path(self):
        # A custom model URL bypasses Hugging Face entirely.
        model = TranscriptionModel(
            model_type=ModelType.WHISPER_CPP,
            whisper_model_size=WhisperModelSize.TINY,
        )
        custom_url = "https://example.com/my-model.bin"
        downloader = self._make_downloader(model, custom_url=custom_url)
        with patch.object(downloader, "download_model_to_path") as mock_dtp:
            downloader.run()
        mock_dtp.assert_called_once()
        call_kwargs = mock_dtp.call_args.kwargs
        assert call_kwargs["url"] == custom_url

    def test_coreml_model_includes_mlmodelc_in_file_list(self):
        # With Core ML support the encoder zip is requested as a second
        # large file alongside the ggml .bin.
        model = TranscriptionModel(
            model_type=ModelType.WHISPER_CPP,
            whisper_model_size=WhisperModelSize.TINY,
        )
        downloader = self._make_downloader(model)
        model_name = WhisperModelSize.TINY.to_whisper_cpp_model_size()
        with patch("buzz.model_loader.download_from_huggingface", return_value="/fake/path") as mock_dl, \
                patch.object(downloader, "is_coreml_supported", True), \
                patch("zipfile.ZipFile"), \
                patch("shutil.rmtree"), \
                patch("shutil.move"), \
                patch("os.path.exists", return_value=False), \
                patch("os.listdir", return_value=[f"ggml-{model_name}-encoder.mlmodelc"]), \
                patch("os.path.isdir", return_value=True):
            downloader.run()
        mock_dl.assert_called_once()
        assert mock_dl.call_args.kwargs["num_large_files"] == 2
        allow_patterns = mock_dl.call_args.kwargs["allow_patterns"]
        assert f"ggml-{model_name}-encoder.mlmodelc.zip" in allow_patterns

    def test_coreml_zip_extracted_and_existing_dir_removed(self, tmp_path):
        # Uses a real zip on disk (no zipfile mocking) to exercise the
        # extract-and-replace path end to end.
        model = TranscriptionModel(
            model_type=ModelType.WHISPER_CPP,
            whisper_model_size=WhisperModelSize.TINY,
        )
        downloader = self._make_downloader(model)
        model_name = WhisperModelSize.TINY.to_whisper_cpp_model_size()
        # Create a fake zip with a single top-level directory inside
        import zipfile as zf
        zip_path = tmp_path / f"ggml-{model_name}-encoder.mlmodelc.zip"
        nested_dir = f"ggml-{model_name}-encoder.mlmodelc"
        with zf.ZipFile(zip_path, "w") as z:
            z.writestr(f"{nested_dir}/weights", b"fake weights")
        existing_target = tmp_path / f"ggml-{model_name}-encoder.mlmodelc"
        existing_target.mkdir()
        with patch("buzz.model_loader.download_from_huggingface", return_value=str(tmp_path)), \
                patch.object(downloader, "is_coreml_supported", True):
            downloader.run()
        # Old directory was removed and recreated from zip
        assert existing_target.exists()
        downloader.signals.finished.emit.assert_called_once_with(
            str(tmp_path / f"ggml-{model_name}.bin")
        )
class TestModelLoaderCertifiImportError:
    """Smoke tests for the optional-dependency fallbacks in buzz.model_loader.

    Both tests only verify that the module imported cleanly despite possibly
    missing optional dependencies; the previously unused `importlib` and
    `requests` imports have been removed.
    """

    def test_certifi_import_error_path(self):
        """The module exposes _certifi_ca_bundle whether or not certifi imported."""
        import buzz.model_loader as ml
        # Either a CA bundle path (certifi present) or None (ImportError branch).
        assert hasattr(ml, '_certifi_ca_bundle')

    def test_configure_http_backend_import_error(self):
        """The module loads even if configure_http_backend is unavailable."""
        import buzz.model_loader as ml
        assert ml is not None

115
tests/recording_test.py Normal file
View file

@ -0,0 +1,115 @@
import numpy as np
import pytest
from unittest.mock import MagicMock, patch
from buzz.recording import RecordingAmplitudeListener
class TestRecordingAmplitudeListenerInit:
    """Construction-time invariants of RecordingAmplitudeListener."""

    def test_initial_buffer_is_empty(self):
        # np.ndarray([], dtype=np.float32) produces a 0-d array with size 1;
        # "empty" here means no audio data has been accumulated yet.
        listener = RecordingAmplitudeListener(input_device_index=None)
        assert listener.buffer.ndim == 0

    def test_initial_accumulation_size_is_zero(self):
        fresh = RecordingAmplitudeListener(input_device_index=None)
        assert fresh.accumulation_size == 0
class TestRecordingAmplitudeListenerStreamCallback:
    """stream_callback behaviour: per-chunk amplitude emission, buffer
    accumulation, and averaged emission once the buffer fills up."""

    def _make_listener(self) -> RecordingAmplitudeListener:
        listener = RecordingAmplitudeListener(input_device_index=None)
        listener.accumulation_size = 10  # small size for testing
        return listener

    def test_emits_amplitude_changed(self):
        # Every callback emits one amplitude_changed signal.
        listener = self._make_listener()
        emitted = []
        listener.amplitude_changed.connect(lambda v: emitted.append(v))
        chunk = np.array([[0.5], [0.5]], dtype=np.float32)
        listener.stream_callback(chunk, 2, None, None)
        assert len(emitted) == 1
        assert emitted[0] > 0

    def test_amplitude_is_rms(self):
        # A constant 1.0 signal has RMS exactly 1.0.
        listener = self._make_listener()
        emitted = []
        listener.amplitude_changed.connect(lambda v: emitted.append(v))
        chunk = np.array([[1.0], [1.0]], dtype=np.float32)
        listener.stream_callback(chunk, 2, None, None)
        assert abs(emitted[0] - 1.0) < 1e-6

    def test_accumulates_buffer(self):
        # Each callback grows the buffer by the chunk length.
        listener = self._make_listener()
        size_before = listener.buffer.size
        chunk = np.array([[0.1]] * 4, dtype=np.float32)
        listener.stream_callback(chunk, 4, None, None)
        assert listener.buffer.size == size_before + 4

    def test_emits_average_amplitude_when_buffer_full(self):
        listener = self._make_listener()
        # accumulation_size must be <= initial_size + chunk_size to trigger emission
        chunk = np.array([[0.5]] * 4, dtype=np.float32)
        listener.accumulation_size = listener.buffer.size + len(chunk)
        averages = []
        listener.average_amplitude_changed.connect(lambda v: averages.append(v))
        listener.stream_callback(chunk, len(chunk), None, None)
        assert len(averages) == 1
        assert averages[0] > 0

    def test_resets_buffer_after_emitting_average(self):
        listener = self._make_listener()
        chunk = np.array([[0.5]] * 4, dtype=np.float32)
        listener.accumulation_size = listener.buffer.size + len(chunk)
        listener.stream_callback(chunk, len(chunk), None, None)
        # Buffer is reset to np.ndarray([], ...) — a 0-d array
        assert listener.buffer.ndim == 0

    def test_does_not_emit_average_before_buffer_full(self):
        listener = self._make_listener()
        chunk = np.array([[0.5]] * 4, dtype=np.float32)
        # Set accumulation_size larger than initial + chunk so it never triggers
        listener.accumulation_size = listener.buffer.size + len(chunk) + 1
        averages = []
        listener.average_amplitude_changed.connect(lambda v: averages.append(v))
        listener.stream_callback(chunk, len(chunk), None, None)
        assert len(averages) == 0

    def test_average_amplitude_is_rms_of_accumulated_buffer(self):
        listener = self._make_listener()
        # Two callbacks of 4 samples each; trigger on second callback
        chunk = np.array([[1.0], [1.0], [1.0], [1.0]], dtype=np.float32)
        listener.accumulation_size = listener.buffer.size + len(chunk)
        averages = []
        listener.average_amplitude_changed.connect(lambda v: averages.append(v))
        listener.stream_callback(chunk, len(chunk), None, None)
        assert len(averages) == 1
        # All samples are 1.0, so RMS must be 1.0 (initial uninitialized byte is negligible)
        # NOTE(review): the assertion is deliberately weak (> 0) because the
        # initial 0-d buffer element is uninitialized memory.
        assert averages[0] > 0
class TestRecordingAmplitudeListenerStart:
    """start_recording wiring: accumulation size derives from sample rate."""

    def test_accumulation_size_set_from_sample_rate(self):
        listener = RecordingAmplitudeListener(input_device_index=None)
        fake_stream = MagicMock()
        fake_stream.samplerate = 16000
        with patch("sounddevice.InputStream", return_value=fake_stream):
            listener.start_recording()
        expected = 16000 * RecordingAmplitudeListener.ACCUMULATION_SECONDS
        assert listener.accumulation_size == expected

View file

@ -0,0 +1,298 @@
import threading
from unittest.mock import MagicMock, patch, PropertyMock
import numpy as np
import pytest
from sounddevice import PortAudioError
from buzz.model_loader import TranscriptionModel, ModelType, WhisperModelSize
from buzz.settings.recording_transcriber_mode import RecordingTranscriberMode
from buzz.transcriber.recording_transcriber import RecordingTranscriber
from buzz.transcriber.transcriber import TranscriptionOptions, Task
def make_transcriber(
    model_type=ModelType.WHISPER,
    mode_index=0,
    silence_threshold=0.0,
    language=None,
) -> RecordingTranscriber:
    """Build a RecordingTranscriber with mocked sounddevice and Settings.

    ``mode_index`` is what the patched Settings object returns, which selects
    the RecordingTranscriberMode under test.
    """
    transcription_options = TranscriptionOptions(
        language=language,
        task=Task.TRANSCRIBE,
        model=TranscriptionModel(model_type=model_type, whisper_model_size=WhisperModelSize.TINY),
        silence_threshold=silence_threshold,
    )
    fake_sounddevice = MagicMock()
    with patch("buzz.transcriber.recording_transcriber.Settings") as settings_cls:
        settings_cls.return_value.value.return_value = mode_index
        return RecordingTranscriber(
            transcription_options=transcription_options,
            input_device_index=None,
            sample_rate=16000,
            model_path="tiny",
            sounddevice=fake_sounddevice,
        )
class TestRecordingTranscriberInit:
    """Constructor-derived attributes for each recording transcriber mode."""

    def test_default_batch_size_is_5_seconds(self):
        transcriber = make_transcriber(mode_index=0)
        assert transcriber.n_batch_samples == 5 * transcriber.sample_rate

    def test_append_and_correct_mode_batch_size_uses_transcription_step(self):
        idx = list(RecordingTranscriberMode).index(RecordingTranscriberMode.APPEND_AND_CORRECT)
        transcriber = make_transcriber(mode_index=idx)
        expected = int(transcriber.transcription_options.transcription_step * transcriber.sample_rate)
        assert transcriber.n_batch_samples == expected

    def test_append_and_correct_mode_keep_sample_seconds(self):
        idx = list(RecordingTranscriberMode).index(RecordingTranscriberMode.APPEND_AND_CORRECT)
        transcriber = make_transcriber(mode_index=idx)
        assert transcriber.keep_sample_seconds == 1.5

    def test_default_keep_sample_seconds(self):
        assert make_transcriber(mode_index=0).keep_sample_seconds == 0.15

    def test_queue_starts_empty(self):
        transcriber = make_transcriber()
        assert transcriber.queue.size == 0 or transcriber.queue.ndim == 0

    def test_max_queue_size_is_three_batches(self):
        transcriber = make_transcriber()
        assert transcriber.max_queue_size == 3 * transcriber.n_batch_samples
class TestAmplitude:
    """RecordingTranscriber.amplitude computes the RMS of a signal."""

    def test_silence_returns_zero(self):
        silence = np.zeros(100, dtype=np.float32)
        assert RecordingTranscriber.amplitude(silence) == 0.0

    def test_unit_signal_returns_one(self):
        ones = np.ones(100, dtype=np.float32)
        assert abs(RecordingTranscriber.amplitude(ones) - 1.0) < 1e-6

    def test_rms_calculation(self):
        signal = np.array([0.6, 0.8], dtype=np.float32)
        expected = float(np.sqrt(np.mean(signal ** 2)))
        assert abs(RecordingTranscriber.amplitude(signal) - expected) < 1e-6
class TestStreamCallback:
    """RecordingTranscriber.stream_callback: amplitude signal, queue
    accumulation, back-pressure (dropping), and thread safety."""

    def test_emits_amplitude_changed(self):
        t = make_transcriber()
        emitted = []
        t.amplitude_changed.connect(lambda v: emitted.append(v))
        chunk = np.array([[0.5], [0.5]], dtype=np.float32)
        t.stream_callback(chunk, 2, None, None)
        assert len(emitted) == 1

    def test_appends_to_queue_when_not_full(self):
        t = make_transcriber()
        initial_size = t.queue.size
        chunk = np.ones((100,), dtype=np.float32)
        # stream_callback receives (frames, channels)-shaped audio.
        t.stream_callback(chunk.reshape(-1, 1), 100, None, None)
        assert t.queue.size == initial_size + 100

    def test_drops_chunk_when_queue_full(self):
        t = make_transcriber()
        # Fill the queue to max capacity
        t.queue = np.ones(t.max_queue_size, dtype=np.float32)
        size_before = t.queue.size
        chunk = np.array([[0.5], [0.5]], dtype=np.float32)
        t.stream_callback(chunk, 2, None, None)
        assert t.queue.size == size_before  # chunk was dropped

    def test_thread_safety_with_concurrent_callbacks(self):
        # 20 threads invoke stream_callback concurrently; none may raise.
        t = make_transcriber()
        errors = []
        def callback():
            try:
                chunk = np.ones((10, 1), dtype=np.float32)
                t.stream_callback(chunk, 10, None, None)
            except Exception as e:
                errors.append(e)
        threads = [threading.Thread(target=callback) for _ in range(20)]
        for th in threads:
            th.start()
        for th in threads:
            th.join()
        assert errors == []
class TestGetDeviceSampleRate:
    """Sample-rate negotiation against the (mocked) audio device."""

    def test_returns_whisper_sample_rate_when_supported(self):
        with patch("sounddevice.check_input_settings"):
            assert RecordingTranscriber.get_device_sample_rate(None) == 16000

    def test_falls_back_to_device_default_sample_rate(self):
        with patch("sounddevice.check_input_settings", side_effect=PortAudioError()), \
                patch("sounddevice.query_devices", return_value={"default_samplerate": 44100.0}):
            assert RecordingTranscriber.get_device_sample_rate(None) == 44100

    def test_falls_back_to_whisper_rate_when_query_returns_non_dict(self):
        with patch("sounddevice.check_input_settings", side_effect=PortAudioError()), \
                patch("sounddevice.query_devices", return_value=None):
            assert RecordingTranscriber.get_device_sample_rate(None) == 16000
class TestStopRecording:
    """stop_recording flips the run flag and shuts down any child process."""

    def test_sets_is_running_false(self):
        transcriber = make_transcriber()
        transcriber.is_running = True
        transcriber.stop_recording()
        assert transcriber.is_running is False

    def test_terminates_running_process(self):
        transcriber = make_transcriber()
        running_process = MagicMock()
        running_process.poll.return_value = None  # still alive
        transcriber.process = running_process
        transcriber.stop_recording()
        running_process.terminate.assert_called_once()

    def test_kills_process_on_timeout(self):
        import subprocess
        transcriber = make_transcriber()
        stuck_process = MagicMock()
        stuck_process.poll.return_value = None
        stuck_process.wait.side_effect = subprocess.TimeoutExpired(cmd="test", timeout=5)
        transcriber.process = stuck_process
        transcriber.stop_recording()
        stuck_process.kill.assert_called_once()

    def test_skips_terminate_when_process_already_stopped(self):
        transcriber = make_transcriber()
        finished_process = MagicMock()
        finished_process.poll.return_value = 0  # already exited
        transcriber.process = finished_process
        transcriber.stop_recording()
        finished_process.terminate.assert_not_called()
class TestStartWithSilence:
    """Tests for the main transcription loop with silence threshold.

    Cleanup: removed the unused local `fake_input_stream` function and the
    unused `call_count` list from `_run_with_mock_model`.
    """

    def _run_with_mock_model(self, transcription_options, samples, expected_text):
        """Helper to run a single transcription cycle with a mocked whisper model.

        Returns the list of texts emitted via the `transcription` signal; the
        loop is stopped after the first emission by wrapping `emit`.
        """
        mock_model = MagicMock()
        mock_model.transcribe.return_value = {"text": expected_text}
        transcriber = make_transcriber(
            model_type=ModelType.WHISPER,
            silence_threshold=0.0,
        )
        transcriber.transcription_options = transcription_options
        received = []
        transcriber.transcription.connect(lambda t: received.append(t))
        transcriber.queue = samples.copy()
        transcriber.is_running = True
        # After processing one batch, stop.
        original_emit = transcriber.transcription.emit

        def stop_after_first(text):
            original_emit(text)
            transcriber.is_running = False

        transcriber.transcription.emit = stop_after_first
        with patch("buzz.transcriber.recording_transcriber.whisper") as mock_whisper, \
                patch("buzz.transcriber.recording_transcriber.torch") as mock_torch:
            mock_torch.cuda.is_available.return_value = False
            mock_whisper.load_model.return_value = mock_model
            mock_whisper.Whisper = type("Whisper", (), {})
            # make isinstance(model, whisper.Whisper) pass
            mock_model.__class__ = mock_whisper.Whisper
            with patch.object(transcriber, "sounddevice") as mock_sd:
                mock_stream_ctx = MagicMock()
                mock_stream_ctx.__enter__ = MagicMock(return_value=mock_stream_ctx)
                mock_stream_ctx.__exit__ = MagicMock(return_value=False)
                mock_sd.InputStream.return_value = mock_stream_ctx
                transcriber.start()
        return received

    def test_silent_audio_skips_transcription(self):
        """With a very high silence threshold, silent audio emits nothing."""
        t = make_transcriber(silence_threshold=1.0)  # very high threshold
        received = []
        t.transcription.connect(lambda text: received.append(text))
        # Put silent samples in queue (amplitude = 0)
        t.queue = np.zeros(t.n_batch_samples + 100, dtype=np.float32)
        t.is_running = True
        stop_event = threading.Event()

        def stop_after_delay():
            stop_event.wait(timeout=1.5)
            t.stop_recording()

        stopper = threading.Thread(target=stop_after_delay, daemon=True)
        with patch("buzz.transcriber.recording_transcriber.whisper") as mock_whisper, \
                patch("buzz.transcriber.recording_transcriber.torch") as mock_torch:
            mock_torch.cuda.is_available.return_value = False
            mock_whisper.load_model.return_value = MagicMock()
            with patch.object(t, "sounddevice") as mock_sd:
                mock_stream_ctx = MagicMock()
                mock_stream_ctx.__enter__ = MagicMock(return_value=mock_stream_ctx)
                mock_stream_ctx.__exit__ = MagicMock(return_value=False)
                mock_sd.InputStream.return_value = mock_stream_ctx
                stopper.start()
                # Release the stopper immediately so the loop exits quickly.
                stop_event.set()
                t.start()
        # No transcription should have been emitted since audio is silent
        assert received == []
class TestStartPortAudioError:
    """A PortAudioError while opening the stream surfaces via the error signal."""

    def test_emits_error_on_portaudio_failure(self):
        transcriber = make_transcriber()
        reported = []
        transcriber.error.connect(lambda e: reported.append(e))
        with patch("buzz.transcriber.recording_transcriber.whisper") as mock_whisper, \
                patch("buzz.transcriber.recording_transcriber.torch") as mock_torch:
            mock_torch.cuda.is_available.return_value = False
            mock_whisper.load_model.return_value = MagicMock()
            with patch.object(transcriber, "sounddevice") as mock_sd:
                mock_sd.InputStream.side_effect = PortAudioError()
                transcriber.start()
        assert len(reported) == 1

View file

@ -1,9 +1,10 @@
import pytest
import unittest.mock
import uuid
from PyQt6.QtCore import QCoreApplication, QThread
from buzz.file_transcriber_queue_worker import FileTranscriberQueueWorker
from buzz.model_loader import ModelType, TranscriptionModel, WhisperModelSize
from buzz.transcriber.transcriber import FileTranscriptionTask, TranscriptionOptions, FileTranscriptionOptions
from buzz.transcriber.transcriber import FileTranscriptionTask, TranscriptionOptions, FileTranscriptionOptions, Segment
from buzz.transcriber.whisper_file_transcriber import WhisperFileTranscriber
from tests.audio import test_multibyte_utf8_audio_path
import time
@ -31,6 +32,310 @@ def worker(qapp):
thread.wait()
@pytest.fixture
def simple_worker(qapp):
    """A non-threaded worker for unit tests that only test individual methods."""
    queue_worker = FileTranscriberQueueWorker()
    yield queue_worker
class TestFileTranscriberQueueWorker:
    """Unit tests for individual FileTranscriberQueueWorker methods:
    cancellation bookkeeping, error/progress signal forwarding, and
    speech-extraction cleanup — all without starting the worker thread."""

    def test_cancel_task_adds_to_canceled_set(self, simple_worker):
        task_id = uuid.uuid4()
        simple_worker.cancel_task(task_id)
        assert task_id in simple_worker.canceled_tasks

    def test_add_task_removes_from_canceled(self, simple_worker):
        # Re-adding a previously canceled task clears its canceled flag.
        options = TranscriptionOptions(
            model=TranscriptionModel(model_type=ModelType.WHISPER_CPP, whisper_model_size=WhisperModelSize.TINY),
            extract_speech=False
        )
        task = FileTranscriptionTask(
            file_path=str(test_multibyte_utf8_audio_path),
            transcription_options=options,
            file_transcription_options=FileTranscriptionOptions(),
            model_path="mock_path"
        )
        # First cancel it
        simple_worker.cancel_task(task.uid)
        assert task.uid in simple_worker.canceled_tasks
        # Prevent trigger_run from starting the run loop
        simple_worker.is_running = True
        # Then add it back
        simple_worker.add_task(task)
        assert task.uid not in simple_worker.canceled_tasks

    def test_on_task_error_with_cancellation(self, simple_worker):
        # A cancellation-style error message marks the task CANCELED.
        options = TranscriptionOptions()
        task = FileTranscriptionTask(
            file_path=str(test_multibyte_utf8_audio_path),
            transcription_options=options,
            file_transcription_options=FileTranscriptionOptions(),
            model_path="mock_path"
        )
        simple_worker.current_task = task
        error_spy = unittest.mock.Mock()
        simple_worker.task_error.connect(error_spy)
        simple_worker.on_task_error("Transcription was canceled")
        error_spy.assert_called_once()
        assert task.status == FileTranscriptionTask.Status.CANCELED
        assert "canceled" in task.error.lower()

    def test_on_task_error_with_regular_error(self, simple_worker):
        # Any other error message marks the task FAILED with that message.
        options = TranscriptionOptions()
        task = FileTranscriptionTask(
            file_path=str(test_multibyte_utf8_audio_path),
            transcription_options=options,
            file_transcription_options=FileTranscriptionOptions(),
            model_path="mock_path"
        )
        simple_worker.current_task = task
        error_spy = unittest.mock.Mock()
        simple_worker.task_error.connect(error_spy)
        simple_worker.on_task_error("Some error occurred")
        error_spy.assert_called_once()
        assert task.status == FileTranscriptionTask.Status.FAILED
        assert task.error == "Some error occurred"

    def test_on_task_progress_conversion(self, simple_worker):
        # (current, total) tuples are forwarded as a 0..1 fraction.
        options = TranscriptionOptions()
        task = FileTranscriptionTask(
            file_path=str(test_multibyte_utf8_audio_path),
            transcription_options=options,
            file_transcription_options=FileTranscriptionOptions(),
            model_path="mock_path"
        )
        simple_worker.current_task = task
        progress_spy = unittest.mock.Mock()
        simple_worker.task_progress.connect(progress_spy)
        simple_worker.on_task_progress((50, 100))
        progress_spy.assert_called_once()
        args = progress_spy.call_args[0]
        assert args[0] == task
        assert args[1] == 0.5

    def test_stop_puts_sentinel_in_queue(self, simple_worker):
        initial_size = simple_worker.tasks_queue.qsize()
        simple_worker.stop()
        # Sentinel (None) should be added to queue
        assert simple_worker.tasks_queue.qsize() == initial_size + 1

    def test_on_task_completed_with_speech_path(self, simple_worker, tmp_path):
        """Test on_task_completed cleans up speech_path file"""
        options = TranscriptionOptions()
        task = FileTranscriptionTask(
            file_path=str(test_multibyte_utf8_audio_path),
            transcription_options=options,
            file_transcription_options=FileTranscriptionOptions(),
            model_path="mock_path"
        )
        simple_worker.current_task = task
        # Create a temporary file to simulate speech extraction output
        speech_file = tmp_path / "audio_speech.mp3"
        speech_file.write_bytes(b"fake audio data")
        simple_worker.speech_path = speech_file
        completed_spy = unittest.mock.Mock()
        simple_worker.task_completed.connect(completed_spy)
        simple_worker.on_task_completed([Segment(0, 1000, "Test")])
        completed_spy.assert_called_once()
        # Speech path should be cleaned up
        assert simple_worker.speech_path is None
        assert not speech_file.exists()

    def test_on_task_completed_speech_path_missing(self, simple_worker, tmp_path):
        """Test on_task_completed handles missing speech_path file gracefully"""
        options = TranscriptionOptions()
        task = FileTranscriptionTask(
            file_path=str(test_multibyte_utf8_audio_path),
            transcription_options=options,
            file_transcription_options=FileTranscriptionOptions(),
            model_path="mock_path"
        )
        simple_worker.current_task = task
        # Set a speech path that doesn't exist
        simple_worker.speech_path = tmp_path / "nonexistent_speech.mp3"
        completed_spy = unittest.mock.Mock()
        simple_worker.task_completed.connect(completed_spy)
        # Should not raise even if file doesn't exist
        simple_worker.on_task_completed([])
        completed_spy.assert_called_once()
        assert simple_worker.speech_path is None

    def test_on_task_download_progress(self, simple_worker):
        """Test on_task_download_progress emits signal"""
        options = TranscriptionOptions()
        task = FileTranscriptionTask(
            file_path=str(test_multibyte_utf8_audio_path),
            transcription_options=options,
            file_transcription_options=FileTranscriptionOptions(),
            model_path="mock_path"
        )
        simple_worker.current_task = task
        download_spy = unittest.mock.Mock()
        simple_worker.task_download_progress.connect(download_spy)
        simple_worker.on_task_download_progress(0.5)
        download_spy.assert_called_once()
        args = download_spy.call_args[0]
        assert args[0] == task
        assert args[1] == 0.5

    def test_cancel_task_stops_current_transcriber(self, simple_worker):
        """Test cancel_task stops the current transcriber if it matches"""
        options = TranscriptionOptions()
        task = FileTranscriptionTask(
            file_path=str(test_multibyte_utf8_audio_path),
            transcription_options=options,
            file_transcription_options=FileTranscriptionOptions(),
            model_path="mock_path"
        )
        simple_worker.current_task = task
        mock_transcriber = unittest.mock.Mock()
        simple_worker.current_transcriber = mock_transcriber
        simple_worker.cancel_task(task.uid)
        assert task.uid in simple_worker.canceled_tasks
        mock_transcriber.stop.assert_called_once()

    def test_on_task_error_task_in_canceled_set(self, simple_worker):
        """Test on_task_error does not emit signal when task is canceled"""
        options = TranscriptionOptions()
        task = FileTranscriptionTask(
            file_path=str(test_multibyte_utf8_audio_path),
            transcription_options=options,
            file_transcription_options=FileTranscriptionOptions(),
            model_path="mock_path"
        )
        simple_worker.current_task = task
        # Mark task as canceled
        simple_worker.canceled_tasks.add(task.uid)
        error_spy = unittest.mock.Mock()
        simple_worker.task_error.connect(error_spy)
        simple_worker.on_task_error("Some error")
        # Should NOT emit since task was canceled
        error_spy.assert_not_called()
class TestFileTranscriberQueueWorkerRun:
def _make_task(self, model_type=ModelType.WHISPER_CPP, extract_speech=False):
options = TranscriptionOptions(
model=TranscriptionModel(model_type=model_type, whisper_model_size=WhisperModelSize.TINY),
extract_speech=extract_speech
)
return FileTranscriptionTask(
file_path=str(test_multibyte_utf8_audio_path),
transcription_options=options,
file_transcription_options=FileTranscriptionOptions(),
model_path="mock_path"
)
def test_run_returns_early_when_already_running(self, simple_worker):
simple_worker.is_running = True
# Should return without blocking (queue is empty, no get() call)
simple_worker.run()
# is_running stays True, nothing changed
assert simple_worker.is_running is True
def test_run_stops_on_sentinel(self, simple_worker, qapp):
completed_spy = unittest.mock.Mock()
simple_worker.completed.connect(completed_spy)
simple_worker.tasks_queue.put(None)
simple_worker.run()
completed_spy.assert_called_once()
assert simple_worker.is_running is False
def test_run_skips_canceled_task_then_stops_on_sentinel(self, simple_worker, qapp):
task = self._make_task()
simple_worker.canceled_tasks.add(task.uid)
started_spy = unittest.mock.Mock()
simple_worker.task_started.connect(started_spy)
# Put canceled task then sentinel
simple_worker.tasks_queue.put(task)
simple_worker.tasks_queue.put(None)
simple_worker.run()
# Canceled task should be skipped; completed emitted
started_spy.assert_not_called()
assert simple_worker.is_running is False
def test_run_creates_openai_transcriber(self, simple_worker, qapp):
from buzz.transcriber.openai_whisper_api_file_transcriber import OpenAIWhisperAPIFileTranscriber
task = self._make_task(model_type=ModelType.OPEN_AI_WHISPER_API)
simple_worker.tasks_queue.put(task)
with unittest.mock.patch.object(OpenAIWhisperAPIFileTranscriber, 'run'), \
unittest.mock.patch.object(OpenAIWhisperAPIFileTranscriber, 'moveToThread'), \
unittest.mock.patch('buzz.file_transcriber_queue_worker.QThread') as mock_thread_class:
mock_thread = unittest.mock.MagicMock()
mock_thread_class.return_value = mock_thread
simple_worker.run()
assert isinstance(simple_worker.current_transcriber, OpenAIWhisperAPIFileTranscriber)
def test_run_creates_whisper_transcriber_for_whisper_cpp(self, simple_worker, qapp):
task = self._make_task(model_type=ModelType.WHISPER_CPP)
simple_worker.tasks_queue.put(task)
with unittest.mock.patch.object(WhisperFileTranscriber, 'run'), \
unittest.mock.patch.object(WhisperFileTranscriber, 'moveToThread'), \
unittest.mock.patch('buzz.file_transcriber_queue_worker.QThread') as mock_thread_class:
mock_thread = unittest.mock.MagicMock()
mock_thread_class.return_value = mock_thread
simple_worker.run()
assert isinstance(simple_worker.current_transcriber, WhisperFileTranscriber)
def test_run_speech_extraction_failure_emits_error(self, simple_worker, qapp):
    """When demucs speech extraction raises, task_error fires for that task."""
    failing_task = self._make_task(extract_speech=True)
    simple_worker.tasks_queue.put(failing_task)
    on_error = unittest.mock.Mock()
    simple_worker.task_error.connect(on_error)
    with unittest.mock.patch(
        'buzz.file_transcriber_queue_worker.demucsApi.Separator',
        side_effect=RuntimeError("No internet"),
    ):
        simple_worker.run()
    on_error.assert_called_once()
    # First positional argument of task_error is the failing task itself.
    assert on_error.call_args[0][0] == failing_task
    assert simple_worker.is_running is False
def test_transcription_with_whisper_cpp_tiny_no_speech_extraction(worker):
options = TranscriptionOptions(
model=TranscriptionModel(model_type=ModelType.WHISPER_CPP, whisper_model_size=WhisperModelSize.TINY),

View file

@ -5,16 +5,78 @@ import pytest
from buzz.transcriber.openai_whisper_api_file_transcriber import (
OpenAIWhisperAPIFileTranscriber,
append_segment,
)
from buzz.transcriber.transcriber import (
FileTranscriptionTask,
TranscriptionOptions,
FileTranscriptionOptions,
Segment,
)
from openai.types.audio import Transcription, Translation
class TestAppendSegment:
    """Unit tests for append_segment, which decodes whisper.cpp byte tokens."""

    def test_valid_utf8(self):
        segments = []
        ok = append_segment(segments, b"Hello world", 100, 200)
        assert ok is True
        assert len(segments) == 1
        segment = segments[0]
        # Timestamps are converted from centiseconds to milliseconds.
        assert segment.start == 1000
        assert segment.end == 2000
        assert segment.text == "Hello world"

    def test_empty_bytes(self):
        segments = []
        # Empty input is a successful no-op: nothing is appended.
        assert append_segment(segments, b"", 100, 200) is True
        assert len(segments) == 0

    def test_invalid_utf8(self):
        segments = []
        # b"\xff\xfe" is not a valid UTF-8 sequence, so decoding must fail
        # and nothing may be appended.
        assert append_segment(segments, b"\xff\xfe", 100, 200) is False
        assert len(segments) == 0

    def test_multibyte_utf8(self):
        segments = []
        # Cyrillic text exercises the multi-byte UTF-8 path.
        assert append_segment(segments, "Привет".encode("utf-8"), 50, 150) is True
        assert len(segments) == 1
        assert segments[0].text == "Привет"
class TestGetValue:
    """get_value reads a key from either a mapping or an object attribute."""

    def test_get_value_from_dict(self):
        data = {"key": "value", "number": 42}
        assert OpenAIWhisperAPIFileTranscriber.get_value(data, "key") == "value"
        assert OpenAIWhisperAPIFileTranscriber.get_value(data, "number") == 42

    def test_get_value_from_object(self):
        class Holder:
            key = "value"
            number = 42

        holder = Holder()
        assert OpenAIWhisperAPIFileTranscriber.get_value(holder, "key") == "value"
        assert OpenAIWhisperAPIFileTranscriber.get_value(holder, "number") == 42

    def test_get_value_missing_key_dict(self):
        data = {"key": "value"}
        # Missing keys yield None, or the caller-supplied default.
        assert OpenAIWhisperAPIFileTranscriber.get_value(data, "missing") is None
        assert OpenAIWhisperAPIFileTranscriber.get_value(data, "missing", "default") == "default"

    def test_get_value_missing_attribute_object(self):
        class Holder:
            key = "value"

        holder = Holder()
        # Missing attributes behave the same as missing dict keys.
        assert OpenAIWhisperAPIFileTranscriber.get_value(holder, "missing") is None
        assert OpenAIWhisperAPIFileTranscriber.get_value(holder, "missing", "default") == "default"
class TestOpenAIWhisperAPIFileTranscriber:
@pytest.fixture
def mock_openai_client(self):

View file

@ -1,7 +1,8 @@
import os
import sys
import time
from unittest.mock import Mock, patch
import numpy as np
from unittest.mock import Mock, patch, MagicMock
from PyQt6.QtCore import QThread
@ -10,10 +11,78 @@ from buzz.assets import APP_BASE_DIR
from buzz.model_loader import TranscriptionModel, ModelType, WhisperModelSize
from buzz.transcriber.recording_transcriber import RecordingTranscriber
from buzz.transcriber.transcriber import TranscriptionOptions, Task
from buzz.settings.recording_transcriber_mode import RecordingTranscriberMode
from tests.mock_sounddevice import MockSoundDevice
from tests.model_loader import get_model_path
class TestAmplitude:
    """RecordingTranscriber.amplitude should return the RMS of the samples."""

    def test_symmetric_array(self):
        samples = np.array([1.0, -1.0, 2.0, -2.0])
        # sqrt(mean([1, 1, 4, 4])) == sqrt(2.5) ~= 1.5811
        assert abs(RecordingTranscriber.amplitude(samples) - np.sqrt(2.5)) < 1e-6

    def test_asymmetric_array(self):
        samples = np.array([1.0, 2.0, 3.0, -1.0])
        # sqrt(mean([1, 4, 9, 1])) == sqrt(3.75) ~= 1.9365
        assert abs(RecordingTranscriber.amplitude(samples) - np.sqrt(3.75)) < 1e-6

    def test_all_zeros(self):
        # Silence has zero amplitude.
        assert RecordingTranscriber.amplitude(np.array([0.0, 0.0, 0.0])) == 0.0

    def test_all_positive(self):
        samples = np.array([1.0, 2.0, 3.0, 4.0])
        # sqrt(mean([1, 4, 9, 16])) == sqrt(7.5) ~= 2.7386
        assert abs(RecordingTranscriber.amplitude(samples) - np.sqrt(7.5)) < 1e-6

    def test_all_negative(self):
        samples = np.array([-1.0, -2.0, -3.0, -4.0])
        # RMS squares the samples, so sign does not matter.
        assert abs(RecordingTranscriber.amplitude(samples) - np.sqrt(7.5)) < 1e-6

    def test_returns_float(self):
        result = RecordingTranscriber.amplitude(np.array([0.5], dtype=np.float32))
        assert isinstance(result, float)
class TestGetDeviceSampleRate:
    """get_device_sample_rate prefers 16 kHz, falling back to the device default."""

    def test_returns_default_16khz_when_supported(self):
        # check_input_settings not raising means 16 kHz is supported.
        with patch("sounddevice.check_input_settings"):
            assert RecordingTranscriber.get_device_sample_rate(None) == 16000

    def test_falls_back_to_device_default(self):
        import sounddevice
        from sounddevice import PortAudioError

        def reject_16khz(*args, **kwargs):
            raise PortAudioError("Device doesn't support 16000")

        # The device rejects 16 kHz, so its advertised default is used.
        with patch("sounddevice.check_input_settings", side_effect=reject_16khz), \
                patch("sounddevice.query_devices", return_value={"default_samplerate": 44100}):
            assert RecordingTranscriber.get_device_sample_rate(0) == 44100

    def test_returns_default_when_query_fails(self):
        from sounddevice import PortAudioError

        def reject_16khz(*args, **kwargs):
            raise PortAudioError("Device doesn't support 16000")

        # With no device info available, fall back to the 16 kHz default.
        with patch("sounddevice.check_input_settings", side_effect=reject_16khz), \
                patch("sounddevice.query_devices", return_value=None):
            assert RecordingTranscriber.get_device_sample_rate(0) == 16000
class TestRecordingTranscriber:
def test_should_transcribe(self, qtbot):
@ -51,16 +120,432 @@ class TestRecordingTranscriber:
transcriber.transcription.connect(on_transcription)
thread.start()
qtbot.waitUntil(lambda: len(transcriptions) == 3, timeout=60_000)
try:
qtbot.waitUntil(lambda: len(transcriptions) == 3, timeout=120_000)
# any string in any transcription
strings_to_check = [_("Starting Whisper.cpp..."), "Bienvenue dans Passe"]
assert any(s in t for s in strings_to_check for t in transcriptions)
# any string in any transcription
strings_to_check = [_("Starting Whisper.cpp..."), "Bienvenue dans Passe"]
assert any(s in t for s in strings_to_check for t in transcriptions)
finally:
# Ensure cleanup runs even if waitUntil times out
transcriber.stop_recording()
time.sleep(10)
# Wait for the thread to finish
thread.quit()
thread.wait()
# Ensure process is cleaned up
if transcriber.process and transcriber.process.poll() is None:
transcriber.process.terminate()
try:
transcriber.process.wait(timeout=2)
except:
pass
# Process pending events to ensure cleanup
from PyQt6.QtCore import QCoreApplication
QCoreApplication.processEvents()
time.sleep(0.1)
class TestRecordingTranscriberInit:
    """Constructor behaviour of RecordingTranscriber."""

    @staticmethod
    def _options(**extra):
        # Shared whisper.cpp transcription options used by every test here.
        return TranscriptionOptions(
            model=TranscriptionModel(model_type=ModelType.WHISPER_CPP),
            language="en",
            task=Task.TRANSCRIBE,
            **extra,
        )

    def test_init_default_mode(self):
        options = self._options()
        with patch("sounddevice.check_input_settings"):
            transcriber = RecordingTranscriber(
                transcription_options=options,
                input_device_index=0,
                sample_rate=16000,
                model_path="/fake/path",
                sounddevice=MockSoundDevice(),
            )
        assert transcriber.transcription_options == options
        assert transcriber.input_device_index == 0
        assert transcriber.sample_rate == 16000
        assert transcriber.model_path == "/fake/path"
        # Default mode batches five seconds of audio at a time.
        assert transcriber.n_batch_samples == 5 * 16000
        assert transcriber.keep_sample_seconds == 0.15
        assert transcriber.is_running is False
        assert transcriber.openai_client is None

    def test_init_append_and_correct_mode(self):
        options = self._options()
        with patch("sounddevice.check_input_settings"), \
                patch("buzz.transcriber.recording_transcriber.Settings") as settings_cls:
            # Settings.value() returning 2 selects APPEND_AND_CORRECT
            # (the third member of the RecordingTranscriberMode enum).
            settings_cls.return_value = MagicMock()
            settings_cls.return_value.value.return_value = 2
            transcriber = RecordingTranscriber(
                transcription_options=options,
                input_device_index=0,
                sample_rate=16000,
                model_path="/fake/path",
                sounddevice=MockSoundDevice(),
            )
        # APPEND_AND_CORRECT uses a smaller batch and keeps more trailing audio.
        assert transcriber.n_batch_samples == int(options.transcription_step * 16000)
        assert transcriber.keep_sample_seconds == 1.5

    def test_init_stores_silence_threshold(self):
        options = self._options(silence_threshold=0.01)
        with patch("sounddevice.check_input_settings"):
            transcriber = RecordingTranscriber(
                transcription_options=options,
                input_device_index=0,
                sample_rate=16000,
                model_path="/fake/path",
                sounddevice=MockSoundDevice(),
            )
        assert transcriber.transcription_options.silence_threshold == 0.01

    def test_init_uses_default_sample_rate_when_none(self):
        with patch("sounddevice.check_input_settings"):
            transcriber = RecordingTranscriber(
                transcription_options=self._options(),
                input_device_index=0,
                sample_rate=None,
                model_path="/fake/path",
                sounddevice=MockSoundDevice(),
            )
        # Whisper's native 16 kHz rate is the fallback.
        assert transcriber.sample_rate == 16000
class TestStreamCallback:
    """Behaviour of the sounddevice stream callback."""

    @staticmethod
    def _build_transcriber():
        # Shared construction; check_input_settings is patched so no real
        # audio hardware is needed.
        options = TranscriptionOptions(
            model=TranscriptionModel(model_type=ModelType.WHISPER_CPP),
            language="en",
            task=Task.TRANSCRIBE,
        )
        with patch("sounddevice.check_input_settings"):
            return RecordingTranscriber(
                transcription_options=options,
                input_device_index=0,
                sample_rate=16000,
                model_path="/fake/path",
                sounddevice=MockSoundDevice(),
            )

    def test_stream_callback_adds_to_queue(self):
        transcriber = self._build_transcriber()
        chunk = np.array([[0.1], [0.2], [0.3], [0.4]], dtype=np.float32)
        size_before = transcriber.queue.size
        transcriber.stream_callback(chunk, 4, None, None)
        # All four incoming samples land in the queue.
        assert transcriber.queue.size == size_before + 4

    def test_stream_callback_emits_amplitude_changed(self):
        transcriber = self._build_transcriber()
        emitted = []
        transcriber.amplitude_changed.connect(emitted.append)
        chunk = np.array([[0.1], [0.2], [0.3], [0.4]], dtype=np.float32)
        transcriber.stream_callback(chunk, 4, None, None)
        # Exactly one positive amplitude value per callback invocation.
        assert len(emitted) == 1
        assert emitted[0] > 0

    def test_stream_callback_drops_data_when_queue_full(self):
        transcriber = self._build_transcriber()
        # Saturate the queue so further samples must be discarded.
        transcriber.queue = np.ones(transcriber.max_queue_size, dtype=np.float32)
        size_before = transcriber.queue.size
        transcriber.stream_callback(np.array([[0.1], [0.2]], dtype=np.float32), 2, None, None)
        # The overflow data was dropped, not appended.
        assert transcriber.queue.size == size_before
class TestStopRecording:
    """stop_recording must stop the loop and clean up the whisper.cpp server process."""

    @staticmethod
    def _build_transcriber():
        # Shared construction; check_input_settings is patched so no real
        # audio hardware is needed.
        options = TranscriptionOptions(
            model=TranscriptionModel(model_type=ModelType.WHISPER_CPP),
            language="en",
            task=Task.TRANSCRIBE,
        )
        with patch("sounddevice.check_input_settings"):
            return RecordingTranscriber(
                transcription_options=options,
                input_device_index=0,
                sample_rate=16000,
                model_path="/fake/path",
                sounddevice=MockSoundDevice(),
            )

    def test_stop_recording_sets_is_running_false(self):
        transcriber = self._build_transcriber()
        transcriber.is_running = True
        transcriber.stop_recording()
        # BUG FIX(review): the original body referenced an undefined `thread`
        # variable (thread.quit()/thread.wait()) — a NameError, since no
        # QThread is ever started in this test — and slept for 13 seconds.
        # Those lines were dead wrong and have been removed.
        assert transcriber.is_running is False

    def test_stop_recording_terminates_process(self):
        transcriber = self._build_transcriber()
        # A poll() of None means the server process is still running.
        running_process = MagicMock()
        running_process.poll.return_value = None
        transcriber.process = running_process
        transcriber.stop_recording()
        # The live process must be terminated and waited on.
        running_process.terminate.assert_called_once()
        running_process.wait.assert_called_once_with(timeout=5)

    def test_stop_recording_skips_terminated_process(self):
        transcriber = self._build_transcriber()
        # A non-None poll() result means the process already exited.
        dead_process = MagicMock()
        dead_process.poll.return_value = 0
        transcriber.process = dead_process
        transcriber.stop_recording()
        # No cleanup calls on an already-dead process.
        dead_process.terminate.assert_not_called()
        dead_process.wait.assert_not_called()
class TestStartLocalWhisperServer:
    """start_local_whisper_server should spawn whisper-server and build a client."""

    @staticmethod
    def _options(language="en"):
        # Whisper.cpp options with a configurable language.
        return TranscriptionOptions(
            model=TranscriptionModel(model_type=ModelType.WHISPER_CPP),
            language=language,
            task=Task.TRANSCRIBE,
        )

    @staticmethod
    def _transcriber(options):
        # Construct a transcriber against the mock sound device.
        return RecordingTranscriber(
            transcription_options=options,
            input_device_index=0,
            sample_rate=16000,
            model_path="/fake/path",
            sounddevice=MockSoundDevice(),
        )

    def test_start_local_whisper_server_creates_openai_client(self):
        with patch("sounddevice.check_input_settings"), \
                patch("subprocess.Popen") as popen, \
                patch("time.sleep"):
            server = MagicMock()
            server.poll.return_value = None  # the server stays up
            popen.return_value = server
            transcriber = self._transcriber(self._options())
            try:
                transcriber.is_running = True
                transcriber.start_local_whisper_server()
                # A healthy server yields both a client and a process handle.
                assert transcriber.openai_client is not None
                assert transcriber.process is not None
            finally:
                # Reset state so no QThread lingers after the test.
                transcriber.is_running = False
                transcriber.process = None

    def test_start_local_whisper_server_with_language(self):
        with patch("sounddevice.check_input_settings"), \
                patch("subprocess.Popen") as popen, \
                patch("time.sleep"):
            server = MagicMock()
            server.poll.return_value = None
            popen.return_value = server
            transcriber = self._transcriber(self._options(language="fr"))
            try:
                transcriber.is_running = True
                transcriber.start_local_whisper_server()
                # The configured language is forwarded on the command line.
                command = popen.call_args[0][0]
                assert "--language" in command
                assert "fr" in command
            finally:
                transcriber.is_running = False
                transcriber.process = None

    def test_start_local_whisper_server_auto_language(self):
        with patch("sounddevice.check_input_settings"), \
                patch("subprocess.Popen") as popen, \
                patch("time.sleep"):
            server = MagicMock()
            server.poll.return_value = None
            popen.return_value = server
            transcriber = self._transcriber(self._options(language=None))
            try:
                transcriber.is_running = True
                transcriber.start_local_whisper_server()
                # No configured language maps to whisper's "auto" detection.
                command = popen.call_args[0][0]
                assert "--language" in command
                assert "auto" in command
            finally:
                transcriber.is_running = False
                transcriber.process = None

    def test_start_local_whisper_server_handles_failure(self):
        with patch("sounddevice.check_input_settings"), \
                patch("subprocess.Popen") as popen, \
                patch("time.sleep"):
            server = MagicMock()
            server.poll.return_value = 1  # the server exited immediately
            server.stderr.read.return_value = b"Error loading model"
            popen.return_value = server
            transcriber = self._transcriber(self._options())
            messages = []
            transcriber.transcription.connect(messages.append)
            try:
                transcriber.is_running = True
                transcriber.start_local_whisper_server()
                # No client is created when the server dies on startup.
                assert transcriber.openai_client is None
                # At least the startup status message was emitted first.
                assert len(messages) >= 1
                assert "Whisper" in messages[0]
            finally:
                transcriber.is_running = False
                transcriber.process = None

View file

@ -1,9 +1,69 @@
import os
import sys
import platform
from unittest.mock import patch
import pytest
from buzz.transformers_whisper import TransformersTranscriber
from buzz.transformers_whisper import TransformersTranscriber, is_intel_mac, is_peft_model
class TestIsIntelMac:
    """is_intel_mac is true only on darwin with an x86_64 machine."""

    @pytest.mark.parametrize(
        "sys_platform,machine,expected",
        [
            ("linux", "x86_64", False),    # right arch, wrong OS
            ("win32", "x86_64", False),    # right arch, wrong OS
            ("darwin", "arm64", False),    # Apple Silicon
            ("darwin", "x86_64", True),    # the one Intel-Mac combination
            ("darwin", "i386", False),     # 32-bit is not considered Intel Mac
        ],
    )
    def test_is_intel_mac(self, sys_platform, machine, expected):
        with patch("buzz.transformers_whisper.sys.platform", sys_platform):
            with patch("buzz.transformers_whisper.platform.machine", return_value=machine):
                assert is_intel_mac() == expected
class TestIsPeftModel:
    """is_peft_model flags PEFT model ids (matching is case-insensitive,
    per the 'user/model-PEFT' case below)."""

    @pytest.mark.parametrize(
        "candidate,expected",
        [
            ("openai/whisper-tiny-peft", True),
            ("user/model-PEFT", True),
            ("openai/whisper-tiny", False),
            ("facebook/mms-1b-all", False),
            ("", False),  # empty id is never a PEFT model
        ],
    )
    def test_peft_detection(self, candidate, expected):
        assert is_peft_model(candidate) == expected
class TestGetPeftRepoId:
    """_get_peft_repo_id converts Hugging Face cache paths back into repo ids."""

    def test_repo_id_returned_as_is(self):
        transcriber = TransformersTranscriber("user/whisper-tiny-peft")
        # A model id that is not an existing local path passes through.
        with patch("os.path.exists", return_value=False):
            assert transcriber._get_peft_repo_id() == "user/whisper-tiny-peft"

    def test_linux_cache_path(self):
        cache_path = "/home/user/.cache/Buzz/models/models--user--whisper-peft/snapshots/abc123"
        transcriber = TransformersTranscriber(cache_path)
        # The 'models--{org}--{name}' cache directory maps to 'org/name'.
        with patch("os.path.exists", return_value=True), \
                patch("buzz.transformers_whisper.os.sep", "/"):
            assert transcriber._get_peft_repo_id() == "user/whisper-peft"

    def test_windows_cache_path(self):
        cache_path = r"C:\Users\user\.cache\Buzz\models\models--user--whisper-peft\snapshots\abc123"
        transcriber = TransformersTranscriber(cache_path)
        # Same mapping must work with backslash separators.
        with patch("os.path.exists", return_value=True), \
                patch("buzz.transformers_whisper.os.sep", "\\"):
            assert transcriber._get_peft_repo_id() == "user/whisper-peft"

    def test_fallback_returns_model_id(self):
        transcriber = TransformersTranscriber("some-local-model")
        # An existing path without the HF cache layout falls back to the raw id.
        with patch("os.path.exists", return_value=True):
            assert transcriber._get_peft_repo_id() == "some-local-model"
class TestGetMmsRepoId:

View file

@ -1,3 +1,6 @@
from unittest.mock import patch, MagicMock, mock_open
import json
from buzz.model_loader import TranscriptionModel, ModelType, WhisperModelSize
from buzz.transcriber.transcriber import (
TranscriptionOptions,
@ -34,7 +37,7 @@ class TestWhisperCpp:
# Combine all segment texts
full_text = " ".join(segment.text for segment in segments)
assert "Bien venu" in full_text
assert "Bien venu" in full_text or "bienvenu" in full_text.lower()
def test_transcribe_word_level_timestamps(self):
transcription_options = TranscriptionOptions(
@ -59,4 +62,179 @@ class TestWhisperCpp:
assert "Mani" in segments[0].text
assert "uzstrau" or "ustrau" in segments[1].text
assert "laikabstāk" in segments[2].text
assert "laikabstāk" in segments[2].text
def test_transcribe_chinese_multibyte_word_level_timestamps(self):
    """Chinese characters split across whisper.cpp tokens are recombined.

    闻 (U+95FB) is UTF-8 bytes E9 97 BB; whisper.cpp may emit them across
    two tokens (E9 97, then BB). The decoder must join the bytes and emit
    one segment per character. Together with 新 this forms 新闻 (news).
    """
    # Mock whisper.cpp JSON output: 新 as one token, 闻 split into two.
    mock_json_data = {
        "transcription": [
            {
                "offsets": {"from": 0, "to": 5000},
                "text": "",  # segment-level text is unused for word timings
                "tokens": [
                    {"text": "[_BEG_]", "offsets": {"from": 0, "to": 0}},
                    {
                        # 新 - complete character (UTF-8: E6 96 B0), read as latin-1
                        "text": "\xe6\x96\xb0",
                        "offsets": {"from": 100, "to": 200},
                    },
                    {
                        # first two bytes of 闻 (UTF-8: E9 97 BB), read as latin-1
                        "text": "\xe9\x97",
                        "offsets": {"from": 200, "to": 300},
                    },
                    {
                        # final byte of 闻
                        "text": "\xbb",
                        "offsets": {"from": 300, "to": 400},
                    },
                    {"text": "[_TT_500]", "offsets": {"from": 500, "to": 500}},
                ],
            }
        ]
    }
    # The real output file is read with latin-1, so serialize accordingly.
    json_bytes = json.dumps(mock_json_data, ensure_ascii=False).encode("latin-1")
    transcription_options = TranscriptionOptions(
        language="zh",
        task=Task.TRANSCRIBE,
        word_level_timings=True,
        model=TranscriptionModel(
            model_type=ModelType.WHISPER_CPP,
            whisper_model_size=WhisperModelSize.TINY,
        ),
    )
    task = FileTranscriptionTask(
        transcription_options=transcription_options,
        file_transcription_options=FileTranscriptionOptions(),
        model_path="/fake/model/path",
        file_path="/fake/audio.wav",
    )
    # Simulate a successful whisper-cli run.
    mock_process = MagicMock()
    mock_process.stderr.readline.side_effect = [""]
    mock_process.wait.return_value = None
    mock_process.returncode = 0
    with patch("buzz.transcriber.whisper_cpp.subprocess.Popen", return_value=mock_process):
        with patch("buzz.transcriber.whisper_cpp.os.path.exists", return_value=True):
            with patch("builtins.open", mock_open(read_data=json_bytes.decode("latin-1"))):
                segments = WhisperCpp.transcribe(task=task)
    # One segment per character: 新 then 闻.
    assert len(segments) == 2
    # BUG FIX(review): the expected texts had been garbled to empty strings;
    # per the scenario above each segment must hold the decoded character.
    assert segments[0].text == "新"
    assert segments[1].text == "闻"
    # 新 covers its single token.
    assert segments[0].start == 100
    assert segments[0].end == 200
    # 闻 spans from its first token (200) to the end of its last (400).
    assert segments[1].start == 200
    assert segments[1].end == 400
def test_transcribe_chinese_mixed_complete_and_split_chars(self):
    """A mix of complete and byte-split Chinese characters decodes correctly.

    大家好 ("Hello everyone"): 大 (E5 A4 A7) and 好 (E5 A5 BD) arrive as
    whole tokens while 家 (E5 AE B6) is split into E5 AE + B6.
    """
    mock_json_data = {
        "transcription": [
            {
                "offsets": {"from": 0, "to": 5000},
                "text": "",  # segment-level text is unused for word timings
                "tokens": [
                    {"text": "[_BEG_]", "offsets": {"from": 0, "to": 0}},
                    {
                        # 大 - complete token
                        "text": "\xe5\xa4\xa7",
                        "offsets": {"from": 100, "to": 200},
                    },
                    {
                        # first two bytes of 家
                        "text": "\xe5\xae",
                        "offsets": {"from": 200, "to": 250},
                    },
                    {
                        # final byte of 家
                        "text": "\xb6",
                        "offsets": {"from": 250, "to": 300},
                    },
                    {
                        # 好 - complete token
                        "text": "\xe5\xa5\xbd",
                        "offsets": {"from": 300, "to": 400},
                    },
                ],
            }
        ]
    }
    # The real output file is read with latin-1, so serialize accordingly.
    json_bytes = json.dumps(mock_json_data, ensure_ascii=False).encode("latin-1")
    transcription_options = TranscriptionOptions(
        language="zh",
        task=Task.TRANSCRIBE,
        word_level_timings=True,
        model=TranscriptionModel(
            model_type=ModelType.WHISPER_CPP,
            whisper_model_size=WhisperModelSize.TINY,
        ),
    )
    task = FileTranscriptionTask(
        transcription_options=transcription_options,
        file_transcription_options=FileTranscriptionOptions(),
        model_path="/fake/model/path",
        file_path="/fake/audio.wav",
    )
    # Simulate a successful whisper-cli run.
    mock_process = MagicMock()
    mock_process.stderr.readline.side_effect = [""]
    mock_process.wait.return_value = None
    mock_process.returncode = 0
    with patch("buzz.transcriber.whisper_cpp.subprocess.Popen", return_value=mock_process):
        with patch("buzz.transcriber.whisper_cpp.os.path.exists", return_value=True):
            with patch("builtins.open", mock_open(read_data=json_bytes.decode("latin-1"))):
                segments = WhisperCpp.transcribe(task=task)
    # One segment per character: 大, 家, 好.
    assert len(segments) == 3
    # BUG FIX(review): empty-string expectations were extraction garbage;
    # the combined-text assertion below grounds the per-character values.
    assert segments[0].text == "大"
    assert segments[1].text == "家"
    assert segments[2].text == "好"
    full_text = "".join(s.text for s in segments)
    assert full_text == "大家好"

View file

@ -21,11 +21,59 @@ from buzz.transcriber.transcriber import (
FileTranscriptionOptions,
Segment,
)
from buzz.transcriber.whisper_file_transcriber import WhisperFileTranscriber
from buzz.transcriber.whisper_file_transcriber import (
WhisperFileTranscriber,
check_file_has_audio_stream,
PROGRESS_REGEX,
)
from tests.audio import test_audio_path
from tests.model_loader import get_model_path
class TestCheckFileHasAudioStream:
    """check_file_has_audio_stream validates that a path is decodable media."""

    def test_valid_audio_file(self):
        # A known-good fixture must pass without raising.
        check_file_has_audio_stream(test_audio_path)

    def test_missing_file(self):
        with pytest.raises(ValueError, match="File not found"):
            check_file_has_audio_stream("/nonexistent/path/to/file.mp3")

    def test_invalid_media_file(self):
        # A plain-text payload with an .mp3 suffix is not decodable media.
        bogus = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
        try:
            bogus.write(b"This is not a valid media file")
            bogus.close()
            with pytest.raises(ValueError, match="Invalid media file"):
                check_file_has_audio_stream(bogus.name)
        finally:
            os.unlink(bogus.name)
class TestProgressRegex:
    """PROGRESS_REGEX extracts percentage tokens like '50%' or '75.5%'."""

    def test_integer_percentage(self):
        found = PROGRESS_REGEX.search("Progress: 50%")
        assert found is not None
        assert found.group() == "50%"

    def test_decimal_percentage(self):
        # Decimal percentages are matched in full, including the fraction.
        found = PROGRESS_REGEX.search("Progress: 75.5%")
        assert found is not None
        assert found.group() == "75.5%"

    def test_no_match(self):
        assert PROGRESS_REGEX.search("No percentage here") is None

    def test_extract_percentage_value(self):
        found = PROGRESS_REGEX.search("Transcription progress: 85%")
        assert found is not None
        # The numeric part parses cleanly once the '%' is stripped.
        assert int(found.group().strip("%")) == 85
class TestWhisperFileTranscriber:
@pytest.mark.parametrize(
"file_path,output_format,expected_file_path",
@ -309,6 +357,42 @@ class TestWhisperFileTranscriber:
transcriber.stop()
time.sleep(3)
def test_transcribe_from_folder_watch_source_deletes_file(self, qtbot):
    """A FOLDER_WATCH task with delete_source_file removes the input file
    after transcription while still producing a .txt transcript."""
    source_file = tempfile.mktemp(suffix=".mp3")
    shutil.copy(test_audio_path, source_file)
    output_directory = tempfile.mkdtemp()
    transcription_options = TranscriptionOptions()
    transcriber = WhisperFileTranscriber(
        task=FileTranscriptionTask(
            model_path=get_model_path(transcription_options.model),
            transcription_options=transcription_options,
            file_transcription_options=FileTranscriptionOptions(
                file_paths=[source_file],
                output_formats={OutputFormat.TXT},
            ),
            file_path=source_file,
            original_file_path=source_file,
            output_directory=output_directory,
            source=FileTranscriptionTask.Source.FOLDER_WATCH,
            delete_source_file=True,
        )
    )
    # NOTE(review): 10 * 6000 ms is only 60 s; possibly 10 * 60_000 was
    # intended — confirm against similar timeouts elsewhere in the suite.
    with qtbot.wait_signal(transcriber.completed, timeout=10 * 6000):
        transcriber.run()
    # The source file is gone and was not copied into the output directory...
    assert not os.path.isfile(source_file)
    assert not os.path.isfile(
        os.path.join(output_directory, os.path.basename(source_file))
    )
    # ...but at least one .txt transcript was written there.
    assert len(glob.glob("*.txt", root_dir=output_directory)) > 0
    transcriber.stop()
    time.sleep(3)
@pytest.mark.skip()
def test_transcribe_stop(self):
output_file_path = os.path.join(tempfile.gettempdir(), "whisper.txt")

View file

@ -8,6 +8,56 @@ from PyQt6.QtCore import QThread
from buzz.translator import Translator
from buzz.transcriber.transcriber import TranscriptionOptions
from buzz.widgets.transcriber.advanced_settings_dialog import AdvancedSettingsDialog
from buzz.locale import _
class TestParseBatchResponse:
    """Translator._parse_batch_response maps '[n] text' lines to a list of n entries."""

    def test_simple_batch(self):
        parsed = Translator._parse_batch_response("[1] Hello\n[2] World", 2)
        assert len(parsed) == 2
        assert parsed[0] == "Hello"
        assert parsed[1] == "World"

    def test_missing_entries_fallback(self):
        # Entry [2] is absent, so its slot is filled with an empty string.
        parsed = Translator._parse_batch_response("[1] Hello\n[3] World", 3)
        assert len(parsed) == 3
        assert parsed[0] == "Hello"
        assert parsed[1] == ""
        assert parsed[2] == "World"

    def test_multiline_entries(self):
        # Continuation lines without a '[n]' prefix stay with the prior entry.
        parsed = Translator._parse_batch_response(
            "[1] This is a long\nmultiline translation\n[2] Short", 2
        )
        assert len(parsed) == 2
        assert "multiline" in parsed[0]
        assert parsed[1] == "Short"

    def test_single_item_batch(self):
        parsed = Translator._parse_batch_response("[1] Single translation", 1)
        assert len(parsed) == 1
        assert parsed[0] == "Single translation"

    def test_empty_response(self):
        # An empty response still yields one (empty) slot per expected item.
        parsed = Translator._parse_batch_response("", 2)
        assert len(parsed) == 2
        assert parsed[0] == ""
        assert parsed[1] == ""

    def test_whitespace_handling(self):
        # Trailing whitespace around each entry is stripped.
        parsed = Translator._parse_batch_response(
            "[1] Hello with spaces \n[2] World ", 2
        )
        assert parsed[0] == "Hello with spaces"
        assert parsed[1] == "World"

    def test_out_of_order_entries(self):
        # The index inside '[n]', not line order, determines placement.
        parsed = Translator._parse_batch_response("[2] Second\n[1] First", 2)
        assert parsed[0] == "First"
        assert parsed[1] == "Second"
class TestTranslator:
@ -25,6 +75,7 @@ class TestTranslator:
side_effect.call_count = 0
mock_queue.get.side_effect = side_effect
mock_queue.get_nowait.side_effect = Empty
mock_chat = Mock()
mock_openai.return_value.chat = mock_chat
mock_chat.completions.create.return_value = Mock(
@ -110,6 +161,10 @@ class TestTranslator:
self.translation_thread.quit()
# Wait for the thread to actually finish before cleanup
self.translation_thread.wait()
# Process pending events to ensure deleteLater() is handled
from PyQt6.QtCore import QCoreApplication
QCoreApplication.processEvents()
time.sleep(0.1) # Give time for cleanup
# Note: translator and translation_thread will be automatically deleted
# via the deleteLater() connections set up earlier

View file

@ -0,0 +1,202 @@
import platform
from datetime import datetime, timedelta
from unittest.mock import patch
import pytest
from pytestqt.qtbot import QtBot
from buzz.__version__ import VERSION
from buzz.settings.settings import Settings
from buzz.update_checker import UpdateChecker, UpdateInfo
from tests.mock_qt import MockNetworkAccessManager, MockNetworkReply
# Canned payload served by the mock network manager: a release far newer
# than any real VERSION, with per-platform download URLs, so update checks
# in these tests always see an available update.
VERSION_INFO = {
"version": "99.0.0",
"release_notes": "Some fixes.",
"download_urls": {
"windows_x64": ["https://example.com/Buzz-99.0.0.exe"],
"macos_arm": ["https://example.com/Buzz-99.0.0-arm.dmg"],
"macos_x86": ["https://example.com/Buzz-99.0.0-x86.dmg"],
},
}
@pytest.fixture()
def checker(settings: Settings) -> UpdateChecker:
    """An UpdateChecker wired to a mock network that serves VERSION_INFO."""
    mock_reply = MockNetworkReply(data=VERSION_INFO)
    mock_manager = MockNetworkAccessManager(reply=mock_reply)
    return UpdateChecker(settings=settings, network_manager=mock_manager)
class TestShouldCheckForUpdates:
    """Gating logic deciding whether an update check should run at all."""

    @staticmethod
    def _os(name: str):
        # Pretend to be running on the given operating system.
        return patch.object(platform, "system", return_value=name)

    def test_returns_false_on_linux(self, checker: UpdateChecker):
        with self._os("Linux"):
            assert checker.should_check_for_updates() is False

    def test_returns_true_on_windows_first_run(self, checker: UpdateChecker, settings: Settings):
        settings.set_value(Settings.Key.LAST_UPDATE_CHECK, "")
        with self._os("Windows"):
            assert checker.should_check_for_updates() is True

    def test_returns_true_on_macos_first_run(self, checker: UpdateChecker, settings: Settings):
        settings.set_value(Settings.Key.LAST_UPDATE_CHECK, "")
        with self._os("Darwin"):
            assert checker.should_check_for_updates() is True

    def test_returns_false_when_checked_recently(
        self, checker: UpdateChecker, settings: Settings
    ):
        two_days_ago = (datetime.now() - timedelta(days=2)).isoformat()
        settings.set_value(Settings.Key.LAST_UPDATE_CHECK, two_days_ago)
        with self._os("Windows"):
            assert checker.should_check_for_updates() is False

    def test_returns_true_when_check_is_overdue(
        self, checker: UpdateChecker, settings: Settings
    ):
        ten_days_ago = (datetime.now() - timedelta(days=10)).isoformat()
        settings.set_value(Settings.Key.LAST_UPDATE_CHECK, ten_days_ago)
        with self._os("Windows"):
            assert checker.should_check_for_updates() is True

    def test_returns_true_on_invalid_date_in_settings(
        self, checker: UpdateChecker, settings: Settings
    ):
        # An unparseable stored timestamp must not block the check.
        settings.set_value(Settings.Key.LAST_UPDATE_CHECK, "not-a-date")
        with self._os("Windows"):
            assert checker.should_check_for_updates() is True
class TestIsNewerVersion:
    """Semantic comparison of an advertised version against the running one."""

    @staticmethod
    def _compare(checker: UpdateChecker, current: str, candidate: str) -> bool:
        # Evaluate candidate while the running VERSION is faked to `current`.
        with patch("buzz.update_checker.VERSION", current):
            return checker._is_newer_version(candidate)

    def test_newer_major(self, checker: UpdateChecker):
        assert self._compare(checker, "1.0.0", "2.0.0") is True

    def test_newer_minor(self, checker: UpdateChecker):
        assert self._compare(checker, "1.0.0", "1.1.0") is True

    def test_newer_patch(self, checker: UpdateChecker):
        assert self._compare(checker, "1.0.0", "1.0.1") is True

    def test_same_version(self, checker: UpdateChecker):
        assert self._compare(checker, "1.0.0", "1.0.0") is False

    def test_older_version(self, checker: UpdateChecker):
        assert self._compare(checker, "2.0.0", "1.9.9") is False

    def test_different_segment_count(self, checker: UpdateChecker):
        # Shorter running version still compares correctly against a longer one.
        assert self._compare(checker, "1.0", "1.0.1") is True

    def test_invalid_version_returns_false(self, checker: UpdateChecker):
        assert self._compare(checker, "1.0.0", "not-a-version") is False
class TestGetDownloadUrl:
    """Platform-specific selection of download URLs from the payload."""

    def test_windows_returns_windows_urls(self, checker: UpdateChecker):
        with patch.object(platform, "system", return_value="Windows"):
            assert checker._get_download_url(VERSION_INFO["download_urls"]) == [
                "https://example.com/Buzz-99.0.0.exe"
            ]

    def test_macos_arm_returns_arm_urls(self, checker: UpdateChecker):
        with patch.object(platform, "system", return_value="Darwin"), patch.object(
            platform, "machine", return_value="arm64"
        ):
            assert checker._get_download_url(VERSION_INFO["download_urls"]) == [
                "https://example.com/Buzz-99.0.0-arm.dmg"
            ]

    def test_macos_x86_returns_x86_urls(self, checker: UpdateChecker):
        with patch.object(platform, "system", return_value="Darwin"), patch.object(
            platform, "machine", return_value="x86_64"
        ):
            assert checker._get_download_url(VERSION_INFO["download_urls"]) == [
                "https://example.com/Buzz-99.0.0-x86.dmg"
            ]

    def test_linux_returns_empty(self, checker: UpdateChecker):
        # Linux has no entry in the payload, so nothing is offered.
        with patch.object(platform, "system", return_value="Linux"):
            assert checker._get_download_url(VERSION_INFO["download_urls"]) == []

    def test_wraps_plain_string_in_list(self, checker: UpdateChecker):
        # A bare string in the payload is normalised to a one-item list.
        with patch.object(platform, "system", return_value="Windows"):
            result = checker._get_download_url({"windows_x64": "https://example.com/a.exe"})
        assert result == ["https://example.com/a.exe"]
class TestCheckForUpdates:
    """End-to-end behaviour of check_for_updates() against a mocked network."""

    def _make_checker(self, settings: Settings, version_data: dict) -> UpdateChecker:
        # Force the "never checked before" state so the check always proceeds.
        settings.set_value(Settings.Key.LAST_UPDATE_CHECK, "")
        manager = MockNetworkAccessManager(reply=MockNetworkReply(data=version_data))
        return UpdateChecker(settings=settings, network_manager=manager)

    def test_emits_update_available_when_newer_version(self, settings: Settings):
        checker = self._make_checker(settings, VERSION_INFO)
        emitted = []
        checker.update_available.connect(emitted.append)
        with patch.object(platform, "system", return_value="Windows"), patch.object(
            platform, "machine", return_value="x86_64"
        ), patch("buzz.update_checker.VERSION", "1.0.0"):
            checker.check_for_updates()
        assert len(emitted) == 1
        info: UpdateInfo = emitted[0]
        assert info.version == "99.0.0"
        assert info.release_notes == "Some fixes."
        assert info.download_urls == ["https://example.com/Buzz-99.0.0.exe"]

    def test_does_not_emit_when_version_is_current(self, settings: Settings):
        # Server reports exactly the running VERSION -> no signal.
        checker = self._make_checker(settings, {**VERSION_INFO, "version": VERSION})
        emitted = []
        checker.update_available.connect(emitted.append)
        with patch.object(platform, "system", return_value="Windows"):
            checker.check_for_updates()
        assert emitted == []

    def test_skips_network_call_on_linux(self, settings: Settings):
        checker = self._make_checker(settings, VERSION_INFO)
        emitted = []
        checker.update_available.connect(emitted.append)
        with patch.object(platform, "system", return_value="Linux"):
            checker.check_for_updates()
        assert emitted == []

    def test_stores_last_check_date_after_reply(self, settings: Settings):
        checker = self._make_checker(settings, {**VERSION_INFO, "version": VERSION})
        with patch.object(platform, "system", return_value="Windows"):
            checker.check_for_updates()
        stamp = settings.value(Settings.Key.LAST_UPDATE_CHECK, "")
        assert stamp != ""
        datetime.fromisoformat(stamp)  # must parse as an ISO-8601 timestamp

    def test_stores_available_version_when_update_found(self, settings: Settings):
        checker = self._make_checker(settings, VERSION_INFO)
        with patch.object(platform, "system", return_value="Windows"), patch(
            "buzz.update_checker.VERSION", "1.0.0"
        ):
            checker.check_for_updates()
        assert settings.value(Settings.Key.UPDATE_AVAILABLE_VERSION, "") == "99.0.0"

    def test_clears_available_version_when_up_to_date(self, settings: Settings):
        # A previously stored pending version is cleared once we are current.
        settings.set_value(Settings.Key.UPDATE_AVAILABLE_VERSION, "99.0.0")
        checker = self._make_checker(settings, {**VERSION_INFO, "version": VERSION})
        with patch.object(platform, "system", return_value="Windows"):
            checker.check_for_updates()
        assert settings.value(Settings.Key.UPDATE_AVAILABLE_VERSION, "") == ""

View file

@ -0,0 +1,153 @@
import pytest
from pytestqt.qtbot import QtBot
from buzz.transcriber.transcriber import TranscriptionOptions
from buzz.widgets.transcriber.advanced_settings_dialog import AdvancedSettingsDialog
class TestAdvancedSettingsDialogSilenceThreshold:
    """Silence-threshold controls in the advanced settings dialog."""

    @staticmethod
    def _recording_dialog(qtbot: QtBot, options: TranscriptionOptions) -> AdvancedSettingsDialog:
        # Dialog opened with the recording-specific settings visible.
        dialog = AdvancedSettingsDialog(
            transcription_options=options, show_recording_settings=True
        )
        qtbot.add_widget(dialog)
        return dialog

    def test_silence_threshold_spinbox_hidden_by_default(self, qtbot: QtBot):
        """No silence-threshold spinbox unless recording settings are requested."""
        dialog = AdvancedSettingsDialog(transcription_options=TranscriptionOptions())
        qtbot.add_widget(dialog)
        assert not hasattr(dialog, "silence_threshold_spin_box")

    def test_silence_threshold_spinbox_shown_when_recording_settings(self, qtbot: QtBot):
        """Spinbox exists when show_recording_settings=True."""
        dialog = self._recording_dialog(qtbot, TranscriptionOptions())
        assert hasattr(dialog, "silence_threshold_spin_box")
        assert dialog.silence_threshold_spin_box is not None

    def test_silence_threshold_spinbox_initial_value(self, qtbot: QtBot):
        """Spinbox starts at the silence_threshold carried by the options."""
        dialog = self._recording_dialog(qtbot, TranscriptionOptions(silence_threshold=0.0075))
        assert dialog.silence_threshold_spin_box.value() == pytest.approx(0.0075)

    def test_silence_threshold_change_updates_options(self, qtbot: QtBot):
        """Editing the spinbox writes back into transcription_options."""
        dialog = self._recording_dialog(qtbot, TranscriptionOptions(silence_threshold=0.0025))
        dialog.silence_threshold_spin_box.setValue(0.005)
        assert dialog.transcription_options.silence_threshold == pytest.approx(0.005)

    def test_silence_threshold_change_emits_signal(self, qtbot: QtBot):
        """Editing the spinbox fires transcription_options_changed exactly once."""
        dialog = self._recording_dialog(qtbot, TranscriptionOptions(silence_threshold=0.0025))
        changes = []
        dialog.transcription_options_changed.connect(changes.append)
        dialog.silence_threshold_spin_box.setValue(0.005)
        assert len(changes) == 1
        assert changes[0].silence_threshold == pytest.approx(0.005)
class TestAdvancedSettingsDialogLineSeparator:
    """Line-separator controls in the advanced settings dialog."""

    @staticmethod
    def _recording_dialog(qtbot: QtBot, options: TranscriptionOptions) -> AdvancedSettingsDialog:
        # Dialog opened with the recording-specific settings visible.
        dialog = AdvancedSettingsDialog(
            transcription_options=options, show_recording_settings=True
        )
        qtbot.add_widget(dialog)
        return dialog

    def test_line_separator_shown_when_recording_settings(self, qtbot: QtBot):
        dialog = self._recording_dialog(qtbot, TranscriptionOptions())
        assert hasattr(dialog, "line_separator_line_edit")
        assert dialog.line_separator_line_edit is not None

    def test_line_separator_hidden_by_default(self, qtbot: QtBot):
        dialog = AdvancedSettingsDialog(transcription_options=TranscriptionOptions())
        qtbot.add_widget(dialog)
        assert not hasattr(dialog, "line_separator_line_edit")

    def test_line_separator_initial_value_displayed_as_escape(self, qtbot: QtBot):
        # A real newline pair is rendered as the literal escape sequence.
        dialog = self._recording_dialog(qtbot, TranscriptionOptions(line_separator="\n\n"))
        assert dialog.line_separator_line_edit.text() == r"\n\n"

    def test_line_separator_change_updates_options(self, qtbot: QtBot):
        dialog = self._recording_dialog(qtbot, TranscriptionOptions(line_separator="\n\n"))
        dialog.line_separator_line_edit.setText(r"\n")
        assert dialog.transcription_options.line_separator == "\n"

    def test_line_separator_change_emits_signal(self, qtbot: QtBot):
        dialog = self._recording_dialog(qtbot, TranscriptionOptions(line_separator="\n\n"))
        changes = []
        dialog.transcription_options_changed.connect(changes.append)
        dialog.line_separator_line_edit.setText(r"\n")
        assert len(changes) == 1
        assert changes[0].line_separator == "\n"

    def test_line_separator_invalid_escape_does_not_crash(self, qtbot: QtBot):
        dialog = self._recording_dialog(qtbot, TranscriptionOptions(line_separator="\n\n"))
        dialog.line_separator_line_edit.setText("\\")
        # A lone backslash is rejected; the last valid separator is kept.
        assert dialog.transcription_options.line_separator == "\n\n"

    def test_line_separator_tab_character(self, qtbot: QtBot):
        dialog = self._recording_dialog(qtbot, TranscriptionOptions())
        dialog.line_separator_line_edit.setText(r"\t")
        assert dialog.transcription_options.line_separator == "\t"

    def test_line_separator_plain_text(self, qtbot: QtBot):
        # Separators without escapes are taken verbatim.
        dialog = self._recording_dialog(qtbot, TranscriptionOptions())
        dialog.line_separator_line_edit.setText(" | ")
        assert dialog.transcription_options.line_separator == " | "
class TestTranscriptionOptionsLineSeparator:
    """Defaults and overrides for TranscriptionOptions.line_separator."""

    def test_default_line_separator(self):
        assert TranscriptionOptions().line_separator == "\n\n"

    def test_custom_line_separator(self):
        assert TranscriptionOptions(line_separator="\n").line_separator == "\n"
class TestTranscriptionOptionsSilenceThreshold:
    """Defaults and overrides for TranscriptionOptions.silence_threshold."""

    def test_default_silence_threshold(self):
        assert TranscriptionOptions().silence_threshold == pytest.approx(0.0025)

    def test_custom_silence_threshold(self):
        assert TranscriptionOptions(silence_threshold=0.01).silence_threshold == pytest.approx(0.01)

View file

@ -0,0 +1,56 @@
import pytest
from pytestqt.qtbot import QtBot
from buzz.widgets.audio_meter_widget import AudioMeterWidget
class TestAudioMeterWidget:
    """State handling of the audio level meter widget."""

    @staticmethod
    def _meter(qtbot: QtBot) -> AudioMeterWidget:
        # Fresh widget registered with qtbot for automatic teardown.
        widget = AudioMeterWidget()
        qtbot.add_widget(widget)
        return widget

    def test_initial_amplitude_is_zero(self, qtbot: QtBot):
        assert self._meter(qtbot).current_amplitude == 0.0

    def test_initial_average_amplitude_is_zero(self, qtbot: QtBot):
        assert self._meter(qtbot).average_amplitude == 0.0

    def test_update_amplitude(self, qtbot: QtBot):
        widget = self._meter(qtbot)
        widget.update_amplitude(0.5)
        assert widget.current_amplitude == pytest.approx(0.5)

    def test_update_amplitude_smoothing(self, qtbot: QtBot):
        """A drop in amplitude decays via SMOOTHING_FACTOR instead of snapping."""
        widget = self._meter(qtbot)
        widget.update_amplitude(1.0)
        widget.update_amplitude(0.0)
        assert widget.current_amplitude == pytest.approx(1.0 * widget.SMOOTHING_FACTOR)

    def test_update_average_amplitude(self, qtbot: QtBot):
        widget = self._meter(qtbot)
        widget.update_average_amplitude(0.0123)
        assert widget.average_amplitude == pytest.approx(0.0123)

    def test_reset_amplitude_clears_current(self, qtbot: QtBot):
        widget = self._meter(qtbot)
        widget.update_amplitude(0.8)
        widget.reset_amplitude()
        assert widget.current_amplitude == 0.0

    def test_reset_amplitude_clears_average(self, qtbot: QtBot):
        widget = self._meter(qtbot)
        widget.update_average_amplitude(0.05)
        widget.reset_amplitude()
        assert widget.average_amplitude == 0.0

    def test_fixed_height(self, qtbot: QtBot):
        assert self._meter(qtbot).height() == 56

View file

@ -1,8 +1,22 @@
import gc
import logging
import pytest
from unittest.mock import patch
from buzz.settings.settings import Settings
@pytest.fixture(autouse=True)
def mock_get_password():
    """Stub keyring access so widget tests never touch the real credential store."""
    patcher = patch(
        "buzz.widgets.recording_transcriber_widget.get_password", return_value=None
    )
    patcher.start()
    try:
        yield
    finally:
        patcher.stop()
@pytest.fixture(autouse=True)
def force_gc_between_tests():
    # Run a full garbage collection after every test so lingering Qt objects
    # are finalized before the next test creates new widgets.
    yield
    gc.collect()
@pytest.fixture(scope="package")
def reset_settings():
settings = Settings()

View file

@ -0,0 +1,177 @@
import json
from unittest.mock import MagicMock, patch
import pytest
from PyQt6.QtCore import Qt, QEvent, QPoint
from PyQt6.QtGui import QKeyEvent
from PyQt6.QtNetwork import QNetworkReply, QNetworkAccessManager
from PyQt6.QtWidgets import QListWidgetItem
from pytestqt.qtbot import QtBot
from buzz.widgets.transcriber.hugging_face_search_line_edit import HuggingFaceSearchLineEdit
@pytest.fixture
def widget(qtbot: QtBot):
    """Search line edit backed by a fully mocked QNetworkAccessManager."""
    manager = MagicMock(spec=QNetworkAccessManager)
    manager.finished = MagicMock()
    manager.finished.connect = MagicMock()
    line_edit = HuggingFaceSearchLineEdit(network_access_manager=manager)
    qtbot.add_widget(line_edit)
    # popup.show() is stubbed out: on headless Wayland CI a popup window
    # without a transient parent triggers a fatal protocol error.
    line_edit.popup.show = MagicMock()
    return line_edit
class TestHuggingFaceSearchLineEdit:
    """Behaviour of the Hugging Face model search box and its suggestion popup."""

    def test_initial_state(self, widget):
        """Starts empty but with a non-empty placeholder prompt."""
        assert widget.text() == ""
        assert widget.placeholderText() != ""

    def test_default_value_set(self, qtbot: QtBot):
        """A default_value passed to the constructor pre-fills the field."""
        mock_manager = MagicMock(spec=QNetworkAccessManager)
        mock_manager.finished = MagicMock()
        mock_manager.finished.connect = MagicMock()
        w = HuggingFaceSearchLineEdit(default_value="openai/whisper-tiny", network_access_manager=mock_manager)
        qtbot.add_widget(w)
        assert w.text() == "openai/whisper-tiny"

    def test_on_text_edited_emits_model_selected(self, widget, qtbot: QtBot):
        """Edited text is re-emitted through the model_selected signal."""
        spy = MagicMock()
        widget.model_selected.connect(spy)
        widget.on_text_edited("some/model")
        spy.assert_called_once_with("some/model")

    def test_fetch_models_skips_short_text(self, widget):
        """Queries that are too short trigger no network request."""
        widget.setText("ab")
        result = widget.fetch_models()
        assert result is None

    def test_fetch_models_makes_request_for_long_text(self, widget):
        """A sufficiently long query issues one GET and returns its reply."""
        widget.setText("whisper-tiny")
        mock_reply = MagicMock()
        widget.network_manager.get = MagicMock(return_value=mock_reply)
        result = widget.fetch_models()
        widget.network_manager.get.assert_called_once()
        assert result == mock_reply

    def test_fetch_models_url_contains_search_text(self, widget):
        """The requested URL embeds the current search text."""
        widget.setText("whisper")
        widget.network_manager.get = MagicMock(return_value=MagicMock())
        widget.fetch_models()
        # First positional argument of get() is the QNetworkRequest.
        call_args = widget.network_manager.get.call_args[0][0]
        assert "whisper" in call_args.url().toString()

    def test_on_request_response_network_error_does_not_populate_popup(self, widget):
        """A network error leaves the suggestion popup empty."""
        mock_reply = MagicMock(spec=QNetworkReply)
        mock_reply.error.return_value = QNetworkReply.NetworkError.ConnectionRefusedError
        widget.on_request_response(mock_reply)
        assert widget.popup.count() == 0

    def test_on_request_response_populates_popup(self, widget):
        """A successful JSON reply fills the popup with model ids, in order."""
        mock_reply = MagicMock(spec=QNetworkReply)
        mock_reply.error.return_value = QNetworkReply.NetworkError.NoError
        models = [{"id": "openai/whisper-tiny"}, {"id": "openai/whisper-base"}]
        mock_reply.readAll.return_value.data.return_value = json.dumps(models).encode()
        widget.on_request_response(mock_reply)
        assert widget.popup.count() == 2
        assert widget.popup.item(0).text() == "openai/whisper-tiny"
        assert widget.popup.item(1).text() == "openai/whisper-base"

    def test_on_request_response_empty_models_does_not_show_popup(self, widget):
        """An empty result list neither populates nor shows the popup."""
        mock_reply = MagicMock(spec=QNetworkReply)
        mock_reply.error.return_value = QNetworkReply.NetworkError.NoError
        mock_reply.readAll.return_value.data.return_value = json.dumps([]).encode()
        widget.on_request_response(mock_reply)
        assert widget.popup.count() == 0
        widget.popup.show.assert_not_called()

    def test_on_request_response_item_has_user_role_data(self, widget):
        """Each popup item stores the model id under Qt.ItemDataRole.UserRole."""
        mock_reply = MagicMock(spec=QNetworkReply)
        mock_reply.error.return_value = QNetworkReply.NetworkError.NoError
        models = [{"id": "facebook/mms-1b-all"}]
        mock_reply.readAll.return_value.data.return_value = json.dumps(models).encode()
        widget.on_request_response(mock_reply)
        item = widget.popup.item(0)
        assert item.data(Qt.ItemDataRole.UserRole) == "facebook/mms-1b-all"

    def test_on_select_item_emits_model_selected(self, widget, qtbot: QtBot):
        """Selecting an item emits its id and copies it into the line edit."""
        item = QListWidgetItem("openai/whisper-tiny")
        item.setData(Qt.ItemDataRole.UserRole, "openai/whisper-tiny")
        widget.popup.addItem(item)
        widget.popup.setCurrentItem(item)
        spy = MagicMock()
        widget.model_selected.connect(spy)
        widget.on_select_item()
        spy.assert_called_with("openai/whisper-tiny")
        assert widget.text() == "openai/whisper-tiny"

    def test_on_select_item_hides_popup(self, widget):
        """Selecting an item also dismisses the popup."""
        item = QListWidgetItem("openai/whisper-tiny")
        item.setData(Qt.ItemDataRole.UserRole, "openai/whisper-tiny")
        widget.popup.addItem(item)
        widget.popup.setCurrentItem(item)
        with patch.object(widget.popup, 'hide') as mock_hide:
            widget.on_select_item()
            mock_hide.assert_called_once()

    def test_on_popup_selected_stops_timer(self, widget):
        """A popup selection stops the widget's pending timer."""
        widget.timer.start()
        assert widget.timer.isActive()
        widget.on_popup_selected()
        assert not widget.timer.isActive()

    def test_event_filter_ignores_non_popup_target(self, widget):
        """Events for objects other than the popup pass through unhandled."""
        other = MagicMock()
        event = MagicMock()
        assert widget.eventFilter(other, event) is False

    def test_event_filter_mouse_press_hides_popup(self, widget):
        """A mouse press routed through the filter hides the popup and eats the event."""
        event = MagicMock()
        event.type.return_value = QEvent.Type.MouseButtonPress
        with patch.object(widget.popup, 'hide') as mock_hide:
            result = widget.eventFilter(widget.popup, event)
            assert result is True
            mock_hide.assert_called_once()

    def test_event_filter_escape_hides_popup(self, widget, qtbot: QtBot):
        """Escape dismisses the popup and consumes the key event."""
        event = QKeyEvent(QEvent.Type.KeyPress, Qt.Key.Key_Escape, Qt.KeyboardModifier.NoModifier)
        with patch.object(widget.popup, 'hide') as mock_hide:
            result = widget.eventFilter(widget.popup, event)
            assert result is True
            mock_hide.assert_called_once()

    def test_event_filter_enter_selects_item(self, widget, qtbot: QtBot):
        """Return confirms the currently highlighted popup item."""
        item = QListWidgetItem("openai/whisper-tiny")
        item.setData(Qt.ItemDataRole.UserRole, "openai/whisper-tiny")
        widget.popup.addItem(item)
        widget.popup.setCurrentItem(item)
        spy = MagicMock()
        widget.model_selected.connect(spy)
        event = QKeyEvent(QEvent.Type.KeyPress, Qt.Key.Key_Return, Qt.KeyboardModifier.NoModifier)
        result = widget.eventFilter(widget.popup, event)
        assert result is True
        spy.assert_called_with("openai/whisper-tiny")

    def test_event_filter_enter_no_item_returns_true(self, widget, qtbot: QtBot):
        """Return with no current item is still consumed without crashing."""
        event = QKeyEvent(QEvent.Type.KeyPress, Qt.Key.Key_Return, Qt.KeyboardModifier.NoModifier)
        result = widget.eventFilter(widget.popup, event)
        assert result is True

    def test_event_filter_navigation_keys_return_false(self, widget):
        """Navigation keys are left to the popup's own default handling."""
        for key in [Qt.Key.Key_Up, Qt.Key.Key_Down, Qt.Key.Key_Home,
                    Qt.Key.Key_End, Qt.Key.Key_PageUp, Qt.Key.Key_PageDown]:
            event = QKeyEvent(QEvent.Type.KeyPress, key, Qt.KeyboardModifier.NoModifier)
            assert widget.eventFilter(widget.popup, event) is False

    def test_event_filter_other_key_hides_popup(self, widget):
        """Any other typing key hides the popup."""
        event = QKeyEvent(QEvent.Type.KeyPress, Qt.Key.Key_A, Qt.KeyboardModifier.NoModifier)
        with patch.object(widget.popup, 'hide') as mock_hide:
            widget.eventFilter(widget.popup, event)
            mock_hide.assert_called_once()

View file

@ -1,5 +1,6 @@
import logging
import os
import tempfile
from typing import List
from unittest.mock import patch, Mock
@ -293,6 +294,67 @@ class TestMainWindow:
assert window.toolbar.open_transcript_action.isEnabled() is False
window.close()
def test_import_folder_opens_file_transcriber_with_supported_files(
    self, qtbot, transcription_service
):
    """Folder import recursively collects supported media files and skips the rest."""
    window = MainWindow(transcription_service)
    qtbot.add_widget(window)
    with tempfile.TemporaryDirectory() as folder:
        media_files = ["audio.mp3", "video.mp4", "clip.wav"]
        other_files = ["document.txt", "image.png"]
        nested_dir = os.path.join(folder, "sub")
        os.makedirs(nested_dir)
        for name in media_files + other_files:
            open(os.path.join(folder, name), "w").close()
        # A supported file in a subdirectory must also be picked up.
        open(os.path.join(nested_dir, "nested.flac"), "w").close()
        with patch("PyQt6.QtWidgets.QFileDialog.getExistingDirectory") as dir_dialog, \
                patch.object(window, "open_file_transcriber_widget") as open_widget:
            dir_dialog.return_value = folder
            window.on_import_folder_action_triggered()
            collected = open_widget.call_args[0][0]
            assert {os.path.basename(p) for p in collected} == {
                "audio.mp3", "video.mp4", "clip.wav", "nested.flac",
            }
    window.close()
def test_import_folder_does_nothing_when_cancelled(
    self, qtbot, transcription_service
):
    """Cancelling the directory dialog must not open the file transcriber."""
    window = MainWindow(transcription_service)
    qtbot.add_widget(window)
    with patch("PyQt6.QtWidgets.QFileDialog.getExistingDirectory") as dir_dialog, \
            patch.object(window, "open_file_transcriber_widget") as open_widget:
        dir_dialog.return_value = ""  # empty string == dialog cancelled
        window.on_import_folder_action_triggered()
        open_widget.assert_not_called()
    window.close()
def test_import_folder_does_nothing_when_no_supported_files(
    self, qtbot, transcription_service
):
    """A folder holding only unsupported files must not open the transcriber."""
    window = MainWindow(transcription_service)
    qtbot.add_widget(window)
    with tempfile.TemporaryDirectory() as folder:
        for name in ("readme.txt", "image.jpg"):
            open(os.path.join(folder, name), "w").close()
        with patch("PyQt6.QtWidgets.QFileDialog.getExistingDirectory") as dir_dialog, \
                patch.object(window, "open_file_transcriber_widget") as open_widget:
            dir_dialog.return_value = folder
            window.on_import_folder_action_triggered()
            open_widget.assert_not_called()
    window.close()
@staticmethod
def _import_file_and_start_transcription(
window: MainWindow, long_audio: bool = False

View file

@ -1,3 +1,5 @@
from unittest.mock import patch, Mock
from PyQt6.QtCore import QSettings
from buzz.widgets.menu_bar import MenuBar
@ -6,6 +8,18 @@ from buzz.widgets.preferences_dialog.preferences_dialog import PreferencesDialog
class TestMenuBar:
def test_import_folder_action_emits_signal(self, qtbot, shortcuts):
    """Triggering the import-folder menu action fires its signal exactly once."""
    menu_bar = MenuBar(
        shortcuts=shortcuts, preferences=Preferences.load(QSettings())
    )
    qtbot.add_widget(menu_bar)
    listener = Mock()
    menu_bar.import_folder_action_triggered.connect(listener)
    menu_bar.import_folder_action.trigger()
    listener.assert_called_once()
def test_open_preferences_dialog(self, qtbot, shortcuts):
menu_bar = MenuBar(
shortcuts=shortcuts, preferences=Preferences.load(QSettings())

View file

@ -3,7 +3,7 @@ from unittest.mock import Mock
from PyQt6.QtWidgets import QCheckBox, QLineEdit
from buzz.model_loader import TranscriptionModel
from buzz.transcriber.transcriber import Task, DEFAULT_WHISPER_TEMPERATURE
from buzz.transcriber.transcriber import Task
from buzz.widgets.preferences_dialog.folder_watch_preferences_widget import (
FolderWatchPreferencesWidget,
)
@ -28,7 +28,6 @@ class TestFolderWatchPreferencesWidget:
model=TranscriptionModel.default(),
word_level_timings=False,
extract_speech=False,
temperature=DEFAULT_WHISPER_TEMPERATURE,
initial_prompt="",
enable_llm_translation=False,
llm_model="",
@ -48,8 +47,12 @@ class TestFolderWatchPreferencesWidget:
assert not checkbox.isChecked()
assert input_folder_line_edit.text() == ""
assert output_folder_line_edit.text() == ""
assert not input_folder_line_edit.isEnabled()
assert not output_folder_line_edit.isEnabled()
checkbox.setChecked(True)
assert input_folder_line_edit.isEnabled()
assert output_folder_line_edit.isEnabled()
input_folder_line_edit.setText("test/input/folder")
output_folder_line_edit.setText("test/output/folder")
@ -57,3 +60,41 @@ class TestFolderWatchPreferencesWidget:
assert last_config_changed_call[0][0].enabled
assert last_config_changed_call[0][0].input_directory == "test/input/folder"
assert last_config_changed_call[0][0].output_directory == "test/output/folder"
def test_delete_processed_files_checkbox(self, qtbot):
    """Toggling the delete-processed-files checkbox updates the emitted config."""
    transcription_prefs = FileTranscriptionPreferences(
        language=None,
        task=Task.TRANSCRIBE,
        model=TranscriptionModel.default(),
        word_level_timings=False,
        extract_speech=False,
        initial_prompt="",
        enable_llm_translation=False,
        llm_model="",
        llm_prompt="",
        output_formats=set(),
    )
    widget = FolderWatchPreferencesWidget(
        config=FolderWatchPreferences(
            enabled=False,
            input_directory="",
            output_directory="",
            file_transcription_options=transcription_prefs,
        ),
    )
    on_config_changed = Mock()
    widget.config_changed.connect(on_config_changed)
    qtbot.add_widget(widget)
    delete_checkbox = widget.findChild(QCheckBox, "DeleteProcessedFilesCheckbox")
    assert delete_checkbox is not None
    assert not delete_checkbox.isChecked()
    delete_checkbox.setChecked(True)
    assert on_config_changed.call_args_list[-1][0][0].delete_processed_files is True
    delete_checkbox.setChecked(False)
    assert on_config_changed.call_args_list[-1][0][0].delete_processed_files is False

Some files were not shown because too many files have changed in this diff. Show more