Fix Chinese word level timestamps (#1355)

This commit is contained in:
Raivis Dejus 2026-01-16 14:31:48 +02:00 committed by GitHub
commit 97b1619902
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
12 changed files with 363 additions and 82 deletions

1
CLAUDE.md Normal file
View file

@ -0,0 +1 @@
- Use uv to run tests and any scripts

View file

@ -1,5 +1,5 @@
# Change also in pyproject.toml and buzz/__version__.py
version := 1.4.2
version := 1.5.0
mac_app_path := ./dist/Buzz.app
mac_zip_path := ./dist/Buzz-${version}-mac.zip

View file

@ -1 +1 @@
VERSION = "1.4.2"
VERSION = "1.5.0"

View file

@ -49,5 +49,4 @@ def close_app_db():
return
if db.isOpen():
logging.debug("Closing database connection: %s", db.connectionName())
db.close()

View file

@ -180,79 +180,160 @@ class WhisperCpp:
# Extract word-level timestamps from tokens array
# Combine tokens into words using similar logic as whisper_cpp.py
transcription = result.get("transcription", [])
# Languages that don't use spaces between words
# For these, each token is treated as a separate word
non_space_languages = {"zh", "ja", "th", "lo", "km", "my"}
is_non_space_language = language in non_space_languages
for segment_data in transcription:
tokens = segment_data.get("tokens", [])
# Accumulate tokens into words
word_buffer = b""
word_start = 0
word_end = 0
def append_word(buffer: bytes, start: int, end: int):
"""Try to decode and append a word segment, handling multi-byte UTF-8"""
if not buffer:
return True
# Try to decode as UTF-8
# https://github.com/ggerganov/whisper.cpp/issues/1798
try:
text = buffer.decode("utf-8").strip()
if text:
segments.append(
Segment(
start=start,
end=end,
text=text,
translation=""
if is_non_space_language:
# For languages without spaces (Chinese, Japanese, etc.),
# each complete UTF-8 character is treated as a separate word.
# Some characters may be split across multiple tokens as raw bytes.
char_buffer = b""
char_start = 0
char_end = 0
def flush_complete_chars(buffer: bytes, start: int, end: int):
"""Extract and output all complete UTF-8 characters from buffer.
Returns any remaining incomplete bytes."""
nonlocal segments
remaining = buffer
pos = 0
while pos < len(remaining):
# Try to decode one character at a time
for char_len in range(1, min(5, len(remaining) - pos + 1)):
try:
char = remaining[pos:pos + char_len].decode("utf-8")
# Successfully decoded a character
if char.strip():
segments.append(
Segment(
start=start,
end=end,
text=char,
translation=""
)
)
pos += char_len
break
except UnicodeDecodeError:
if char_len == 4 or pos + char_len >= len(remaining):
# Incomplete character at end - return as remaining
return remaining[pos:]
else:
# Couldn't decode, might be incomplete at end
return remaining[pos:]
return b""
for token_data in tokens:
token_text = token_data.get("text", "")
# Skip special tokens like [_TT_], [_BEG_]
if token_text.startswith("[_"):
continue
if not token_text:
continue
token_start = int(token_data.get("offsets", {}).get("from", 0))
token_end = int(token_data.get("offsets", {}).get("to", 0))
# Convert latin-1 string back to original bytes
token_bytes = token_text.encode("latin-1")
if not char_buffer:
char_start = token_start
char_buffer += token_bytes
char_end = token_end
# Try to flush complete characters
char_buffer = flush_complete_chars(char_buffer, char_start, char_end)
# If buffer was fully flushed, reset start time for next char
if not char_buffer:
char_start = token_end
# Flush any remaining buffer at end of segment
if char_buffer:
flush_complete_chars(char_buffer, char_start, char_end)
else:
# For space-separated languages, accumulate tokens into words
word_buffer = b""
word_start = 0
word_end = 0
def append_word(buffer: bytes, start: int, end: int):
"""Try to decode and append a word segment, handling multi-byte UTF-8"""
if not buffer:
return True
# Try to decode as UTF-8
# https://github.com/ggerganov/whisper.cpp/issues/1798
try:
text = buffer.decode("utf-8").strip()
if text:
segments.append(
Segment(
start=start,
end=end,
text=text,
translation=""
)
)
)
return True
except UnicodeDecodeError:
# Multi-byte character is split, continue accumulating
return False
for token_data in tokens:
# Token text is read as latin-1, need to convert to bytes to get original data
token_text = token_data.get("text", "")
# Skip special tokens like [_TT_], [_BEG_]
if token_text.startswith("[_"):
continue
if not token_text:
continue
token_start = int(token_data.get("offsets", {}).get("from", 0))
token_end = int(token_data.get("offsets", {}).get("to", 0))
# Convert latin-1 string back to original bytes
# (latin-1 preserves byte values as code points)
token_bytes = token_text.encode("latin-1")
# Check if token starts with space - indicates new word
if token_bytes.startswith(b" ") and word_buffer:
# Save previous word
append_word(word_buffer, word_start, word_end)
# Start new word
word_buffer = token_bytes
word_start = token_start
word_end = token_end
elif token_bytes.startswith(b", "):
# Handle comma - save word with comma, then start new word
word_buffer += b","
append_word(word_buffer, word_start, word_end)
word_buffer = token_bytes.lstrip(b",")
word_start = token_start
word_end = token_end
else:
# Accumulate token into current word
if not word_buffer:
return True
except UnicodeDecodeError:
# Multi-byte character is split, continue accumulating
return False
for token_data in tokens:
# Token text is read as latin-1, need to convert to bytes to get original data
token_text = token_data.get("text", "")
# Skip special tokens like [_TT_], [_BEG_]
if token_text.startswith("[_"):
continue
if not token_text:
continue
token_start = int(token_data.get("offsets", {}).get("from", 0))
token_end = int(token_data.get("offsets", {}).get("to", 0))
# Convert latin-1 string back to original bytes
# (latin-1 preserves byte values as code points)
token_bytes = token_text.encode("latin-1")
# Check if token starts with space - indicates new word
if token_bytes.startswith(b" ") and word_buffer:
# Save previous word
append_word(word_buffer, word_start, word_end)
# Start new word
word_buffer = token_bytes
word_start = token_start
word_buffer += token_bytes
word_end = token_end
# Add the last word
append_word(word_buffer, word_start, word_end)
word_end = token_end
elif token_bytes.startswith(b", "):
# Handle comma - save word with comma, then start new word
word_buffer += b","
append_word(word_buffer, word_start, word_end)
word_buffer = token_bytes.lstrip(b",")
word_start = token_start
word_end = token_end
else:
# Accumulate token into current word
if not word_buffer:
word_start = token_start
word_buffer += token_bytes
word_end = token_end
# Add the last word
append_word(word_buffer, word_start, word_end)
else:
# Use segment-level timestamps
transcription = result.get("transcription", [])

View file

@ -417,8 +417,6 @@ class MainWindow(QMainWindow):
self.save_geometry()
def closeEvent(self, event: QtGui.QCloseEvent) -> None:
logging.debug("Starting MainWindow closeEvent")
self.save_geometry()
self.settings.settings.sync()

View file

@ -2,7 +2,7 @@ from typing import Optional
import os
from PyQt6.QtCore import pyqtSignal, Qt
from PyQt6.QtWidgets import QComboBox, QWidget
from PyQt6.QtWidgets import QComboBox, QWidget, QFrame
from PyQt6.QtGui import QStandardItem, QStandardItemModel
from buzz.locale import _
@ -51,3 +51,9 @@ class LanguagesComboBox(QComboBox):
def on_index_changed(self, index: int):
self.languageChanged.emit(self.languages[index][0])
def showPopup(self):
super().showPopup()
popup = self.findChild(QFrame)
if popup and popup.height() > 400:
popup.setFixedHeight(400)

View file

@ -36,6 +36,9 @@ from buzz.widgets.preferences_dialog.models.file_transcription_preferences impor
SENTENCE_END = re.compile(r'.*[.!?。!?]')
# Languages that don't use spaces between words
NON_SPACE_LANGUAGES = {"zh", "ja", "th", "lo", "km", "my"}
class TranscriptionWorker(QObject):
finished = pyqtSignal(list)
@ -51,16 +54,23 @@ class TranscriptionWorker(QObject):
transcription_id=self.transcription.id_as_uuid
)
# Check if the language uses spaces between words
language = self.transcription.language or ""
is_non_space_language = language in NON_SPACE_LANGUAGES
# For non-space languages, don't add spaces between words
separator = "" if is_non_space_language else " "
segments = []
words = []
text = ""
for buzz_segment in buzz_segments:
words.append({
'word': buzz_segment.text + " ",
'word': buzz_segment.text + separator,
'start': buzz_segment.start_time / 100,
'end': buzz_segment.end_time / 100,
})
text += buzz_segment.text + " "
text += buzz_segment.text + separator
if SENTENCE_END.match(buzz_segment.text):
segments.append({
@ -70,6 +80,13 @@ class TranscriptionWorker(QObject):
words = []
text = ""
# Add any remaining words that weren't terminated by sentence-ending punctuation
if words:
segments.append({
'text': text,
'words': words
})
return {
'language': self.transcription.language,
'segments': segments

View file

@ -8,4 +8,6 @@ When transcript of some audio or video file is generated you can edit it and exp
The transcription view screen has an option to resize the transcripts. Click on the "Resize" button to see available options. Transcripts that have been generated **with word-level timings** enabled can be combined into subtitles with different options, like the maximum length of a subtitle and whether subtitles should be split on punctuation. Transcripts that have been generated **without word-level timings** enabled can only be recombined by specifying the desired maximum length of a subtitle.
If audio file is still present on the system word-level timing merge will also analyze the audio for silences to improve subtitle accuracy. Subtitle generation from transcripts with word-level timings is available since version 1.3.0.
If audio file is still present on the system word-level timing merge will also analyze the audio for silences to improve subtitle accuracy. Subtitle generation from transcripts with word-level timings is available since version 1.3.0.
The resize tool also has an option to extend end time of segments if you want the subtitles to be on the screen for longer. You can specify the amount of time in seconds to extend each subtitle segment. Buzz will add this amount of time to the end of each subtitle segment making sure that the end of a segment does not go over start of the next segment. This feature is available since 1.5.0.

View file

@ -1,7 +1,7 @@
[project]
name = "buzz-captions"
# Change also in Makefile and buzz/__version__.py
version = "1.4.2"
version = "1.5.0"
description = ""
authors = [{ name = "Chidi Williams", email = "williamschidi1@gmail.com" }]
requires-python = ">=3.12,<3.13"

View file

@ -1,3 +1,6 @@
from unittest.mock import patch, MagicMock, mock_open
import json
from buzz.model_loader import TranscriptionModel, ModelType, WhisperModelSize
from buzz.transcriber.transcriber import (
TranscriptionOptions,
@ -59,4 +62,179 @@ class TestWhisperCpp:
assert "Mani" in segments[0].text
assert "uzstrau" or "ustrau" in segments[1].text
assert "laikabstāk" in segments[2].text
assert "laikabstāk" in segments[2].text
def test_transcribe_chinese_multibyte_word_level_timestamps(self):
    """Test that Chinese characters split across multiple tokens are properly combined.

    The Chinese character 闻 (U+95FB) is encoded as the UTF-8 bytes E9 97 BB.
    whisper.cpp may split this into separate tokens, e.g.:
    - Token 1: bytes E9 97 (incomplete)
    - Token 2: byte BB (completes the character)
    The transcriber should combine these bytes and output the character as a
    single segment.
    """
    # Mock JSON data simulating whisper.cpp output with split Chinese characters.
    # The character 闻 is split into two tokens: \xe9\x97 and \xbb.
    # The character 新 is a complete token. Together they form 新闻 (news).
    mock_json_data = {
        "transcription": [
            {
                "offsets": {"from": 0, "to": 5000},
                "text": "",  # Not used in word-level processing
                "tokens": [
                    {
                        "text": "[_BEG_]",
                        "offsets": {"from": 0, "to": 0},
                    },
                    {
                        # 新 - complete character (UTF-8: E6 96 B0)
                        # When read as latin-1: \xe6\x96\xb0
                        "text": "\xe6\x96\xb0",
                        "offsets": {"from": 100, "to": 200},
                    },
                    {
                        # First two bytes of 闻 (UTF-8: E9 97 BB)
                        # When read as latin-1: \xe9\x97
                        "text": "\xe9\x97",
                        "offsets": {"from": 200, "to": 300},
                    },
                    {
                        # Last byte of 闻
                        # When read as latin-1: \xbb
                        "text": "\xbb",
                        "offsets": {"from": 300, "to": 400},
                    },
                    {
                        "text": "[_TT_500]",
                        "offsets": {"from": 500, "to": 500},
                    },
                ],
            }
        ]
    }
    # Convert to a JSON string using latin-1 compatible encoding.
    # We write bytes directly since the real file is read with latin-1.
    json_bytes = json.dumps(mock_json_data, ensure_ascii=False).encode("latin-1")
    transcription_options = TranscriptionOptions(
        language="zh",
        task=Task.TRANSCRIBE,
        word_level_timings=True,
        model=TranscriptionModel(
            model_type=ModelType.WHISPER_CPP,
            whisper_model_size=WhisperModelSize.TINY,
        ),
    )
    task = FileTranscriptionTask(
        transcription_options=transcription_options,
        file_transcription_options=FileTranscriptionOptions(),
        model_path="/fake/model/path",
        file_path="/fake/audio.wav",
    )
    # Mock subprocess.Popen to simulate whisper-cli execution
    mock_process = MagicMock()
    mock_process.stderr.readline.side_effect = [""]
    mock_process.wait.return_value = None
    mock_process.returncode = 0
    with patch("buzz.transcriber.whisper_cpp.subprocess.Popen", return_value=mock_process):
        with patch("buzz.transcriber.whisper_cpp.os.path.exists", return_value=True):
            with patch("builtins.open", mock_open(read_data=json_bytes.decode("latin-1"))):
                segments = WhisperCpp.transcribe(task=task)
    # Should have 2 segments: 新 and 闻 (each character is a separate word)
    assert len(segments) == 2
    assert segments[0].text == "新"
    assert segments[1].text == "闻"
    # Verify timestamps
    assert segments[0].start == 100
    assert segments[0].end == 200
    # 闻 spans from the token starting at 200 to the token ending at 400
    assert segments[1].start == 200
    assert segments[1].end == 400
def test_transcribe_chinese_mixed_complete_and_split_chars(self):
    """Test a mix of complete and split Chinese characters.

    大家好 - "Hello everyone":
    - 大 (E5 A4 A7) - complete token
    - 家 (E5 AE B6) - split into E5 AE and B6
    - 好 (E5 A5 BD) - complete token
    """
    mock_json_data = {
        "transcription": [
            {
                "offsets": {"from": 0, "to": 5000},
                "text": "",  # Not used in word-level processing
                "tokens": [
                    {
                        "text": "[_BEG_]",
                        "offsets": {"from": 0, "to": 0},
                    },
                    {
                        # 大 - complete
                        "text": "\xe5\xa4\xa7",
                        "offsets": {"from": 100, "to": 200},
                    },
                    {
                        # First two bytes of 家
                        "text": "\xe5\xae",
                        "offsets": {"from": 200, "to": 250},
                    },
                    {
                        # Last byte of 家
                        "text": "\xb6",
                        "offsets": {"from": 250, "to": 300},
                    },
                    {
                        # 好 - complete
                        "text": "\xe5\xa5\xbd",
                        "offsets": {"from": 300, "to": 400},
                    },
                ],
            }
        ]
    }
    # The real result file is read with latin-1, so encode the mock the same way
    json_bytes = json.dumps(mock_json_data, ensure_ascii=False).encode("latin-1")
    transcription_options = TranscriptionOptions(
        language="zh",
        task=Task.TRANSCRIBE,
        word_level_timings=True,
        model=TranscriptionModel(
            model_type=ModelType.WHISPER_CPP,
            whisper_model_size=WhisperModelSize.TINY,
        ),
    )
    task = FileTranscriptionTask(
        transcription_options=transcription_options,
        file_transcription_options=FileTranscriptionOptions(),
        model_path="/fake/model/path",
        file_path="/fake/audio.wav",
    )
    # Mock subprocess.Popen to simulate whisper-cli execution
    mock_process = MagicMock()
    mock_process.stderr.readline.side_effect = [""]
    mock_process.wait.return_value = None
    mock_process.returncode = 0
    with patch("buzz.transcriber.whisper_cpp.subprocess.Popen", return_value=mock_process):
        with patch("buzz.transcriber.whisper_cpp.os.path.exists", return_value=True):
            with patch("builtins.open", mock_open(read_data=json_bytes.decode("latin-1"))):
                segments = WhisperCpp.transcribe(task=task)
    # Should have 3 segments: 大, 家, 好
    assert len(segments) == 3
    assert segments[0].text == "大"
    assert segments[1].text == "家"
    assert segments[2].text == "好"
    # Combined text
    full_text = "".join(s.text for s in segments)
    assert full_text == "大家好"

3
uv.lock generated
View file

@ -274,7 +274,7 @@ wheels = [
[[package]]
name = "buzz-captions"
version = "1.4.2"
version = "1.5.0"
source = { editable = "." }
dependencies = [
{ name = "accelerate" },
@ -1132,7 +1132,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/f8/0a/a3871375c7b9727edaeeea994bfff7c63ff7804c9829c19309ba2e058807/greenlet-3.3.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:b01548f6e0b9e9784a2c99c5651e5dc89ffcbe870bc5fb2e5ef864e9cc6b5dcb", size = 276379, upload-time = "2025-12-04T14:23:30.498Z" },
{ url = "https://files.pythonhosted.org/packages/43/ab/7ebfe34dce8b87be0d11dae91acbf76f7b8246bf9d6b319c741f99fa59c6/greenlet-3.3.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:349345b770dc88f81506c6861d22a6ccd422207829d2c854ae2af8025af303e3", size = 597294, upload-time = "2025-12-04T14:50:06.847Z" },
{ url = "https://files.pythonhosted.org/packages/a4/39/f1c8da50024feecd0793dbd5e08f526809b8ab5609224a2da40aad3a7641/greenlet-3.3.0-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e8e18ed6995e9e2c0b4ed264d2cf89260ab3ac7e13555b8032b25a74c6d18655", size = 607742, upload-time = "2025-12-04T14:57:42.349Z" },
{ url = "https://files.pythonhosted.org/packages/77/cb/43692bcd5f7a0da6ec0ec6d58ee7cddb606d055ce94a62ac9b1aa481e969/greenlet-3.3.0-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c024b1e5696626890038e34f76140ed1daf858e37496d33f2af57f06189e70d7", size = 622297, upload-time = "2025-12-04T15:07:13.552Z" },
{ url = "https://files.pythonhosted.org/packages/75/b0/6bde0b1011a60782108c01de5913c588cf51a839174538d266de15e4bf4d/greenlet-3.3.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:047ab3df20ede6a57c35c14bf5200fcf04039d50f908270d3f9a7a82064f543b", size = 609885, upload-time = "2025-12-04T14:26:02.368Z" },
{ url = "https://files.pythonhosted.org/packages/49/0e/49b46ac39f931f59f987b7cd9f34bfec8ef81d2a1e6e00682f55be5de9f4/greenlet-3.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2d9ad37fc657b1102ec880e637cccf20191581f75c64087a549e66c57e1ceb53", size = 1567424, upload-time = "2025-12-04T15:04:23.757Z" },
{ url = "https://files.pythonhosted.org/packages/05/f5/49a9ac2dff7f10091935def9165c90236d8f175afb27cbed38fb1d61ab6b/greenlet-3.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83cd0e36932e0e7f36a64b732a6f60c2fc2df28c351bae79fbaf4f8092fe7614", size = 1636017, upload-time = "2025-12-04T14:27:29.688Z" },