Fix Chinese word level timestamps (#1355)

This commit is contained in:
Raivis Dejus 2026-01-16 14:31:48 +02:00 committed by GitHub
commit 97b1619902
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
12 changed files with 363 additions and 82 deletions

1
CLAUDE.md Normal file
View file

@ -0,0 +1 @@
- Use uv to run tests and any scripts

View file

@ -1,5 +1,5 @@
# Change also in pyproject.toml and buzz/__version__.py
version := 1.4.2
version := 1.5.0
mac_app_path := ./dist/Buzz.app
mac_zip_path := ./dist/Buzz-${version}-mac.zip

View file

@ -1 +1 @@
VERSION = "1.4.2"
VERSION = "1.5.0"

View file

@ -49,5 +49,4 @@ def close_app_db():
return
if db.isOpen():
logging.debug("Closing database connection: %s", db.connectionName())
db.close()

View file

@ -180,79 +180,160 @@ class WhisperCpp:
# Extract word-level timestamps from tokens array
# Combine tokens into words using similar logic as whisper_cpp.py
transcription = result.get("transcription", [])
# Languages that don't use spaces between words
# For these, each token is treated as a separate word
non_space_languages = {"zh", "ja", "th", "lo", "km", "my"}
is_non_space_language = language in non_space_languages
for segment_data in transcription:
tokens = segment_data.get("tokens", [])
# Accumulate tokens into words
word_buffer = b""
word_start = 0
word_end = 0
def append_word(buffer: bytes, start: int, end: int):
"""Try to decode and append a word segment, handling multi-byte UTF-8"""
if not buffer:
return True
# Try to decode as UTF-8
# https://github.com/ggerganov/whisper.cpp/issues/1798
try:
text = buffer.decode("utf-8").strip()
if text:
segments.append(
Segment(
start=start,
end=end,
text=text,
translation=""
if is_non_space_language:
# For languages without spaces (Chinese, Japanese, etc.),
# each complete UTF-8 character is treated as a separate word.
# Some characters may be split across multiple tokens as raw bytes.
char_buffer = b""
char_start = 0
char_end = 0
def flush_complete_chars(buffer: bytes, start: int, end: int):
"""Extract and output all complete UTF-8 characters from buffer.
Returns any remaining incomplete bytes."""
nonlocal segments
remaining = buffer
pos = 0
while pos < len(remaining):
# Try to decode one character at a time
for char_len in range(1, min(5, len(remaining) - pos + 1)):
try:
char = remaining[pos:pos + char_len].decode("utf-8")
# Successfully decoded a character
if char.strip():
segments.append(
Segment(
start=start,
end=end,
text=char,
translation=""
)
)
pos += char_len
break
except UnicodeDecodeError:
if char_len == 4 or pos + char_len >= len(remaining):
# Incomplete character at end - return as remaining
return remaining[pos:]
else:
# Couldn't decode, might be incomplete at end
return remaining[pos:]
return b""
for token_data in tokens:
token_text = token_data.get("text", "")
# Skip special tokens like [_TT_], [_BEG_]
if token_text.startswith("[_"):
continue
if not token_text:
continue
token_start = int(token_data.get("offsets", {}).get("from", 0))
token_end = int(token_data.get("offsets", {}).get("to", 0))
# Convert latin-1 string back to original bytes
token_bytes = token_text.encode("latin-1")
if not char_buffer:
char_start = token_start
char_buffer += token_bytes
char_end = token_end
# Try to flush complete characters
char_buffer = flush_complete_chars(char_buffer, char_start, char_end)
# If buffer was fully flushed, reset start time for next char
if not char_buffer:
char_start = token_end
# Flush any remaining buffer at end of segment
if char_buffer:
flush_complete_chars(char_buffer, char_start, char_end)
else:
# For space-separated languages, accumulate tokens into words
word_buffer = b""
word_start = 0
word_end = 0
def append_word(buffer: bytes, start: int, end: int):
"""Try to decode and append a word segment, handling multi-byte UTF-8"""
if not buffer:
return True
# Try to decode as UTF-8
# https://github.com/ggerganov/whisper.cpp/issues/1798
try:
text = buffer.decode("utf-8").strip()
if text:
segments.append(
Segment(
start=start,
end=end,
text=text,
translation=""
)
)
)
return True
except UnicodeDecodeError:
# Multi-byte character is split, continue accumulating
return False
for token_data in tokens:
# Token text is read as latin-1, need to convert to bytes to get original data
token_text = token_data.get("text", "")
# Skip special tokens like [_TT_], [_BEG_]
if token_text.startswith("[_"):
continue
if not token_text:
continue
token_start = int(token_data.get("offsets", {}).get("from", 0))
token_end = int(token_data.get("offsets", {}).get("to", 0))
# Convert latin-1 string back to original bytes
# (latin-1 preserves byte values as code points)
token_bytes = token_text.encode("latin-1")
# Check if token starts with space - indicates new word
if token_bytes.startswith(b" ") and word_buffer:
# Save previous word
append_word(word_buffer, word_start, word_end)
# Start new word
word_buffer = token_bytes
word_start = token_start
word_end = token_end
elif token_bytes.startswith(b", "):
# Handle comma - save word with comma, then start new word
word_buffer += b","
append_word(word_buffer, word_start, word_end)
word_buffer = token_bytes.lstrip(b",")
word_start = token_start
word_end = token_end
else:
# Accumulate token into current word
if not word_buffer:
return True
except UnicodeDecodeError:
# Multi-byte character is split, continue accumulating
return False
for token_data in tokens:
# Token text is read as latin-1, need to convert to bytes to get original data
token_text = token_data.get("text", "")
# Skip special tokens like [_TT_], [_BEG_]
if token_text.startswith("[_"):
continue
if not token_text:
continue
token_start = int(token_data.get("offsets", {}).get("from", 0))
token_end = int(token_data.get("offsets", {}).get("to", 0))
# Convert latin-1 string back to original bytes
# (latin-1 preserves byte values as code points)
token_bytes = token_text.encode("latin-1")
# Check if token starts with space - indicates new word
if token_bytes.startswith(b" ") and word_buffer:
# Save previous word
append_word(word_buffer, word_start, word_end)
# Start new word
word_buffer = token_bytes
word_start = token_start
word_buffer += token_bytes
word_end = token_end
# Add the last word
append_word(word_buffer, word_start, word_end)
word_end = token_end
elif token_bytes.startswith(b", "):
# Handle comma - save word with comma, then start new word
word_buffer += b","
append_word(word_buffer, word_start, word_end)
word_buffer = token_bytes.lstrip(b",")
word_start = token_start
word_end = token_end
else:
# Accumulate token into current word
if not word_buffer:
word_start = token_start
word_buffer += token_bytes
word_end = token_end
# Add the last word
append_word(word_buffer, word_start, word_end)
else:
# Use segment-level timestamps
transcription = result.get("transcription", [])

View file

@ -417,8 +417,6 @@ class MainWindow(QMainWindow):
self.save_geometry()
def closeEvent(self, event: QtGui.QCloseEvent) -> None:
logging.debug("Starting MainWindow closeEvent")
self.save_geometry()
self.settings.settings.sync()

View file

@ -2,7 +2,7 @@ from typing import Optional
import os
from PyQt6.QtCore import pyqtSignal, Qt
from PyQt6.QtWidgets import QComboBox, QWidget
from PyQt6.QtWidgets import QComboBox, QWidget, QFrame
from PyQt6.QtGui import QStandardItem, QStandardItemModel
from buzz.locale import _
@ -51,3 +51,9 @@ class LanguagesComboBox(QComboBox):
def on_index_changed(self, index: int):
self.languageChanged.emit(self.languages[index][0])
def showPopup(self):
super().showPopup()
popup = self.findChild(QFrame)
if popup and popup.height() > 400:
popup.setFixedHeight(400)

View file

@ -36,6 +36,9 @@ from buzz.widgets.preferences_dialog.models.file_transcription_preferences impor
SENTENCE_END = re.compile(r'.*[.!?。!?]')
# Languages that don't use spaces between words
NON_SPACE_LANGUAGES = {"zh", "ja", "th", "lo", "km", "my"}
class TranscriptionWorker(QObject):
finished = pyqtSignal(list)
@ -51,16 +54,23 @@ class TranscriptionWorker(QObject):
transcription_id=self.transcription.id_as_uuid
)
# Check if the language uses spaces between words
language = self.transcription.language or ""
is_non_space_language = language in NON_SPACE_LANGUAGES
# For non-space languages, don't add spaces between words
separator = "" if is_non_space_language else " "
segments = []
words = []
text = ""
for buzz_segment in buzz_segments:
words.append({
'word': buzz_segment.text + " ",
'word': buzz_segment.text + separator,
'start': buzz_segment.start_time / 100,
'end': buzz_segment.end_time / 100,
})
text += buzz_segment.text + " "
text += buzz_segment.text + separator
if SENTENCE_END.match(buzz_segment.text):
segments.append({
@ -70,6 +80,13 @@ class TranscriptionWorker(QObject):
words = []
text = ""
# Add any remaining words that weren't terminated by sentence-ending punctuation
if words:
segments.append({
'text': text,
'words': words
})
return {
'language': self.transcription.language,
'segments': segments

View file

@ -8,4 +8,6 @@ When transcript of some audio or video file is generated you can edit it and exp
The transcription view screen has an option to resize the transcripts. Click on the "Resize" button to see available options. Transcripts that have been generated **with word-level timings** enabled can be combined into subtitles with different options, like the maximum length of a subtitle and whether subtitles should be split on punctuation. Transcripts that have been generated **without word-level timings** enabled can only be recombined by specifying the desired maximum length of a subtitle.
If audio file is still present on the system word-level timing merge will also analyze the audio for silences to improve subtitle accuracy. Subtitle generation from transcripts with word-level timings is available since version 1.3.0.
If audio file is still present on the system word-level timing merge will also analyze the audio for silences to improve subtitle accuracy. Subtitle generation from transcripts with word-level timings is available since version 1.3.0.
The resize tool also has an option to extend end time of segments if you want the subtitles to be on the screen for longer. You can specify the amount of time in seconds to extend each subtitle segment. Buzz will add this amount of time to the end of each subtitle segment making sure that the end of a segment does not go over start of the next segment. This feature is available since 1.5.0.

View file

@ -1,7 +1,7 @@
[project]
name = "buzz-captions"
# Change also in Makefile and buzz/__version__.py
version = "1.4.2"
version = "1.5.0"
description = ""
authors = [{ name = "Chidi Williams", email = "williamschidi1@gmail.com" }]
requires-python = ">=3.12,<3.13"

View file

@ -1,3 +1,6 @@
from unittest.mock import patch, MagicMock, mock_open
import json
from buzz.model_loader import TranscriptionModel, ModelType, WhisperModelSize
from buzz.transcriber.transcriber import (
TranscriptionOptions,
@ -59,4 +62,179 @@ class TestWhisperCpp:
assert "Mani" in segments[0].text
assert "uzstrau" or "ustrau" in segments[1].text
assert "laikabstāk" in segments[2].text
assert "laikabstāk" in segments[2].text
def test_transcribe_chinese_multibyte_word_level_timestamps(self):
    """Test that Chinese characters split across multiple tokens are properly combined.

    The Chinese character 闻 (U+95FB) is encoded as the UTF-8 bytes E9 97 BB.
    whisper.cpp may split this into separate tokens, e.g.:
    - Token 1: bytes E9 97 (incomplete)
    - Token 2: byte BB (completes the character)
    The transcriber should combine these bytes and output the character as a
    single segment.
    """
    # Mock JSON data simulating whisper.cpp output with split Chinese characters.
    # The character 闻 is split into two tokens: \xe9\x97 and \xbb.
    # The character 新 is a complete token. Together they form 新闻 (news).
    mock_json_data = {
        "transcription": [
            {
                "offsets": {"from": 0, "to": 5000},
                "text": "",  # Not used in word-level processing
                "tokens": [
                    {
                        "text": "[_BEG_]",
                        "offsets": {"from": 0, "to": 0},
                    },
                    {
                        # 新 - complete character (UTF-8: E6 96 B0)
                        # When read as latin-1: \xe6\x96\xb0
                        "text": "\xe6\x96\xb0",
                        "offsets": {"from": 100, "to": 200},
                    },
                    {
                        # First two bytes of 闻 (UTF-8: E9 97 BB)
                        # When read as latin-1: \xe9\x97
                        "text": "\xe9\x97",
                        "offsets": {"from": 200, "to": 300},
                    },
                    {
                        # Last byte of 闻
                        # When read as latin-1: \xbb
                        "text": "\xbb",
                        "offsets": {"from": 300, "to": 400},
                    },
                    {
                        "text": "[_TT_500]",
                        "offsets": {"from": 500, "to": 500},
                    },
                ],
            }
        ]
    }
    # Convert to a JSON string using latin-1 compatible encoding.
    # We write bytes directly since the real file is read with latin-1.
    json_bytes = json.dumps(mock_json_data, ensure_ascii=False).encode("latin-1")
    transcription_options = TranscriptionOptions(
        language="zh",
        task=Task.TRANSCRIBE,
        word_level_timings=True,
        model=TranscriptionModel(
            model_type=ModelType.WHISPER_CPP,
            whisper_model_size=WhisperModelSize.TINY,
        ),
    )
    task = FileTranscriptionTask(
        transcription_options=transcription_options,
        file_transcription_options=FileTranscriptionOptions(),
        model_path="/fake/model/path",
        file_path="/fake/audio.wav",
    )
    # Mock subprocess.Popen to simulate whisper-cli execution
    mock_process = MagicMock()
    mock_process.stderr.readline.side_effect = [""]
    mock_process.wait.return_value = None
    mock_process.returncode = 0
    with patch("buzz.transcriber.whisper_cpp.subprocess.Popen", return_value=mock_process):
        with patch("buzz.transcriber.whisper_cpp.os.path.exists", return_value=True):
            with patch("builtins.open", mock_open(read_data=json_bytes.decode("latin-1"))):
                segments = WhisperCpp.transcribe(task=task)
    # Should have 2 segments: 新 and 闻 (each character is a separate word)
    assert len(segments) == 2
    assert segments[0].text == "新"
    assert segments[1].text == "闻"
    # Verify timestamps
    assert segments[0].start == 100
    assert segments[0].end == 200
    # 闻 spans from the token starting at 200 to the token ending at 400
    assert segments[1].start == 200
    assert segments[1].end == 400
def test_transcribe_chinese_mixed_complete_and_split_chars(self):
    """Test a mix of complete and split Chinese characters.

    大家好 - "Hello everyone":
    - 大 (E5 A4 A7) - complete token
    - 家 (E5 AE B6) - split into E5 AE and B6
    - 好 (E5 A5 BD) - complete token
    """
    mock_json_data = {
        "transcription": [
            {
                "offsets": {"from": 0, "to": 5000},
                "text": "",  # Not used in word-level processing
                "tokens": [
                    {
                        "text": "[_BEG_]",
                        "offsets": {"from": 0, "to": 0},
                    },
                    {
                        # 大 - complete
                        "text": "\xe5\xa4\xa7",
                        "offsets": {"from": 100, "to": 200},
                    },
                    {
                        # First two bytes of 家
                        "text": "\xe5\xae",
                        "offsets": {"from": 200, "to": 250},
                    },
                    {
                        # Last byte of 家
                        "text": "\xb6",
                        "offsets": {"from": 250, "to": 300},
                    },
                    {
                        # 好 - complete
                        "text": "\xe5\xa5\xbd",
                        "offsets": {"from": 300, "to": 400},
                    },
                ],
            }
        ]
    }
    # The real result file is read with latin-1, so encode the mock the same way
    json_bytes = json.dumps(mock_json_data, ensure_ascii=False).encode("latin-1")
    transcription_options = TranscriptionOptions(
        language="zh",
        task=Task.TRANSCRIBE,
        word_level_timings=True,
        model=TranscriptionModel(
            model_type=ModelType.WHISPER_CPP,
            whisper_model_size=WhisperModelSize.TINY,
        ),
    )
    task = FileTranscriptionTask(
        transcription_options=transcription_options,
        file_transcription_options=FileTranscriptionOptions(),
        model_path="/fake/model/path",
        file_path="/fake/audio.wav",
    )
    # Mock subprocess.Popen to simulate whisper-cli execution
    mock_process = MagicMock()
    mock_process.stderr.readline.side_effect = [""]
    mock_process.wait.return_value = None
    mock_process.returncode = 0
    with patch("buzz.transcriber.whisper_cpp.subprocess.Popen", return_value=mock_process):
        with patch("buzz.transcriber.whisper_cpp.os.path.exists", return_value=True):
            with patch("builtins.open", mock_open(read_data=json_bytes.decode("latin-1"))):
                segments = WhisperCpp.transcribe(task=task)
    # Should have 3 segments: 大, 家, 好
    assert len(segments) == 3
    assert segments[0].text == "大"
    assert segments[1].text == "家"
    assert segments[2].text == "好"
    # Combined text
    full_text = "".join(s.text for s in segments)
    assert full_text == "大家好"

3
uv.lock generated
View file

@ -274,7 +274,7 @@ wheels = [
[[package]]
name = "buzz-captions"
version = "1.4.2"
version = "1.5.0"
source = { editable = "." }
dependencies = [
{ name = "accelerate" },
@ -1132,7 +1132,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/f8/0a/a3871375c7b9727edaeeea994bfff7c63ff7804c9829c19309ba2e058807/greenlet-3.3.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:b01548f6e0b9e9784a2c99c5651e5dc89ffcbe870bc5fb2e5ef864e9cc6b5dcb", size = 276379, upload-time = "2025-12-04T14:23:30.498Z" },
{ url = "https://files.pythonhosted.org/packages/43/ab/7ebfe34dce8b87be0d11dae91acbf76f7b8246bf9d6b319c741f99fa59c6/greenlet-3.3.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:349345b770dc88f81506c6861d22a6ccd422207829d2c854ae2af8025af303e3", size = 597294, upload-time = "2025-12-04T14:50:06.847Z" },
{ url = "https://files.pythonhosted.org/packages/a4/39/f1c8da50024feecd0793dbd5e08f526809b8ab5609224a2da40aad3a7641/greenlet-3.3.0-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e8e18ed6995e9e2c0b4ed264d2cf89260ab3ac7e13555b8032b25a74c6d18655", size = 607742, upload-time = "2025-12-04T14:57:42.349Z" },
{ url = "https://files.pythonhosted.org/packages/77/cb/43692bcd5f7a0da6ec0ec6d58ee7cddb606d055ce94a62ac9b1aa481e969/greenlet-3.3.0-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c024b1e5696626890038e34f76140ed1daf858e37496d33f2af57f06189e70d7", size = 622297, upload-time = "2025-12-04T15:07:13.552Z" },
{ url = "https://files.pythonhosted.org/packages/75/b0/6bde0b1011a60782108c01de5913c588cf51a839174538d266de15e4bf4d/greenlet-3.3.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:047ab3df20ede6a57c35c14bf5200fcf04039d50f908270d3f9a7a82064f543b", size = 609885, upload-time = "2025-12-04T14:26:02.368Z" },
{ url = "https://files.pythonhosted.org/packages/49/0e/49b46ac39f931f59f987b7cd9f34bfec8ef81d2a1e6e00682f55be5de9f4/greenlet-3.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2d9ad37fc657b1102ec880e637cccf20191581f75c64087a549e66c57e1ceb53", size = 1567424, upload-time = "2025-12-04T15:04:23.757Z" },
{ url = "https://files.pythonhosted.org/packages/05/f5/49a9ac2dff7f10091935def9165c90236d8f175afb27cbed38fb1d61ab6b/greenlet-3.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83cd0e36932e0e7f36a64b732a6f60c2fc2df28c351bae79fbaf4f8092fe7614", size = 1636017, upload-time = "2025-12-04T14:27:29.688Z" },