from unittest.mock import patch, MagicMock, mock_open
import json

from buzz.model_loader import TranscriptionModel, ModelType, WhisperModelSize
from buzz.transcriber.transcriber import (
    TranscriptionOptions,
    Task,
    FileTranscriptionTask,
    FileTranscriptionOptions,
)
from buzz.transcriber.whisper_cpp import WhisperCpp

from tests.audio import test_audio_path, test_multibyte_utf8_audio_path
from tests.model_loader import get_model_path


class TestWhisperCpp:
    """Tests for ``WhisperCpp.transcribe``.

    The first two tests resolve a model via ``get_model_path`` and transcribe
    the audio fixtures. The two Chinese tests patch ``subprocess.Popen``,
    ``os.path.exists`` and ``open`` and feed hand-written JSON to the
    transcriber instead.
    """

    @staticmethod
    def _transcribe_mocked_tokens(tokens):
        """Run ``WhisperCpp.transcribe`` against mocked whisper-cli output.

        Args:
            tokens: List of token dicts (``"text"`` and ``"offsets"`` keys)
                placed in the single transcription entry of the mocked JSON
                output.

        Returns:
            Whatever ``WhisperCpp.transcribe`` returns for the mocked output.
        """
        mock_json_data = {
            "transcription": [
                {
                    "offsets": {"from": 0, "to": 5000},
                    "text": "",  # Not used in word-level processing
                    "tokens": tokens,
                }
            ]
        }

        # Convert to JSON string using latin-1 compatible encoding
        # We write bytes directly since the real file is read with latin-1
        json_bytes = json.dumps(mock_json_data, ensure_ascii=False).encode("latin-1")

        transcription_options = TranscriptionOptions(
            language="zh",
            task=Task.TRANSCRIBE,
            word_level_timings=True,
            model=TranscriptionModel(
                model_type=ModelType.WHISPER_CPP,
                whisper_model_size=WhisperModelSize.TINY,
            ),
        )

        task = FileTranscriptionTask(
            transcription_options=transcription_options,
            file_transcription_options=FileTranscriptionOptions(),
            model_path="/fake/model/path",
            file_path="/fake/audio.wav",
        )

        # Mock subprocess.Popen to simulate whisper-cli execution
        mock_process = MagicMock()
        mock_process.stderr.readline.side_effect = [""]
        mock_process.wait.return_value = None
        mock_process.returncode = 0

        popen_patch = patch(
            "buzz.transcriber.whisper_cpp.subprocess.Popen",
            return_value=mock_process,
        )
        exists_patch = patch(
            "buzz.transcriber.whisper_cpp.os.path.exists",
            return_value=True,
        )
        open_patch = patch(
            "builtins.open",
            mock_open(read_data=json_bytes.decode("latin-1")),
        )

        with popen_patch, exists_patch, open_patch:
            return WhisperCpp.transcribe(task=task)

    def test_transcribe(self):
        """Transcribe the French test audio and check the expected greeting."""
        transcription_options = TranscriptionOptions(
            language="fr",
            task=Task.TRANSCRIBE,
            word_level_timings=False,
            model=TranscriptionModel(
                model_type=ModelType.WHISPER_CPP,
                whisper_model_size=WhisperModelSize.TINY,
            ),
        )
        model_path = get_model_path(transcription_options.model)

        task = FileTranscriptionTask(
            transcription_options=transcription_options,
            file_transcription_options=FileTranscriptionOptions(),
            model_path=model_path,
            file_path=test_audio_path,
        )

        segments = WhisperCpp.transcribe(task=task)

        # Combine all segment texts
        full_text = " ".join(segment.text for segment in segments)
        assert "Bien venu" in full_text or "bienvenu" in full_text.lower()

    def test_transcribe_word_level_timestamps(self):
        """Transcribe multibyte (Latvian) audio with word-level timings."""
        transcription_options = TranscriptionOptions(
            language="lv",
            task=Task.TRANSCRIBE,
            word_level_timings=True,
            model=TranscriptionModel(
                model_type=ModelType.WHISPER_CPP,
                whisper_model_size=WhisperModelSize.TINY,
            ),
        )
        model_path = get_model_path(transcription_options.model)

        task = FileTranscriptionTask(
            transcription_options=transcription_options,
            file_transcription_options=FileTranscriptionOptions(),
            model_path=model_path,
            file_path=test_multibyte_utf8_audio_path,
        )

        segments = WhisperCpp.transcribe(task=task)

        assert "Mani" in segments[0].text
        # Accept either spelling. Each alternative needs its own ``in`` test:
        # a bare string literal on the left of ``or`` is always truthy and
        # would make the assertion pass unconditionally.
        assert "uzstrau" in segments[1].text or "ustrau" in segments[1].text
        assert "laikabstāk" in segments[2].text

    def test_transcribe_chinese_multibyte_word_level_timestamps(self):
        """Test that Chinese characters split across multiple tokens are properly combined.

        Chinese character 闻 (U+95FB) is encoded as UTF-8 bytes E9 97 BB.
        Whisper.cpp may split this into separate tokens, e.g.:
        - Token 1: bytes E9 97 (incomplete)
        - Token 2: byte BB (completes the character)

        The code should combine these bytes and output 闻 as a single segment.
        """
        # Mock JSON data simulating whisper.cpp output with split Chinese characters
        # The character 闻 is split into two tokens: \xe9\x97 and \xbb
        # The character 新 is a complete token
        # Together they form 新闻 (news)
        tokens = [
            {
                "text": "[_BEG_]",
                "offsets": {"from": 0, "to": 0},
            },
            {
                # 新 - complete character (UTF-8: E6 96 B0)
                # When read as latin-1: \xe6\x96\xb0
                "text": "\xe6\x96\xb0",
                "offsets": {"from": 100, "to": 200},
            },
            {
                # First two bytes of 闻 (UTF-8: E9 97 BB)
                # When read as latin-1: \xe9\x97
                "text": "\xe9\x97",
                "offsets": {"from": 200, "to": 300},
            },
            {
                # Last byte of 闻
                # When read as latin-1: \xbb
                "text": "\xbb",
                "offsets": {"from": 300, "to": 400},
            },
            {
                "text": "[_TT_500]",
                "offsets": {"from": 500, "to": 500},
            },
        ]

        segments = self._transcribe_mocked_tokens(tokens)

        # Should have 2 segments: 新 and 闻 (each character separate)
        assert len(segments) == 2
        assert segments[0].text == "新"
        assert segments[1].text == "闻"

        # Verify timestamps
        assert segments[0].start == 100
        assert segments[0].end == 200
        # 闻 spans from token at 200 to token ending at 400
        assert segments[1].start == 200
        assert segments[1].end == 400

    def test_transcribe_chinese_mixed_complete_and_split_chars(self):
        """Test a mix of complete and split Chinese characters."""
        # 大家好 - "Hello everyone"
        # 大 (E5 A4 A7) - complete token
        # 家 (E5 AE B6) - split into E5 AE and B6
        # 好 (E5 A5 BD) - complete token
        tokens = [
            {
                "text": "[_BEG_]",
                "offsets": {"from": 0, "to": 0},
            },
            {
                # 大 - complete
                "text": "\xe5\xa4\xa7",
                "offsets": {"from": 100, "to": 200},
            },
            {
                # First two bytes of 家
                "text": "\xe5\xae",
                "offsets": {"from": 200, "to": 250},
            },
            {
                # Last byte of 家
                "text": "\xb6",
                "offsets": {"from": 250, "to": 300},
            },
            {
                # 好 - complete
                "text": "\xe5\xa5\xbd",
                "offsets": {"from": 300, "to": 400},
            },
        ]

        segments = self._transcribe_mocked_tokens(tokens)

        # Should have 3 segments: 大, 家, 好
        assert len(segments) == 3
        assert segments[0].text == "大"
        assert segments[1].text == "家"
        assert segments[2].text == "好"

        # Combined text
        full_text = "".join(s.text for s in segments)
        assert full_text == "大家好"