Fix missing spaces after punctuation in speaker identification (#1344)

Co-authored-by: Robrecht Siera <rob.developer.securemail@holoncom.eu>
This commit is contained in:
Rob Siera 2026-01-10 17:58:27 +01:00 committed by GitHub
commit dc27281e34
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 4 additions and 2 deletions

View file

@ -187,7 +187,8 @@ class IdentificationWorker(QObject):
transcription_id=self.transcription.id_as_uuid
)
full_transcript = "".join(segment.text for segment in segments)
full_transcript = " ".join(segment.text for segment in segments)
full_transcript = re.sub(r' {2,}', ' ', full_transcript)
if self._is_cancelled:
logging.debug("Speaker identification worker: Cancelled at step 2")

View file

@ -87,7 +87,8 @@ class TestSpeakerIdentificationWidget:
assert worker.transcription == transcription
assert len(result) == 1
assert isinstance(result[0], list)
assert result == [[{'end_time': 8904, 'speaker': 'Speaker 0', 'start_time': 140, 'text': 'Bienvenue dans. '}]]
assert (result == [[{'end_time': 8904, 'speaker': 'Speaker 0', 'start_time': 140, 'text': 'Bien venue dans. '}]]
or result == [[{'end_time': 8904, 'speaker': 'Speaker 0', 'start_time': 140, 'text': 'Bienvenue dans. '}]])
def test_batch_processing_with_many_words(self):
"""Test batch processing when there are more than 200 words."""