# buzz/buzz/model_loader.py

import enum
import hashlib
import logging
import os
import time
import threading
import shutil
import subprocess
import sys
import warnings
import platform
import requests
import whisper
import huggingface_hub
import zipfile
from dataclasses import dataclass
from typing import Optional, List

from PyQt6.QtCore import QObject, pyqtSignal, QRunnable
from platformdirs import user_cache_dir
from huggingface_hub.errors import LocalEntryNotFoundError

from buzz.locale import _


# Directory that holds every downloaded model. The BUZZ_MODEL_ROOT environment
# variable takes precedence; otherwise a "models" folder inside the per-user
# "Buzz" cache directory is used. The directory is created at import time.
model_root_dir = os.getenv(
    "BUZZ_MODEL_ROOT",
    os.path.join(user_cache_dir("Buzz"), "models"),
)
os.makedirs(model_root_dir, exist_ok=True)

logging.debug("Model root directory: %s", model_root_dir)


class WhisperModelSize(str, enum.Enum):
    """Whisper model sizes selectable in the application.

    Members are also plain strings (``str`` mix-in), so each one compares
    equal to its value, e.g. ``WhisperModelSize.TINY == "tiny"``.
    """

    TINY = "tiny"
    TINYEN = "tiny.en"
    BASE = "base"
    BASEEN = "base.en"
    SMALL = "small"
    SMALLEN = "small.en"
    MEDIUM = "medium"
    MEDIUMEN = "medium.en"
    LARGE = "large"
    LARGEV2 = "large-v2"
    LARGEV3 = "large-v3"
    LARGEV3TURBO = "large-v3-turbo"
    CUSTOM = "custom"
    LUMII = "lumii"

    def to_faster_whisper_model_size(self) -> str:
        """Return the size name used for Faster Whisper ("large" maps to "large-v1")."""
        return "large-v1" if self is WhisperModelSize.LARGE else self.value

    def to_whisper_cpp_model_size(self) -> str:
        """Return the size name used for Whisper.cpp ("large" maps to "large-v1")."""
        return "large-v1" if self is WhisperModelSize.LARGE else self.value

    def __str__(self):
        """Return the value with its first character upper-cased, for display."""
        raw_value = self.value
        return raw_value.capitalize()


class ModelType(enum.Enum):
    """Transcription backends; each value is the backend's display name."""

    WHISPER = "Whisper"
    WHISPER_CPP = "Whisper.cpp"
    HUGGING_FACE = "Hugging Face"
    FASTER_WHISPER = "Faster Whisper"
    OPEN_AI_WHISPER_API = "OpenAI Whisper API"

    @property
    def supports_initial_prompt(self):
        """Whether this backend is listed as accepting an initial prompt."""
        prompt_capable = (
            ModelType.WHISPER,
            ModelType.WHISPER_CPP,
            ModelType.OPEN_AI_WHISPER_API,
            ModelType.FASTER_WHISPER,
        )
        return self in prompt_capable

    def is_available(self):
        """Return False only for Faster Whisper on an x86_64 macOS machine."""
        if self != ModelType.FASTER_WHISPER:
            return True

        # Hide Faster Whisper option on macOS x86_64
        # See: https://github.com/SYSTRAN/faster-whisper/issues/541
        on_intel_mac = platform.system() == "Darwin" and platform.machine() == "x86_64"
        return not on_intel_mac

    def is_manually_downloadable(self):
        """Whether this backend is one of the three listed as manually downloadable."""
        manually_downloadable = (
            ModelType.WHISPER,
            ModelType.WHISPER_CPP,
            ModelType.FASTER_WHISPER,
        )
        return self in manually_downloadable


# File names requested when fetching a Hugging Face model.
# Order matters: download_from_huggingface() treats the first
# `num_large_files` entries as the large files whose download is monitored
# for progress, and ModelDownloader.run() passes num_large_files=4 for this
# list, so the four weight files must stay at the top.
HUGGING_FACE_MODEL_ALLOW_PATTERNS = [
    "model.safetensors",  # largest by size first
    "pytorch_model.bin",
    "model-00001-of-00002.safetensors",
    "model-00002-of-00002.safetensors",
    "model.safetensors.index.json",
    "added_tokens.json",
    "config.json",
    "generation_config.json",
    "merges.txt",
    "normalizer.json",
    "preprocessor_config.json",
    "special_tokens_map.json",
    "tokenizer.json",
    "tokenizer_config.json",
    "vocab.json",
]


@dataclass()
class TranscriptionModel:
    def __init__(
        self,
        model_type: ModelType = ModelType.WHISPER,
        whisper_model_size: Optional[WhisperModelSize] = WhisperModelSize.TINY,
        hugging_face_model_id: Optional[str] = ""
    ):
        self.model_type = model_type
        self.whisper_model_size = whisper_model_size
        self.hugging_face_model_id = hugging_face_model_id

    def __str__(self):
        match self.model_type:
            case ModelType.WHISPER:
                return f"Whisper ({self.whisper_model_size})"
            case ModelType.WHISPER_CPP:
                return f"Whisper.cpp ({self.whisper_model_size})"
            case ModelType.HUGGING_FACE:
                return f"Hugging Face ({self.hugging_face_model_id})"
            case ModelType.FASTER_WHISPER:
                return f"Faster Whisper ({self.whisper_model_size})"
            case ModelType.OPEN_AI_WHISPER_API:
                return "OpenAI Whisper API"
            case _:
                raise Exception("Unknown model type")

    def is_deletable(self):
        return (
            self.model_type == ModelType.WHISPER
            or self.model_type == ModelType.WHISPER_CPP
            or self.model_type == ModelType.FASTER_WHISPER
        ) and self.get_local_model_path() is not None

    def open_file_location(self):
        model_path = self.get_local_model_path()

        if (self.model_type == ModelType.HUGGING_FACE
                or self.model_type == ModelType.FASTER_WHISPER):
            model_path = os.path.dirname(model_path)

        if model_path is None:
            return
        self.open_path(path=os.path.dirname(model_path))

    @staticmethod
    def default():
        model_type = next(
            model_type for model_type in ModelType if model_type.is_available()
        )
        return TranscriptionModel(model_type=model_type)

    @staticmethod
    def open_path(path: str):
        if sys.platform == "win32":
            os.startfile(path)
        else:
            opener = "open" if sys.platform == "darwin" else "xdg-open"
            subprocess.call([opener, path])

    def delete_local_file(self):
        model_path = self.get_local_model_path()

        if (self.model_type == ModelType.HUGGING_FACE
                or self.model_type == ModelType.FASTER_WHISPER):
            model_path = os.path.dirname(os.path.dirname(model_path))

            logging.debug("Deleting model directory: %s", model_path)

            shutil.rmtree(model_path, ignore_errors=True)
            return

        logging.debug("Deleting model file: %s", model_path)
        os.remove(model_path)

    def get_local_model_path(self) -> Optional[str]:
        if self.model_type == ModelType.WHISPER_CPP:
            file_path = get_whisper_cpp_file_path(size=self.whisper_model_size)
            if not file_path or not os.path.exists(file_path) or not os.path.isfile(file_path):
                return None
            return file_path

        if self.model_type == ModelType.WHISPER:
            file_path = get_whisper_file_path(size=self.whisper_model_size)
            if not os.path.exists(file_path) or not os.path.isfile(file_path):
                return None
            return file_path

        if self.model_type == ModelType.FASTER_WHISPER:
            try:
                return download_faster_whisper_model(
                    model=self, local_files_only=True
                )
            except (ValueError, FileNotFoundError):
                return None

        if self.model_type == ModelType.OPEN_AI_WHISPER_API:
            return ""

        if self.model_type == ModelType.HUGGING_FACE:
            try:
                return huggingface_hub.snapshot_download(
                    self.hugging_face_model_id,
                    allow_patterns=HUGGING_FACE_MODEL_ALLOW_PATTERNS,
                    local_files_only=True,
                    cache_dir=model_root_dir,
                    etag_timeout=60
                )
            except (ValueError, FileNotFoundError):
                return None

        raise Exception("Unknown model type")


# Hugging Face repository ids used to look up and download Whisper.cpp models.
# The LUMII repository is used when the model size is WhisperModelSize.LUMII;
# every other non-custom size uses the default repository.
WHISPER_CPP_REPO_ID = "ggerganov/whisper.cpp"
WHISPER_CPP_LUMII_REPO_ID = "RaivisDejus/whisper.cpp-lv"


def get_whisper_cpp_file_path(size: WhisperModelSize) -> str:
    """Return the expected local path of a Whisper.cpp model file.

    The custom model lives directly in the model root directory. Other sizes
    are looked up in the local Hugging Face cache only (no network download).

    Returns:
        The model file path, or an empty string when the cache lookup raises
        ``LocalEntryNotFoundError``. The file itself is not checked for
        existence here.
    """
    if size == WhisperModelSize.CUSTOM:
        return os.path.join(model_root_dir, "ggml-model-whisper-custom.bin")

    is_lumii = size == WhisperModelSize.LUMII
    repo_id = WHISPER_CPP_LUMII_REPO_ID if is_lumii else WHISPER_CPP_REPO_ID

    model_filename = "ggml-{}.bin".format(size.to_whisper_cpp_model_size())

    try:
        snapshot_dir = huggingface_hub.snapshot_download(
            repo_id=repo_id,
            allow_patterns=[model_filename],
            local_files_only=True,
            cache_dir=model_root_dir,
            etag_timeout=60
        )
    except LocalEntryNotFoundError:
        return ''
    else:
        return os.path.join(snapshot_dir, model_filename)


def get_whisper_file_path(size: WhisperModelSize) -> str:
    """Return the path where the Whisper model of the given size is stored.

    Models live in the "whisper" sub-folder of the model root directory. The
    custom model is named "custom"; every other size uses the last path
    component of its URL in ``whisper._MODELS``.
    """
    whisper_dir = os.path.join(model_root_dir, "whisper")

    file_name = (
        "custom"
        if size == WhisperModelSize.CUSTOM
        else os.path.basename(whisper._MODELS[size.value])
    )
    return os.path.join(whisper_dir, file_name)


class HuggingfaceDownloadMonitor:
    """Reports download progress by polling file sizes on a background thread.

    Two locations are polled every two seconds: files starting with "tmp" in
    the global model root directory, and files ending with ".incomplete" in
    the "blobs" directory located two levels above ``model_root``
    (presumably the Hugging Face cache layout -- verify against the
    huggingface_hub version in use). Each size found is emitted on
    ``progress`` as ``(current_size, total_file_size)``.
    """

    def __init__(self, model_root: str, progress: pyqtSignal(tuple), total_file_size: int):
        """
        Args:
            model_root: Directory of the model being downloaded.
            progress: Signal that receives ``(current, total)`` tuples.
            total_file_size: Expected size in bytes of the file being downloaded.
        """
        self.model_root = model_root
        self.progress = progress
        self.total_file_size = round(total_file_size * 1.1)  # To keep dialog open even if it reports 100%
        self.incomplete_download_root = None
        self.stop_event = threading.Event()
        self.monitor_thread = None
        self.set_download_roots()

    def set_download_roots(self):
        """Derive the "blobs" directory two levels above ``model_root``."""
        normalized_model_root = os.path.normpath(self.model_root)
        two_dirs_up = os.path.normpath(os.path.join(normalized_model_root, "..", ".."))
        self.incomplete_download_root = os.path.normpath(os.path.join(two_dirs_up, "blobs"))

    def clean_tmp_files(self):
        """Remove leftover "tmp*" files from the model root (best effort)."""
        try:
            filenames = os.listdir(model_root_dir)
        except OSError:
            return

        for filename in filenames:
            if filename.startswith("tmp"):
                try:
                    os.remove(os.path.join(model_root_dir, filename))
                except OSError:
                    # Entry may be a directory, in use, or already gone; a
                    # stale temp file must not prevent the download.
                    logging.debug("Could not remove temporary file: %s", filename)

    def _report_sizes(self, directory, name_matches):
        """Emit the size of every file in ``directory`` accepted by ``name_matches``."""
        try:
            filenames = os.listdir(directory)
        except OSError:
            # Directory may not exist yet (or anymore); retry on next poll.
            return

        for filename in filenames:
            if not name_matches(filename):
                continue
            try:
                file_size = os.path.getsize(os.path.join(directory, filename))
            except OSError:
                # File disappeared between listing and stat, e.g. because
                # the download just finished.
                continue
            self.progress.emit((file_size, self.total_file_size))

    def monitor_file_size(self):
        """Poll both download locations until ``stop_event`` is set."""
        while not self.stop_event.is_set():
            if model_root_dir is not None:
                self._report_sizes(model_root_dir, lambda name: name.startswith("tmp"))

            self._report_sizes(
                self.incomplete_download_root,
                lambda name: name.endswith(".incomplete"),
            )

            # Wait on the event instead of sleeping so stop_monitoring()
            # returns promptly rather than after the full interval.
            self.stop_event.wait(2)

    def start_monitoring(self):
        """Clean stale temp files and start the polling thread."""
        self.clean_tmp_files()
        self.monitor_thread = threading.Thread(target=self.monitor_file_size)
        self.monitor_thread.start()

    def stop_monitoring(self):
        """Stop the polling thread, then report the download as complete."""
        if self.monitor_thread is not None:
            self.stop_event.set()
            self.monitor_thread.join()

        # Emitted after the thread has stopped so that no stale, smaller
        # progress value can follow the final 100% report.
        self.progress.emit((self.total_file_size, self.total_file_size))


def get_file_size(url, timeout=15):
    """Return the size in bytes reported for ``url`` by a HEAD request.

    Args:
        url: Address of the file; redirects are followed.
        timeout: Seconds to wait for the server before giving up, so that an
            unresponsive host cannot block the caller indefinitely.

    Returns:
        The ``Content-Length`` header as an int, or 0 when the server does
        not send one.

    Raises:
        requests.exceptions.RequestException: On connection errors, timeouts
            or an HTTP error status.
    """
    response = requests.head(url, allow_redirects=True, timeout=timeout)
    response.raise_for_status()
    # A missing header is reported as size 0 instead of raising KeyError.
    return int(response.headers.get('Content-Length', 0))


def download_from_huggingface(
        repo_id: str,
        allow_patterns: List[str],
        progress: pyqtSignal(tuple),
        num_large_files: int = 1
):
    """Download the selected files of a Hugging Face repository into the model root.

    The download runs in two steps: first every pattern except the first
    ``num_large_files`` ones, then the first ``num_large_files`` patterns
    while a ``HuggingfaceDownloadMonitor`` reports progress.

    Args:
        repo_id: Hugging Face repository id.
        allow_patterns: File patterns to download, largest files first.
        progress: Signal that receives ``(current, total)`` tuples.
        num_large_files: How many leading patterns are treated as large files.

    Returns:
        The directory returned by ``huggingface_hub.snapshot_download``, or
        an empty string when either download step fails.
    """
    progress.emit((0, 100))

    small_file_patterns = allow_patterns[num_large_files:]  # all, but largest
    large_file_patterns = allow_patterns[:num_large_files]  # largest

    try:
        model_root = huggingface_hub.snapshot_download(
            repo_id,
            allow_patterns=small_file_patterns,
            cache_dir=model_root_dir,
            etag_timeout=60
        )
    except Exception as exc:
        logging.exception(exc)
        return ""

    progress.emit((1, 100))

    largest_file_size = 0
    for pattern in large_file_patterns:
        try:
            file_url = huggingface_hub.hf_hub_url(repo_id, pattern)
            file_size = get_file_size(file_url)
        except (requests.exceptions.RequestException, KeyError, ValueError):
            # Not every candidate file exists in every repository, and the
            # size only drives the progress display, so a failed probe
            # (including a missing or malformed Content-Length) is skipped.
            continue

        largest_file_size = max(largest_file_size, file_size)

    model_download_monitor = HuggingfaceDownloadMonitor(model_root, progress, largest_file_size)
    model_download_monitor.start_monitoring()

    download_failed = False
    try:
        huggingface_hub.snapshot_download(
            repo_id,
            allow_patterns=large_file_patterns,
            cache_dir=model_root_dir,
            etag_timeout=60
        )
    except Exception as exc:
        logging.exception(exc)
        download_failed = True
    finally:
        # Always stop the polling thread, even if something other than
        # Exception (e.g. KeyboardInterrupt) propagates.
        model_download_monitor.stop_monitoring()

    if download_failed:
        # Cleanup to prevent incomplete downloads errors
        if os.path.exists(model_root):
            shutil.rmtree(model_root)
        return ""

    return model_root


def download_faster_whisper_model(
    model: TranscriptionModel, local_files_only=False, progress: pyqtSignal(tuple) = None
):
    """Locate or download the Faster Whisper files for ``model``.

    Args:
        model: Model description; ``whisper_model_size`` selects the
            repository and ``hugging_face_model_id`` is the repository id
            for the custom size.
        local_files_only: When True, only look in the local cache and never
            download.
        progress: Signal that receives ``(current, total)`` tuples while
            downloading; not used when ``local_files_only`` is True.

    Returns:
        The local model directory. When downloading, an empty string is
        returned on failure (see ``download_from_huggingface``).

    Raises:
        ValueError: If the custom size is selected without a repository id.
    """
    size = model.whisper_model_size.to_faster_whisper_model_size()
    custom_repo_id = model.hugging_face_model_id

    if size == WhisperModelSize.CUSTOM:
        # hugging_face_model_id is Optional, so reject None as well as "".
        if not custom_repo_id:
            raise ValueError("Custom model id is not provided")
        repo_id = custom_repo_id
    # Replicating models from faster-whisper code https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/utils.py#L29
    elif size == WhisperModelSize.LARGEV3TURBO:
        repo_id = "mobiuslabsgmbh/faster-whisper-large-v3-turbo"
    else:
        repo_id = f"Systran/faster-whisper-{size}"

    allow_patterns = [
        "model.bin",  # largest by size first
        "pytorch_model.bin",  # possible alternative model filename
        "config.json",
        "preprocessor_config.json",
        "tokenizer.json",
        "vocabulary.*",
    ]

    if local_files_only:
        return huggingface_hub.snapshot_download(
            repo_id,
            allow_patterns=allow_patterns,
            local_files_only=True,
            cache_dir=model_root_dir,
            etag_timeout=60
        )

    return download_from_huggingface(
        repo_id,
        allow_patterns=allow_patterns,
        progress=progress,
        num_large_files=2
    )


class ModelDownloader(QRunnable):
    """Runnable that downloads the files needed by a ``TranscriptionModel``.

    Results are reported through ``signals``:
        finished(str): Path of the downloaded model ("" when there is none).
        progress(tuple): ``(current, total)`` download progress.
        error(str): Message describing a failed download.
    """

    class Signals(QObject):
        finished = pyqtSignal(str)
        progress = pyqtSignal(tuple)  # (current, total)
        error = pyqtSignal(str)

    def __init__(self, model: TranscriptionModel, custom_model_url: Optional[str] = None):
        """
        Args:
            model: Model to download.
            custom_model_url: Direct download URL; only read for Whisper.cpp
                models, where it replaces the Hugging Face download.
        """
        super().__init__()

        self.is_coreml_supported = platform.system() == "Darwin" and platform.machine() == "arm64"
        self.signals = self.Signals()
        self.model = model
        self.stopped = False
        self.custom_model_url = custom_model_url

    def run(self) -> None:
        """Download the model for the configured backend and emit the result.

        Raises:
            Exception: If the model type is not a known ``ModelType``.
        """
        logging.debug("Downloading model: %s, %s", self.model, self.model.hugging_face_model_id)

        if self.model.model_type == ModelType.WHISPER_CPP:
            if self.custom_model_url:
                url = self.custom_model_url
                file_path = get_whisper_cpp_file_path(size=self.model.whisper_model_size)
                return self.download_model_to_path(url=url, file_path=file_path)

            repo_id = WHISPER_CPP_REPO_ID

            if self.model.whisper_model_size == WhisperModelSize.LUMII:
                repo_id = WHISPER_CPP_LUMII_REPO_ID

            model_name = self.model.whisper_model_size.to_whisper_cpp_model_size()

            whisper_cpp_model_files = [
                f"ggml-{model_name}.bin",
                "README.md"
            ]
            num_large_files = 1
            if self.is_coreml_supported:
                # On macOS arm64 the zipped CoreML encoder is fetched as well.
                whisper_cpp_model_files = [
                        f"ggml-{model_name}.bin",
                        f"ggml-{model_name}-encoder.mlmodelc.zip",
                        "README.md"
                ]
                num_large_files = 2

            model_path = download_from_huggingface(
                repo_id=repo_id,
                allow_patterns=whisper_cpp_model_files,
                progress=self.signals.progress,
                num_large_files=num_large_files
            )

            # download_from_huggingface returns "" on failure. Report it the
            # same way as the other Hugging Face based branches instead of
            # unzipping or announcing a path that does not exist.
            if model_path == "":
                self.signals.error.emit(_("Error"))
                self.signals.finished.emit(model_path)
                return

            if self.is_coreml_supported:
                with zipfile.ZipFile(
                        os.path.join(model_path, f"ggml-{model_name}-encoder.mlmodelc.zip"), 'r') as zip_ref:
                    zip_ref.extractall(model_path)

            self.signals.finished.emit(os.path.join(model_path, f"ggml-{model_name}.bin"))
            return

        if self.model.model_type == ModelType.WHISPER:
            url = whisper._MODELS[self.model.whisper_model_size.value]
            file_path = get_whisper_file_path(size=self.model.whisper_model_size)
            # The second-to-last URL path component is used as the checksum.
            expected_sha256 = url.split("/")[-2]
            return self.download_model_to_path(
                url=url, file_path=file_path, expected_sha256=expected_sha256
            )

        if self.model.model_type == ModelType.FASTER_WHISPER:
            model_path = download_faster_whisper_model(
                model=self.model,
                progress=self.signals.progress,
            )

            if model_path == "":
                self.signals.error.emit(_("Error"))

            self.signals.finished.emit(model_path)
            return

        if self.model.model_type == ModelType.HUGGING_FACE:
            model_path = download_from_huggingface(
                self.model.hugging_face_model_id,
                allow_patterns=HUGGING_FACE_MODEL_ALLOW_PATTERNS,
                progress=self.signals.progress,
                num_large_files=4
            )

            if model_path == "":
                self.signals.error.emit(_("Error"))

            self.signals.finished.emit(model_path)
            return

        if self.model.model_type == ModelType.OPEN_AI_WHISPER_API:
            self.signals.finished.emit("")
            return

        raise Exception("Invalid model type: " + self.model.model_type.value)

    def download_model_to_path(
        self, url: str, file_path: str, expected_sha256: Optional[str] = None
    ):
        """Download ``url`` to ``file_path`` and emit ``finished`` or ``error``.

        On any failure the (possibly partial) file is removed and the error
        is logged. Nothing is emitted when the download was cancelled.
        """
        try:
            downloaded = self.download_model(url, file_path, expected_sha256)
            if downloaded:
                self.signals.finished.emit(file_path)
        except requests.RequestException:
            self.signals.error.emit(_("A connection error occurred"))
            if os.path.exists(file_path):
                os.remove(file_path)
            logging.exception("")
        except Exception as exc:
            self.signals.error.emit(str(exc))
            if os.path.exists(file_path):
                os.remove(file_path)
            logging.exception(exc)

    @staticmethod
    def _sha256_of_file(file_path: str, chunk_size: int = 1024 * 1024) -> str:
        """Return the SHA256 hex digest of a file, read in chunks.

        Model files can be several gigabytes, so the file is never loaded
        into memory as a whole.
        """
        digest = hashlib.sha256()
        with open(file_path, "rb") as model_file:
            while chunk := model_file.read(chunk_size):
                digest.update(chunk)
        return digest.hexdigest()

    def download_model(
        self, url: str, file_path: str, expected_sha256: Optional[str]
    ) -> bool:
        """Download ``url`` to ``file_path`` unless a valid copy already exists.

        Args:
            url: Address of the model file.
            file_path: Destination path; parent directories are created.
            expected_sha256: Expected hex digest, or None to skip verification.

        Returns:
            True when the file is present and (if requested) verified, False
            when the download was cancelled via ``cancel()``.

        Raises:
            RuntimeError: If ``file_path`` exists but is not a regular file,
                or the downloaded file does not match ``expected_sha256``.
            requests.RequestException: On connection or HTTP errors.
        """
        logging.debug(f"Downloading model from {url} to {file_path}")

        os.makedirs(os.path.dirname(file_path), exist_ok=True)

        if os.path.exists(file_path) and not os.path.isfile(file_path):
            raise RuntimeError(f"{file_path} exists and is not a regular file")

        if os.path.isfile(file_path):
            if expected_sha256 is None:
                return True

            if self._sha256_of_file(file_path) == expected_sha256:
                return True

            warnings.warn(
                f"{file_path} exists, but the SHA256 checksum does not match; re-downloading the file"
            )

        cancelled = False

        # Downloads the model using the requests module instead of urllib to
        # use the certs from certifi when the app is running in frozen mode
        with requests.get(url, stream=True, timeout=15) as source, open(
            file_path, "wb"
        ) as output:
            source.raise_for_status()
            total_size = float(source.headers.get("Content-Length", 0))
            current = 0.0
            self.signals.progress.emit((current, total_size))
            for chunk in source.iter_content(chunk_size=8192):
                if self.stopped:
                    cancelled = True
                    break
                output.write(chunk)
                current += len(chunk)
                self.signals.progress.emit((current, total_size))

        if cancelled:
            # Remove the partial file (after it has been closed). Otherwise a
            # later call without a checksum would accept the truncated file
            # as a complete download.
            if os.path.exists(file_path):
                os.remove(file_path)
            return False

        if expected_sha256 is not None:
            if self._sha256_of_file(file_path) != expected_sha256:
                raise RuntimeError(
                    "Model has been downloaded but the SHA256 checksum does not match. Please retry loading the "
                    "model."
                )

        logging.debug("Downloaded model")

        return True

    def cancel(self):
        """Ask a running ``download_model`` loop to stop at the next chunk."""
        self.stopped = True


def get_custom_api_whisper_model(base_url: str):
    """Return the Whisper model name to request from a custom API endpoint.

    URLs containing "api.groq.com" get "whisper-large-v3"; every other
    endpoint gets "whisper-1".
    """
    targets_groq = "api.groq.com" in base_url
    return "whisper-large-v3" if targets_groq else "whisper-1"