# -*- coding: utf-8 -*-


class AbstractClustering:
    """Common state and interface for the clustering pipelines.

    Subclasses are expected to fill ``final_labels`` in
    ``compute_final_labels`` and to override the other methods that raise
    ``NotImplementedError`` here.
    """

    def __init__(self, config, pictures_id, pictures_np):
        """Store the inputs and prepare the save directory when one is set.

        Args:
            config: mapping of settings; ``config['save_directory']`` is read
                here and may be absent or empty.
            pictures_id: picture identifiers, zipped with ``pictures_np`` by
                ``get_results``.
            pictures_np: data of each picture, in the same order as
                ``pictures_id``.
        """
        self.config = config
        self.pictures_id = pictures_id
        self.pictures_np = pictures_np
        self.final_labels = None
        self.colors = None

        # A missing key is treated like an empty value: nothing to prepare.
        try:
            save_directory = self.config['save_directory']
        except KeyError:
            save_directory = None

        if save_directory:
            # Imported at call time so this module has no import-time
            # dependency on iss.tools.
            from iss.tools import Tools
            Tools.create_dir_if_not_exists(save_directory)

    def _require_final_labels(self):
        """Return ``final_labels``, failing clearly if they are not set yet.

        Raises:
            ValueError: if ``final_labels`` is still ``None``.
        """
        if self.final_labels is None:
            raise ValueError(
                "final_labels is not set; call compute_final_labels() first"
            )
        return self.final_labels

    def compute_final_labels(self):
        """Fill ``self.final_labels``; must be overridden by subclasses."""
        raise NotImplementedError

    def get_results(self):
        """Return a list of ``(picture_id, final_label, picture)`` tuples.

        Raises:
            ValueError: if the final labels have not been computed.
        """
        labels = self._require_final_labels()
        return list(zip(self.pictures_id, labels, self.pictures_np))

    def compute_silhouette_score(self):
        """Compute silhouette scores; must be overridden by subclasses."""
        raise NotImplementedError

    def compute_colors(self):
        """Store one color per picture in ``self.colors`` and return ``self``.

        Each color comes from ``Tools.get_color_from_label(label, n_classes)``
        where ``n_classes`` is the number of distinct final labels.

        Raises:
            ValueError: if the final labels have not been computed.
        """
        from iss.tools import Tools

        labels = self._require_final_labels()
        n_classes = len(set(labels))
        self.colors = [
            Tools.get_color_from_label(label, n_classes) for label in labels
        ]
        return self

    def save(self):
        """Persist the fitted models; must be overridden by subclasses."""
        raise NotImplementedError

    def load(self):
        """Restore the fitted models; must be overridden by subclasses."""
        raise NotImplementedError
# -*- coding: utf-8 -*-

import os

import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from iss.clustering import AbstractClustering
from iss.tools import Tools

try:
    import joblib
except ImportError:
    # Older scikit-learn releases ship joblib under sklearn.externals.
    from sklearn.externals import joblib


class AdvancedClustering(AbstractClustering):
    """PCA reduction, consensus-initialised KMeans, then DBSCAN on the centers.

    Pipeline order: ``compute_pca`` -> ``compute_kmeans`` ->
    ``compute_kmeans_centers`` -> ``compute_dbscan`` ->
    ``compute_final_labels``.
    """

    def __init__(self, config, pictures_id=None, pictures_np=None):
        """Read the per-step settings from ``config``.

        Args:
            config: mapping with the keys ``'PCA'``, ``'strong_kmeans'``,
                ``'dbscan'``, ``'version'`` and ``'save_directory'``.
            pictures_id: picture identifiers (optional, e.g. before ``load``).
            pictures_np: picture data passed to PCA (optional).
        """
        super().__init__(config, pictures_id, pictures_np)

        self.pca_fit = None
        self.pca_args = self.config['PCA']
        self.pca_reduction = None
        self.pca_save_name = "PCA_model_v%s.pkl" % (self.config['version'])

        self.kmeans_fit = None
        self.kmeans_args = self.config['strong_kmeans']
        self.kmeans_labels = None
        self.kmeans_centers = []
        self.kmeans_save_name = "kmeans_model_v%s.pkl" % (self.config['version'])

        self.dbscan_fit = None
        self.dbscan_args = self.config['dbscan']
        self.dbscan_labels = None
        self.dbscan_save_name = "dbscan_model_v%s.pkl" % (self.config['version'])

        self.final_labels = None

    def compute_pca(self):
        """Fit PCA on ``pictures_np`` and keep the projection in ``pca_reduction``.

        Also seeds numpy's global RNG with ``pca_args['random_state']``.
        """
        np.random.seed(self.pca_args['random_state'])
        self.pca_fit = PCA(**self.pca_args)
        self.pca_fit.fit(self.pictures_np)
        self.pca_reduction = self.pca_fit.transform(self.pictures_np)
        return self

    def compute_kmeans(self):
        """Run several KMeans, then one final KMeans seeded by their consensus.

        ``kmeans_args['iter']`` KMeans models are fitted with a cluster count
        drawn in ``[low, high)``. Points that received the same label in every
        run form a group; groups with at least ``threshold`` points provide
        the initial centers (their mean in PCA space) of the final KMeans.

        Raises:
            ValueError: if no group reaches ``threshold`` points.
        """
        n_runs = self.kmeans_args['iter']
        low = self.kmeans_args['low']
        high = self.kmeans_args['high']
        threshold = self.kmeans_args['threshold']
        seed = self.kmeans_args['seed']
        run_cols = ['run_%s' % i for i in range(n_runs)]

        np.random.seed(seed * 2)
        n_clusters_per_run = np.random.randint(low=low, high=high, size=n_runs)

        runs = pd.DataFrame()
        for i, col in enumerate(run_cols):
            km_model = KMeans(n_clusters=int(n_clusters_per_run[i]),
                              random_state=seed + i)
            runs[col] = km_model.fit(self.pca_reduction).labels_

        # One id per distinct combination of labels across all runs.
        group_id = runs.groupby(by=run_cols).ngroup()
        group_size = group_id.map(group_id.value_counts())
        kept_groups = group_id[group_size >= threshold].unique()

        if len(kept_groups) == 0:
            raise ValueError(
                "no consensus group reaches the threshold of %s points"
                % threshold
            )

        init_centers = np.vstack([
            np.mean(self.pca_reduction[(group_id == gid).values, :], axis=0)
            for gid in kept_groups
        ])

        self.kmeans_fit = KMeans(n_clusters=init_centers.shape[0],
                                 init=init_centers,
                                 n_init=1,
                                 random_state=seed + n_runs)
        self.kmeans_fit.fit(self.pca_reduction)
        self.kmeans_labels = self.kmeans_fit.labels_
        return self

    def compute_kmeans_centers(self):
        """Store the mean PCA vector of each KMeans cluster in ``kmeans_centers``."""
        # Rebuilt from scratch so that calling this twice does not duplicate centers.
        self.kmeans_centers = [
            np.mean(self.pca_reduction[self.kmeans_labels == cl], axis=0)
            for cl in np.unique(self.kmeans_labels)
        ]
        return self

    def compute_dbscan(self):
        """Cluster the KMeans centers with DBSCAN and keep its labels."""
        self.dbscan_fit = DBSCAN(**self.dbscan_args)
        self.dbscan_fit.fit(np.asarray(self.kmeans_centers))
        self.dbscan_labels = self.dbscan_fit.labels_
        return self

    def compute_final_labels(self):
        """Give each picture the DBSCAN label of its KMeans cluster."""
        self.final_labels = np.asarray(self.dbscan_labels)[self.kmeans_labels]
        return self

    def compute_dbscan_labels(self):
        """Former name of ``compute_final_labels``, kept for existing callers."""
        return self.compute_final_labels()

    def get_results(self):
        """Return ``(picture_id, final_label, kmeans_label, picture)`` tuples as a list."""
        return list(self.get_zip_results())

    def get_zip_results(self):
        """Return the same tuples as ``get_results`` as a lazy ``zip``."""
        return zip(self.pictures_id, self.final_labels, self.kmeans_labels, self.pictures_np)

    def save(self):
        """Dump the PCA, KMeans and DBSCAN models into ``config['save_directory']``."""
        save_directory = self.config['save_directory']
        Tools.create_dir_if_not_exists(save_directory)

        joblib.dump(self.pca_fit, os.path.join(save_directory, self.pca_save_name))
        joblib.dump(self.kmeans_fit, os.path.join(save_directory, self.kmeans_save_name))
        joblib.dump(self.dbscan_fit, os.path.join(save_directory, self.dbscan_save_name))

    def load(self):
        """Load the PCA, KMeans and DBSCAN models from ``config['save_directory']``."""
        save_directory = self.config['save_directory']
        self.pca_fit = joblib.load(os.path.join(save_directory, self.pca_save_name))
        self.kmeans_fit = joblib.load(os.path.join(save_directory, self.kmeans_save_name))
        self.dbscan_fit = joblib.load(os.path.join(save_directory, self.dbscan_save_name))
# -*- coding: utf-8 -*-

import os

import numpy as np
import umap
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples

from iss.clustering import AbstractClustering
from iss.tools import Tools

try:
    import joblib
except ImportError:
    # Older scikit-learn releases ship joblib under sklearn.externals.
    from sklearn.externals import joblib


class N2DClustering(AbstractClustering):
    """UMAP embedding followed by KMeans.

    Cf: https://github.com/rymc/n2d
    """

    def __init__(self, config, pictures_id=None, pictures_np=None):
        """Read the UMAP and KMeans settings from ``config``.

        Args:
            config: mapping with the keys ``'umap'``, ``'kmeans'``,
                ``'version'`` and ``'save_directory'``.
            pictures_id: picture identifiers (optional, e.g. before ``load``).
            pictures_np: picture data passed to UMAP (optional).
        """
        super().__init__(config, pictures_id, pictures_np)

        self.umap_args = self.config['umap']
        self.umap_fit = None
        self.umap_embedding = None
        self.umap_save_name = "umap_model_v%s.pkl" % (self.config['version'])

        self.kmeans_fit = None
        self.kmeans_args = self.config['kmeans']
        self.kmeans_labels = None
        self.kmeans_centers = []
        self.kmeans_save_name = "kmeans_model_v%s.pkl" % (self.config['version'])

        # Filled by compute_silhouette_score.
        self.silhouette_score = None
        self.silhouette_score_labels = {}

    def compute_umap(self):
        """Fit UMAP on ``pictures_np`` and keep the embedding."""
        self.umap_fit = umap.UMAP(**self.umap_args)
        self.umap_embedding = self.umap_fit.fit_transform(self.pictures_np)
        return self

    def compute_kmeans(self):
        """Fit KMeans on the UMAP embedding and keep its labels."""
        self.kmeans_fit = KMeans(**self.kmeans_args)
        self.kmeans_fit.fit(self.umap_embedding)
        self.kmeans_labels = self.kmeans_fit.labels_
        return self

    def compute_final_labels(self):
        """Use the KMeans labels as final labels."""
        self.final_labels = self.kmeans_labels
        return self

    def compute_silhouette_score(self):
        """Return the mean silhouette value of each final cluster.

        Per-sample values are kept in ``self.silhouette_score`` and the
        per-cluster means in ``self.silhouette_score_labels``.
        """
        # NOTE(review): the score is computed on pictures_np, not on the UMAP
        # embedding the labels come from - confirm this is intended.
        # Coerced to an array so the boolean mask below works for list labels too.
        labels = np.asarray(self.final_labels)
        self.silhouette_score = silhouette_samples(self.pictures_np, labels)
        self.silhouette_score_labels = {
            cluster: np.mean(self.silhouette_score[labels == cluster])
            for cluster in np.unique(labels)
        }
        return self.silhouette_score_labels

    def save(self):
        """Dump the UMAP and KMeans models into ``config['save_directory']``."""
        save_directory = self.config['save_directory']
        Tools.create_dir_if_not_exists(save_directory)

        joblib.dump(self.umap_fit, os.path.join(save_directory, self.umap_save_name))
        joblib.dump(self.kmeans_fit, os.path.join(save_directory, self.kmeans_save_name))

    def load(self):
        """Load the UMAP and KMeans models from ``config['save_directory']``."""
        save_directory = self.config['save_directory']
        self.umap_fit = joblib.load(os.path.join(save_directory, self.umap_save_name))
        self.kmeans_fit = joblib.load(os.path.join(save_directory, self.kmeans_save_name))