From 45dbfd8db77c19c6e06d0bcafec87cbb69e454b5 Mon Sep 17 00:00:00 2001 From: Francois Vieille Date: Sun, 8 Dec 2019 02:24:20 +0100 Subject: [PATCH] prepare clustering industrialisation --- iss/clustering/AbstractClustering.py | 5 +- iss/clustering/ClassicalClustering.py | 20 +- iss/clustering/N2DClustering.py | 12 +- iss/exec/clustering.py | 333 +++++++++++++++----------- 4 files changed, 222 insertions(+), 148 deletions(-) diff --git a/iss/clustering/AbstractClustering.py b/iss/clustering/AbstractClustering.py index d76d5fd..c1b9706 100644 --- a/iss/clustering/AbstractClustering.py +++ b/iss/clustering/AbstractClustering.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +import os from iss.tools import Tools class AbstractClustering: @@ -6,13 +7,13 @@ class AbstractClustering: def __init__(self, config, pictures_id, pictures_np): self.config = config + self.save_directory = os.path.join(self.config['save_directory'], '%s_%s_%s' % (self.config['model']['type'], self.config['model']['name'], self.config['version'])) self.pictures_id = pictures_id self.pictures_np = pictures_np self.final_labels = None self.colors = None - if self.config['save_directory']: - Tools.create_dir_if_not_exists(self.config['save_directory']) + Tools.create_dir_if_not_exists(self.save_directory) def compute_final_labels(self): raise NotImplementedError diff --git a/iss/clustering/ClassicalClustering.py b/iss/clustering/ClassicalClustering.py index 7c0f004..1c957d7 100644 --- a/iss/clustering/ClassicalClustering.py +++ b/iss/clustering/ClassicalClustering.py @@ -20,19 +20,19 @@ class ClassicalClustering(AbstractClustering): self.pca_fit = None self.pca_args = self.config['PCA'] self.pca_reduction = None - self.pca_save_name = "PCA_model_v%s.pkl" % (self.config['version']) + self.pca_save_name = "PCA_model.pkl" self.kmeans_fit = None self.kmeans_args = self.config['kmeans'] self.kmeans_labels = None self.kmeans_centers = [] - self.kmeans_save_name = "kmeans_model_v%s.pkl" % (self.config['version']) 
+ self.kmeans_save_name = "kmeans_model.pkl" self.cah_fit = None self.cah_args = self.config['CAH'] self.cah_labels = None - self.cah_save_name = "cah_model_v%s.pkl" % (self.config['version']) + self.cah_save_name = "cah_model.pkl" self.tsne_fit = None self.tsne_args = self.config['TSNE'] @@ -88,15 +88,15 @@ class ClassicalClustering(AbstractClustering): def save(self): - Tools.create_dir_if_not_exists(self.config['save_directory']) + Tools.create_dir_if_not_exists(self.save_directory) - joblib.dump(self.pca_fit, os.path.join(self.config['save_directory'], self.pca_save_name)) - joblib.dump(self.kmeans_fit, os.path.join(self.config['save_directory'], self.kmeans_save_name)) - joblib.dump(self.cah_fit, os.path.join(self.config['save_directory'], self.cah_save_name)) + joblib.dump(self.pca_fit, os.path.join(self.save_directory, self.pca_save_name)) + joblib.dump(self.kmeans_fit, os.path.join(self.save_directory, self.kmeans_save_name)) + joblib.dump(self.cah_fit, os.path.join(self.save_directory, self.cah_save_name)) def load(self): - self.pca_fit = joblib.load(os.path.join(self.config['save_directory'], self.pca_save_name)) - self.kmeans_fit = joblib.load(os.path.join(self.config['save_directory'], self.kmeans_save_name)) - self.cah_fit = joblib.load(os.path.join(self.config['save_directory'], self.cah_save_name)) + self.pca_fit = joblib.load(os.path.join(self.save_directory, self.pca_save_name)) + self.kmeans_fit = joblib.load(os.path.join(self.save_directory, self.kmeans_save_name)) + self.cah_fit = joblib.load(os.path.join(self.save_directory, self.cah_save_name)) diff --git a/iss/clustering/N2DClustering.py b/iss/clustering/N2DClustering.py index 4839ca2..9e28f73 100644 --- a/iss/clustering/N2DClustering.py +++ b/iss/clustering/N2DClustering.py @@ -21,12 +21,13 @@ class N2DClustering(AbstractClustering): self.umap_args = self.config['umap'] self.umap_fit = None self.umap_embedding = None + self.umap_save_name = 'UMAP_model.pkl' self.kmeans_fit = None 
self.kmeans_args = self.config['kmeans'] self.kmeans_labels = None self.kmeans_centers = [] - self.kmeans_save_name = "kmeans_model_v%s.pkl" % (self.config['version']) + self.kmeans_save_name = "kmeans_model.pkl" def compute_umap(self): @@ -50,3 +51,12 @@ class N2DClustering(AbstractClustering): cluster in np.unique(self.final_labels)} return self.silhouette_score_labels + def save(self): + Tools.create_dir_if_not_exists(self.save_directory) + + joblib.dump(self.umap_fit, os.path.join(self.save_directory, self.umap_save_name)) + joblib.dump(self.kmeans_fit, os.path.join(self.save_directory, self.kmeans_save_name)) + + def load(self): + self.umap_fit = joblib.load(os.path.join(self.save_directory, self.umap_save_name)) + self.kmeans_fit = joblib.load(os.path.join(self.save_directory, self.kmeans_save_name)) \ No newline at end of file diff --git a/iss/exec/clustering.py b/iss/exec/clustering.py index e1e2671..3b3069a 100644 --- a/iss/exec/clustering.py +++ b/iss/exec/clustering.py @@ -10,177 +10,240 @@ from bokeh.models import HoverTool, ColumnDataSource, CategoricalColorMapper from iss.init_config import CONFIG from iss.tools import Tools -from iss.models import SimpleConvAutoEncoder +from iss.models import SimpleConvAutoEncoder, SimpleAutoEncoder from iss.clustering import ClassicalClustering, AdvancedClustering, N2DClustering -## variable globales -_MODEL_TYPE = 'simple_conv' -_MODEL_NAME = 'model_colab' -_BATCH_SIZE = 496 -_N_BATCH = 10 _DEBUG = True -_CLUSTERING_TYPE = 'n2d' -_OUTPUT_IMAGE_WIDTH = 96 -_OUTPUT_IMAGE_HEIGHT = 54 -_MOSAIC_NROW = 10 -_MOSAIC_NCOL_MAX = 10 -## Charger le modèle -CONFIG.get('models')[_MODEL_TYPE]['model_name'] = _MODEL_NAME -model = SimpleConvAutoEncoder(CONFIG.get('models')[_MODEL_TYPE]) -model_config = CONFIG.get('models')[_MODEL_TYPE] +def load_model(config, clustering_type): + """ + Load model according to config + """ -## Charger les images -filenames =
Tools.list_directory_filenames(os.path.join(CONFIG.get('directory')['autoencoder']['train'])) -generator_imgs = Tools.generator_np_picture_from_filenames(filenames, target_size = (model_config['input_height'], model_config['input_width']), batch = _BATCH_SIZE, nb_batch = _N_BATCH) + model_type = config.get('clustering')[clustering_type]['model']['type'] + model_name = config.get('clustering')[clustering_type]['model']['name'] + config.get('models')[model_type]['model_name'] = model_name -pictures_id, pictures_preds = Tools.encoded_pictures_from_generator(generator_imgs, model) -intermediate_output = pictures_preds.reshape((pictures_preds.shape[0], model_config['latent_width']*model_config['latent_height']*model_config['latent_channel'])) + if model_type == 'simple_conv': + model = SimpleConvAutoEncoder(config.get('models')[model_type]) + elif model_type == 'simple': + model = SimpleAutoEncoder(config.get('models')[model_type]) + else: + raise Exception + + model_config = config.get('models')[model_type] + + return model, model_config -if _DEBUG: - for i, p_id in enumerate(pictures_id[:2]): - print("%s: %s" % (p_id, pictures_preds[i])) - print(len(pictures_id)) - print(len(intermediate_output)) +def load_images(config, clustering_type, model, model_config, batch_size, n_batch): + """ + load images and predictions + """ + model_type = config.get('clustering')[clustering_type]['model']['type'] + filenames = Tools.list_directory_filenames(os.path.join(config.get('sampling')['autoencoder']['directory']['train'])) + generator_imgs = Tools.generator_np_picture_from_filenames(filenames, target_size = (model_config['input_height'], model_config['input_width']), batch = batch_size, nb_batch = n_batch) + + pictures_id, pictures_preds = Tools.encoded_pictures_from_generator(generator_imgs, model) + if model_type in ['simple_conv']: + intermediate_output = pictures_preds.reshape((pictures_preds.shape[0], 
model_config['latent_width']*model_config['latent_height']*model_config['latent_channel'])) + else: + intermediate_output = pictures_preds + + return pictures_id, intermediate_output -## Clustering -if _CLUSTERING_TYPE == 'classical': - if _DEBUG: - print("Classical Clustering") - clustering = ClassicalClustering(CONFIG.get('clustering')['classical'], pictures_id, intermediate_output) - clustering.compute_pca() - clustering.compute_kmeans() - clustering.compute_kmeans_centers() - clustering.compute_cah() - clustering.compute_final_labels() - clustering.compute_tsne() - clustering.compute_colors() -elif _CLUSTERING_TYPE == 'advanced': - if _DEBUG: - print("Advanced Clustering") - clustering = AdvancedClustering(CONFIG.get('clustering')['classical'], pictures_id, intermediate_output) -elif _CLUSTERING_TYPE == 'n2d': - if _DEBUG: - print("Not2Deep Clustering") - clustering = N2DClustering(CONFIG.get('clustering')['n2d'], pictures_id, intermediate_output) - clustering.compute_umap() - clustering.compute_kmeans() - clustering.compute_final_labels() - clustering.compute_colors() +def run_clustering(config, clustering_type, pictures_id, intermediate_output): + """ + Apply clustering on images + """ -silhouettes = clustering.compute_silhouette_score() -clustering_res = clustering.get_results() + if clustering_type == 'classical': + if _DEBUG: + print("Classical Clustering") + clustering = ClassicalClustering(config.get('clustering')['classical'], pictures_id, intermediate_output) + clustering.compute_pca() + clustering.compute_kmeans() + clustering.compute_kmeans_centers() + clustering.compute_cah() + clustering.compute_final_labels() + clustering.compute_tsne() + clustering.compute_colors() + elif clustering_type == 'advanced': + if _DEBUG: + print("Advanced Clustering") + clustering = AdvancedClustering(config.get('clustering')['classical'], pictures_id, intermediate_output) + elif clustering_type == 'n2d': + if _DEBUG: + print("Not2Deep Clustering") + clustering = 
N2DClustering(config.get('clustering')['n2d'], pictures_id, intermediate_output) + clustering.compute_umap() + clustering.compute_kmeans() + clustering.compute_final_labels() + clustering.compute_colors() -if _DEBUG: - print(clustering_res[:2]) - print(silhouettes) + return clustering -if _CLUSTERING_TYPE in ['classical']: - ## Graphs of PCA and final clusters - fig, ax = plt.subplots(figsize=(24, 14)) - scatter = ax.scatter(clustering.pca_reduction[:, 0], clustering.pca_reduction[:, 1], c = clustering.colors) - legend1 = ax.legend(*scatter.legend_elements(), loc="lower left", title="Classes") - ax.add_artist(legend1) - plt.savefig(os.path.join(CONFIG.get('clustering')[_CLUSTERING_TYPE]['save_directory'], 'pca_clusters.png')) +def run_plots(config, clustering_type, clustering): + """ + Plots specifics graphs + """ -if _CLUSTERING_TYPE in ['classical']: - ## Graphs of TSNE and final clusters - fig, ax = plt.subplots(figsize=(24, 14)) - classes = clustering.final_labels - scatter = ax.scatter(clustering.tsne_embedding[:, 0], clustering.tsne_embedding[:, 1], c = clustering.colors) - legend1 = ax.legend(*scatter.legend_elements(), loc="lower left", title="Classes") - ax.add_artist(legend1) - plt.savefig(os.path.join(CONFIG.get('clustering')[_CLUSTERING_TYPE]['save_directory'], 'tsne_clusters.png')) + if clustering_type in ['classical']: + ## Graphs of PCA and final clusters + fig, ax = plt.subplots(figsize=(24, 14)) + scatter = ax.scatter(clustering.pca_reduction[:, 0], clustering.pca_reduction[:, 1], c = clustering.colors) + legend1 = ax.legend(*scatter.legend_elements(), loc="lower left", title="Classes") + ax.add_artist(legend1) + plt.savefig(os.path.join(clustering.save_directory, 'pca_clusters.png')) -if _CLUSTERING_TYPE in ['n2d']: - ## Graphs of TSNE and final clusters - fig, ax = plt.subplots(figsize=(24, 14)) - classes = clustering.final_labels - scatter = ax.scatter(clustering.umap_embedding[:, 0], clustering.umap_embedding[:, 1], c = clustering.colors) - 
legend1 = ax.legend(*scatter.legend_elements(), loc="lower left", title="Classes") - ax.add_artist(legend1) - plt.savefig(os.path.join(CONFIG.get('clustering')[_CLUSTERING_TYPE]['save_directory'], 'umap_clusters.png')) + if clustering_type in ['classical']: + ## Graphs of TSNE and final clusters + fig, ax = plt.subplots(figsize=(24, 14)) + classes = clustering.final_labels + scatter = ax.scatter(clustering.tsne_embedding[:, 0], clustering.tsne_embedding[:, 1], c = clustering.colors) + legend1 = ax.legend(*scatter.legend_elements(), loc="lower left", title="Classes") + ax.add_artist(legend1) + plt.savefig(os.path.join(clustering.save_directory, 'tsne_clusters.png')) -if _CLUSTERING_TYPE in ['n2d']: - filenames = [os.path.join(CONFIG.get('directory')['collections'], "%s.jpg" % one_res[0]) for one_res in clustering_res] - images_array = [Tools.read_np_picture(img_filename, target_size = (54, 96)) for img_filename in filenames] - base64_images = [Tools.base64_image(img) for img in images_array] + if clustering_type in ['n2d']: + ## Graphs of TSNE and final clusters + fig, ax = plt.subplots(figsize=(24, 14)) + classes = clustering.final_labels + scatter = ax.scatter(clustering.umap_embedding[:, 0], clustering.umap_embedding[:, 1], c = clustering.colors) + legend1 = ax.legend(*scatter.legend_elements(), loc="lower left", title="Classes") + ax.add_artist(legend1) + plt.savefig(os.path.join(clustering.save_directory, 'umap_clusters.png')) - print(clustering.umap_embedding) - print(clustering.umap_embedding.shape) + if clustering_type in ['n2d', 'classical']: + filenames = [os.path.join(config.get('directory')['collections'], "%s.jpg" % one_res[0]) for one_res in clustering.get_results()] + images_array = [Tools.read_np_picture(img_filename, target_size = (54, 96)) for img_filename in filenames] + base64_images = [Tools.base64_image(img) for img in images_array] - x = clustering.umap_embedding[:, 0] - y = clustering.umap_embedding[:, 1] + if clustering_type == 'n2d': + x = 
clustering.umap_embedding[:, 0] + y = clustering.umap_embedding[:, 1] + html_file = 'umap_bokeh.html' + title = 'UMAP projection of iss clusters' + elif clustering_type == 'classical': + x = clustering.tsne_embedding[:, 0] + y = clustering.tsne_embedding[:, 1] + html_file = 'tsne_bokeh.html' + title = 't-SNE projection of iss clusters' - df = pd.DataFrame({'x': x, 'y': y}) - df['image'] = base64_images - df['label'] = clustering.final_labels.astype(str) - df['color'] = df['label'].apply(Tools.get_color_from_label) + df = pd.DataFrame({'x': x, 'y': y}) + df['image'] = base64_images + df['label'] = clustering.final_labels.astype(str) + df['color'] = df['label'].apply(Tools.get_color_from_label) - datasource = ColumnDataSource(df) + datasource = ColumnDataSource(df) - output_file(os.path.join(CONFIG.get('clustering')[_CLUSTERING_TYPE]['save_directory'], 'umap_bokeh.html')) + output_file(os.path.join(clustering.save_directory, html_file)) - plot_figure = figure( - title='UMAP projection of iss clusters', - # plot_width=1200, - # plot_height=1200, - tools=('pan, wheel_zoom, reset') - ) + plot_figure = figure( + title=title, + # plot_width=1200, + # plot_height=1200, + tools=('pan, wheel_zoom, reset') + ) - plot_figure.add_tools(HoverTool(tooltips=""" -
+ plot_figure.add_tools(HoverTool(tooltips="""
- +
+ +
+
+ Cluster: + @label +
-
- Cluster: - @label -
-
- """)) + """)) - plot_figure.circle( - 'x', - 'y', - source=datasource, - color=dict(field='color'), - line_alpha=0.6, - fill_alpha=0.6, - size=4 - ) + plot_figure.circle( + 'x', + 'y', + source=datasource, + color=dict(field='color'), + line_alpha=0.6, + fill_alpha=0.6, + size=4 + ) - show(plot_figure) + show(plot_figure) -if _CLUSTERING_TYPE in ['classical']: - ## Dendogram - fig, ax = plt.subplots(figsize=(24, 14)) - plt.title('Hierarchical Clustering Dendrogram') - Tools.plot_dendrogram(clustering.cah_fit, labels=clustering.cah_labels) - plt.savefig(os.path.join(CONFIG.get('clustering')[_CLUSTERING_TYPE]['save_directory'], 'dendograms.png')) + if clustering_type in ['classical']: + ## Dendogram + fig, ax = plt.subplots(figsize=(24, 14)) + plt.title('Hierarchical Clustering Dendrogram') + Tools.plot_dendrogram(clustering.cah_fit, labels=clustering.cah_labels) + plt.savefig(os.path.join(clustering.save_directory, 'dendograms.png')) + + return True + +def plot_silhouette(config, clustering_type, clustering): + + silhouettes = clustering.compute_silhouette_score() + + fig, ax = plt.subplots(figsize=(12, 7)) + ax.bar(silhouettes.keys(), silhouettes.values(), align='center') + ax.set_xticks(list(silhouettes.keys())) + ax.set_xticklabels(list(silhouettes.keys())) + plt.savefig(os.path.join(clustering.save_directory, 'silhouettes_score.png')) + + return silhouettes -## Silhouette -fig, ax = plt.subplots(figsize=(12, 7)) -ax.bar(silhouettes.keys(), silhouettes.values(), align='center') -ax.set_xticks(list(silhouettes.keys())) -ax.set_xticklabels(list(silhouettes.keys())) -plt.savefig(os.path.join(CONFIG.get('clustering')[_CLUSTERING_TYPE]['save_directory'], 'silhouettes_score.png')) +def plot_mosaics(config, clustering_type, clustering, output_image_width, output_image_height, mosaic_nrow, mosaic_ncol_max): + """ + Mosaic of each cluster + """ + clusters_id = np.unique(clustering.final_labels) + clustering_res = clustering.get_results() + + for cluster_id in 
clusters_id: + cluster_image_filenames = [os.path.join(config.get('directory')['collections'], "%s.jpg" % one_res[0]) for one_res in clustering_res if one_res[1] == cluster_id] + + images_array = [Tools.read_np_picture(img_filename, target_size = (output_image_height, output_image_width)) for img_filename in cluster_image_filenames] + + img = Tools.display_mosaic(images_array, nrow = mosaic_nrow, ncol_max = mosaic_ncol_max) + img.save(os.path.join(clustering.save_directory, "cluster_%s.png" % str(cluster_id).zfill(2)), "PNG") + + return clusters_id -## Mosaic of each cluster -clusters_id = np.unique(clustering.final_labels) -for cluster_id in clusters_id: - cluster_image_filenames = [os.path.join(CONFIG.get('directory')['collections'], "%s.jpg" % one_res[0]) for one_res in clustering_res if one_res[1] == cluster_id] +def main(): + _CLUSTERING_TYPE = 'classical' + _BATCH_SIZE = 496 + _N_BATCH = 1 + _PLOTS = True + _MOSAICS = True + _SILHOUETTE = True + _OUTPUT_IMAGE_WIDTH = 96 + _OUTPUT_IMAGE_HEIGHT = 54 + _MOSAIC_NROW = 10 + _MOSAIC_NCOL_MAX = 10 - images_array = [Tools.read_np_picture(img_filename, target_size = (_OUTPUT_IMAGE_HEIGHT, _OUTPUT_IMAGE_WIDTH)) for img_filename in cluster_image_filenames] + model, model_config = load_model(CONFIG, _CLUSTERING_TYPE) + pictures_id, intermediate_output = load_images(CONFIG, _CLUSTERING_TYPE, model, model_config, _BATCH_SIZE, _N_BATCH) + + clustering = run_clustering(CONFIG, _CLUSTERING_TYPE, pictures_id, intermediate_output) + + clustering.save() - img = Tools.display_mosaic(images_array, nrow = _MOSAIC_NROW, ncol_max = _MOSAIC_NCOL_MAX) - img.save(os.path.join(CONFIG.get('clustering')[_CLUSTERING_TYPE]['save_directory'], "cluster_%s.png" % str(cluster_id).zfill(2)), "PNG") + if _PLOTS: + run_plots(CONFIG, _CLUSTERING_TYPE, clustering) + + if _SILHOUETTE: + plot_silhouette(CONFIG, _CLUSTERING_TYPE, clustering) + + if _MOSAICS: + plot_mosaics(CONFIG, _CLUSTERING_TYPE, clustering, _OUTPUT_IMAGE_WIDTH, _OUTPUT_IMAGE_HEIGHT, 
_MOSAIC_NROW, _MOSAIC_NCOL_MAX) + + +if __name__ == '__main__': + main()