diff --git a/iss/clustering/__init__.py b/iss/clustering/__init__.py
index 3a6c88f..9e37cc3 100644
--- a/iss/clustering/__init__.py
+++ b/iss/clustering/__init__.py
@@ -1,2 +1,4 @@
 from .AbstractClustering import AbstractClustering
-from .ClassicalClustering import ClassicalClustering
\ No newline at end of file
+from .ClassicalClustering import ClassicalClustering
+from .AdvancedClustering import AdvancedClustering
+from .N2DClustering import N2DClustering
\ No newline at end of file
diff --git a/iss/exec/clustering.py b/iss/exec/clustering.py
new file mode 100644
index 0000000..e1e2671
--- /dev/null
+++ b/iss/exec/clustering.py
@@ -0,0 +1,186 @@
+import os
+import numpy as np
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+
+import pandas as pd
+from bokeh.plotting import figure, output_file, show
+from bokeh.models import HoverTool, ColumnDataSource, CategoricalColorMapper
+
+from iss.init_config import CONFIG
+from iss.tools import Tools
+from iss.models import SimpleConvAutoEncoder
+from iss.clustering import ClassicalClustering, AdvancedClustering, N2DClustering
+
+## global variables
+
+_MODEL_TYPE = 'simple_conv'
+_MODEL_NAME = 'model_colab'
+_BATCH_SIZE = 496
+_N_BATCH = 10
+_DEBUG = True
+_CLUSTERING_TYPE = 'n2d'
+_OUTPUT_IMAGE_WIDTH = 96
+_OUTPUT_IMAGE_HEIGHT = 54
+_MOSAIC_NROW = 10
+_MOSAIC_NCOL_MAX = 10
+
+
+## Load the model
+CONFIG.get('models')[_MODEL_TYPE]['model_name'] = _MODEL_NAME
+model = SimpleConvAutoEncoder(CONFIG.get('models')[_MODEL_TYPE])
+model_config = CONFIG.get('models')[_MODEL_TYPE]
+
+## Load the images
+filenames = Tools.list_directory_filenames(os.path.join(CONFIG.get('directory')['autoencoder']['train']))
+generator_imgs = Tools.generator_np_picture_from_filenames(filenames, target_size = (model_config['input_height'], model_config['input_width']), batch = _BATCH_SIZE, nb_batch = _N_BATCH)
+
+pictures_id, pictures_preds = Tools.encoded_pictures_from_generator(generator_imgs, model)
+intermediate_output = pictures_preds.reshape((pictures_preds.shape[0], model_config['latent_width']*model_config['latent_height']*model_config['latent_channel']))
+
+
+if _DEBUG:
+    for i, p_id in enumerate(pictures_id[:2]):
+        print("%s: %s" % (p_id, pictures_preds[i]))
+    print(len(pictures_id))
+    print(len(intermediate_output))
+
+
+## Clustering
+if _CLUSTERING_TYPE == 'classical':
+    if _DEBUG:
+        print("Classical Clustering")
+    clustering = ClassicalClustering(CONFIG.get('clustering')['classical'], pictures_id, intermediate_output)
+    clustering.compute_pca()
+    clustering.compute_kmeans()
+    clustering.compute_kmeans_centers()
+    clustering.compute_cah()
+    clustering.compute_final_labels()
+    clustering.compute_tsne()
+    clustering.compute_colors()
+elif _CLUSTERING_TYPE == 'advanced':
+    if _DEBUG:
+        print("Advanced Clustering")
+    clustering = AdvancedClustering(CONFIG.get('clustering')['classical'], pictures_id, intermediate_output)
+elif _CLUSTERING_TYPE == 'n2d':
+    if _DEBUG:
+        print("Not2Deep Clustering")
+    clustering = N2DClustering(CONFIG.get('clustering')['n2d'], pictures_id, intermediate_output)
+    clustering.compute_umap()
+    clustering.compute_kmeans()
+    clustering.compute_final_labels()
+    clustering.compute_colors()
+
+silhouettes = clustering.compute_silhouette_score()
+clustering_res = clustering.get_results()
+
+if _DEBUG:
+    print(clustering_res[:2])
+    print(silhouettes)
+
+
+if _CLUSTERING_TYPE in ['classical']:
+    ## Graphs of PCA and final clusters
+    fig, ax = plt.subplots(figsize=(24, 14))
+    scatter = ax.scatter(clustering.pca_reduction[:, 0], clustering.pca_reduction[:, 1], c = clustering.colors)
+    legend1 = ax.legend(*scatter.legend_elements(), loc="lower left", title="Classes")
+    ax.add_artist(legend1)
+    plt.savefig(os.path.join(CONFIG.get('clustering')[_CLUSTERING_TYPE]['save_directory'], 'pca_clusters.png'))
+
+if _CLUSTERING_TYPE in ['classical']:
+    ## Graphs of TSNE and final clusters
+    fig, ax = plt.subplots(figsize=(24, 14))
+    classes = clustering.final_labels
+    scatter = ax.scatter(clustering.tsne_embedding[:, 0], clustering.tsne_embedding[:, 1], c = clustering.colors)
+    legend1 = ax.legend(*scatter.legend_elements(), loc="lower left", title="Classes")
+    ax.add_artist(legend1)
+    plt.savefig(os.path.join(CONFIG.get('clustering')[_CLUSTERING_TYPE]['save_directory'], 'tsne_clusters.png'))
+
+if _CLUSTERING_TYPE in ['n2d']:
+    ## Graphs of UMAP and final clusters
+    fig, ax = plt.subplots(figsize=(24, 14))
+    classes = clustering.final_labels
+    scatter = ax.scatter(clustering.umap_embedding[:, 0], clustering.umap_embedding[:, 1], c = clustering.colors)
+    legend1 = ax.legend(*scatter.legend_elements(), loc="lower left", title="Classes")
+    ax.add_artist(legend1)
+    plt.savefig(os.path.join(CONFIG.get('clustering')[_CLUSTERING_TYPE]['save_directory'], 'umap_clusters.png'))
+
+if _CLUSTERING_TYPE in ['n2d']:
+    filenames = [os.path.join(CONFIG.get('directory')['collections'], "%s.jpg" % one_res[0]) for one_res in clustering_res]
+    images_array = [Tools.read_np_picture(img_filename, target_size = (54, 96)) for img_filename in filenames]
+    base64_images = [Tools.base64_image(img) for img in images_array]
+
+    print(clustering.umap_embedding)
+    print(clustering.umap_embedding.shape)
+
+    x = clustering.umap_embedding[:, 0]
+    y = clustering.umap_embedding[:, 1]
+
+    df = pd.DataFrame({'x': x, 'y': y})
+    df['image'] = base64_images
+    df['label'] = clustering.final_labels.astype(str)
+    df['color'] = df['label'].apply(Tools.get_color_from_label)
+
+    datasource = ColumnDataSource(df)
+
+    output_file(os.path.join(CONFIG.get('clustering')[_CLUSTERING_TYPE]['save_directory'], 'umap_bokeh.html'))
+
+    plot_figure = figure(
+        title='UMAP projection of iss clusters',
+        # plot_width=1200,
+        # plot_height=1200,
+        tools=('pan, wheel_zoom, reset')
+    )
+
+    plot_figure.add_tools(HoverTool(tooltips="""
+    <div>
+        <div>
+            <img src='@image' style='float: left; margin: 5px 5px 5px 5px'/>
+        </div>
+        <span style='font-size: 16px'>Cluster:</span>
+        <span style='font-size: 16px'>@label</span>
+    </div>
+ """)) + + + plot_figure.circle( + 'x', + 'y', + source=datasource, + color=dict(field='color'), + line_alpha=0.6, + fill_alpha=0.6, + size=4 + ) + + show(plot_figure) + + +if _CLUSTERING_TYPE in ['classical']: + ## Dendogram + fig, ax = plt.subplots(figsize=(24, 14)) + plt.title('Hierarchical Clustering Dendrogram') + Tools.plot_dendrogram(clustering.cah_fit, labels=clustering.cah_labels) + plt.savefig(os.path.join(CONFIG.get('clustering')[_CLUSTERING_TYPE]['save_directory'], 'dendograms.png')) + + +## Silhouette +fig, ax = plt.subplots(figsize=(12, 7)) +ax.bar(silhouettes.keys(), silhouettes.values(), align='center') +ax.set_xticks(list(silhouettes.keys())) +ax.set_xticklabels(list(silhouettes.keys())) +plt.savefig(os.path.join(CONFIG.get('clustering')[_CLUSTERING_TYPE]['save_directory'], 'silhouettes_score.png')) + + +## Mosaic of each cluster +clusters_id = np.unique(clustering.final_labels) +for cluster_id in clusters_id: + cluster_image_filenames = [os.path.join(CONFIG.get('directory')['collections'], "%s.jpg" % one_res[0]) for one_res in clustering_res if one_res[1] == cluster_id] + + images_array = [Tools.read_np_picture(img_filename, target_size = (_OUTPUT_IMAGE_HEIGHT, _OUTPUT_IMAGE_WIDTH)) for img_filename in cluster_image_filenames] + + img = Tools.display_mosaic(images_array, nrow = _MOSAIC_NROW, ncol_max = _MOSAIC_NCOL_MAX) + img.save(os.path.join(CONFIG.get('clustering')[_CLUSTERING_TYPE]['save_directory'], "cluster_%s.png" % str(cluster_id).zfill(2)), "PNG") diff --git a/iss/tools/tools.py b/iss/tools/tools.py index 35d4ef9..9d65049 100644 --- a/iss/tools/tools.py +++ b/iss/tools/tools.py @@ -3,7 +3,11 @@ import PIL import os import numpy as np +from io import BytesIO +import base64 +from scipy.cluster.hierarchy import dendrogram from keras_preprocessing.image.utils import load_img +import matplotlib as plt class Tools: @@ -26,6 +30,26 @@ class Tools: def display_index_picture(array, index = 0): return Tools.display_one_picture(array[index]) + @staticmethod + def display_mosaic(array, nrow = 5, ncol_max = 10): + + tmp = [] + i = 0 + image_col = [] + while i < len(array): + tmp.append(array[i]) + + if len(tmp) % nrow == 0 and i > 0: + image_col.append(np.concatenate(tuple(tmp))) + tmp = [] + if len(image_col) == ncol_max: + break + i += 1 + if not image_col: + image_col.append(np.concatenate(tuple(tmp))) + image = np.concatenate(tuple(image_col), axis = 1) + return Tools.display_one_picture(image) + @staticmethod def create_dir_if_not_exists(path): if not os.path.exists(path): @@ -35,10 +59,16 @@ class Tools: @staticmethod def encoded_pictures_from_generator(generator, model): - predictions_tuple = tuple([model.get_encoded_prediction(imgs) for imgs in generator]) - predictions = np.concatenate(predictions_tuple, axis = 0) + predictions_list = [] + predictions_id = [] + for imgs in generator: + predictions_id.append(imgs[0]) + predictions_list.append(model.get_encoded_prediction(imgs[1])) + + predictions = np.concatenate(tuple(predictions_list), axis = 0) + predictions_id = [os.path.splitext(os.path.basename(id))[0] for sub_id in predictions_id for id in sub_id] - return predictions + return predictions_id, predictions @staticmethod def read_np_picture(path, target_size = None, scale = 1): @@ -51,7 +81,9 @@ class Tools: @staticmethod def list_directory_filenames(path): filenames = os.listdir(path) - filenames = [path + f for f in filenames] + np.random.seed(33213) + np.random.shuffle(filenames) + filenames = [os.path.join(path,f) for f in filenames] return filenames @@ 
-65,15 +97,45 @@ class Tools: nb_batch = div[0] + 1 * (div[1] != 0) for i in range(nb_batch): - # for i in [75, 76]: i_debut = i*batch i_fin = min(i_debut + batch, max_n) - print("i_debut:" + str(i_debut)) - print("i_fin:" + str(i_fin)) - yield np.array([Tools.read_np_picture(f, target_size, scale) for f in filenames[i_debut:i_fin]]) + yield (filenames[i_debut:i_fin], np.array([Tools.read_np_picture(f, target_size, scale) for f in filenames[i_debut:i_fin]])) + + @staticmethod + def bytes_image(array): + image = Tools.display_one_picture(array) + buffer = BytesIO() + image.save(buffer, format='png') + im_bytes = buffer.getvalue() + + return im_bytes + + @staticmethod + def base64_image(array): + for_encoding = Tools.bytes_image(array) + return 'data:image/png;base64,' + base64.b64encode(for_encoding).decode() + + @staticmethod + def get_color_from_label(label, n_labels = 50, palette = 'viridis'): + cmap = plt.cm.get_cmap(palette, n_labels) + return plt.colors.to_hex(cmap(int(label))) + @staticmethod + def plot_dendrogram(model, **kwargs): + # Children of hierarchical clustering + children = model.children_ + # Distances between each pair of children + # Since we don't have this information, we can use a uniform one for plotting + distance = np.arange(children.shape[0]) + # The number of observations contained in each cluster level + no_of_observations = np.arange(2, children.shape[0]+2) + # Create linkage matrix and then plot the dendrogram + linkage_matrix = np.column_stack([children, distance, no_of_observations]).astype(float) + + # Plot the corresponding dendrogram + dendrogram(linkage_matrix, **kwargs) \ No newline at end of file
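
Usage note: Tools.plot_dendrogram follows the common scikit-learn recipe for drawing an agglomerative clustering tree. The fitted estimator exposes children_ (the merge order) but no merge distances, so placeholder distances and counts are substituted purely so scipy's dendrogram can render the hierarchy. A minimal sketch of how the new helper is expected to be called, assuming clustering.cah_fit is a fitted sklearn AgglomerativeClustering; the toy data and output filename below are illustrative only, not part of the patch:

    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.cluster import AgglomerativeClustering

    from iss.tools import Tools

    # Toy stand-in for the latent vectors produced by the autoencoder.
    X = np.random.RandomState(0).rand(20, 8)
    # Plays the role of clustering.cah_fit in iss/exec/clustering.py.
    cah = AgglomerativeClustering(n_clusters=4).fit(X)

    fig, ax = plt.subplots(figsize=(8, 5))
    plt.title('Hierarchical Clustering Dendrogram (toy data)')
    # Builds the placeholder linkage matrix internally, then calls scipy's dendrogram.
    Tools.plot_dendrogram(cah, labels=cah.labels_)
    plt.savefig('dendrogram_example.png')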