execution clustering

2024-05-03 14:13:10 +02:00 · 2019-11-16 18:30:50 +01:00 · 2019-11-16 18:30:50 +01:00 · 8592ee01ab
parent e7f6206a40
commit 8592ee01ab
3 changed files with 259 additions and 9 deletions
--- a/iss/clustering/init.py
+++ b/iss/clustering/init.py
@ -1,2 +1,4 @@
 from .AbstractClustering import AbstractClustering
-from .ClassicalClustering import ClassicalClustering
+from .ClassicalClustering import ClassicalClustering
 from .AdvancedClustering import AdvancedClustering
 from .N2DClustering import N2DClustering
--- a/iss/exec/clustering.py
+++ b/iss/exec/clustering.py
@ -0,0 +1,186 @@
 import os
 import numpy as np
 import matplotlib
 matplotlib.use('Agg')
 import matplotlib.pyplot as plt
 import pandas as pd
 from bokeh.plotting import figure, output_file, show
 from bokeh.models import HoverTool, ColumnDataSource, CategoricalColorMapper
 from iss.init_config import CONFIG
 from iss.tools import Tools
 from iss.models import SimpleConvAutoEncoder
 from iss.clustering import ClassicalClustering, AdvancedClustering, N2DClustering
 ## Global settings for this clustering run
 # Key under CONFIG.get('models') selecting the model configuration to use.
 _MODEL_TYPE = 'simple_conv'
 # Written into that configuration's 'model_name' entry before the model is built.
 _MODEL_NAME = 'model_colab'
 # Passed as `batch` to Tools.generator_np_picture_from_filenames.
 _BATCH_SIZE = 496
 # Passed as `nb_batch` to the same generator.
 _N_BATCH = 10
 # When True, intermediate values are printed along the way.
 _DEBUG = True
 # Selects the clustering branch ('classical', 'advanced' or 'n2d') and the
 # CONFIG.get('clustering') entry whose 'save_directory' receives the outputs.
 _CLUSTERING_TYPE = 'n2d'
 # Target width / height used when pictures are re-read for the cluster mosaics.
 _OUTPUT_IMAGE_WIDTH = 96
 _OUTPUT_IMAGE_HEIGHT = 54
 # Passed as `nrow` / `ncol_max` to Tools.display_mosaic.
 _MOSAIC_NROW = 10
 _MOSAIC_NCOL_MAX = 10
 ## Load the model
 # The configured model name is overridden before the auto-encoder is built.
 CONFIG.get('models')[_MODEL_TYPE]['model_name'] = _MODEL_NAME
 model = SimpleConvAutoEncoder(CONFIG.get('models')[_MODEL_TYPE])
 model_config = CONFIG.get('models')[_MODEL_TYPE]
 ## Load the images and encode them
 train_directory = os.path.join(CONFIG.get('directory')['autoencoder']['train'])
 filenames = Tools.list_directory_filenames(train_directory)
 input_size = (model_config['input_height'], model_config['input_width'])
 generator_imgs = Tools.generator_np_picture_from_filenames(
     filenames,
     target_size = input_size,
     batch = _BATCH_SIZE,
     nb_batch = _N_BATCH,
 )
 pictures_id, pictures_preds = Tools.encoded_pictures_from_generator(generator_imgs, model)
 # Flatten each encoded picture into a single latent vector (one row per picture).
 latent_size = model_config['latent_width'] * model_config['latent_height'] * model_config['latent_channel']
 n_pictures = pictures_preds.shape[0]
 intermediate_output = pictures_preds.reshape((n_pictures, latent_size))
 if _DEBUG:
     # Show the first two ids with their encodings, then both collection sizes.
     for position, picture_id in enumerate(pictures_id[:2]):
         print("%s: %s" % (picture_id, pictures_preds[position]))
     print(len(pictures_id))
     print(len(intermediate_output))
 ## Clustering
 # Build the clustering object for the selected method from the picture ids and
 # their flattened latent vectors, then run that method's computation steps.
 if _CLUSTERING_TYPE == 'classical':
     if _DEBUG:
         print("Classical Clustering")
     clustering = ClassicalClustering(CONFIG.get('clustering')['classical'], pictures_id, intermediate_output)
     clustering.compute_pca()
     clustering.compute_kmeans()
     clustering.compute_kmeans_centers()
     clustering.compute_cah()
     clustering.compute_final_labels()
     clustering.compute_tsne()
     clustering.compute_colors()
 elif _CLUSTERING_TYPE == 'advanced':
     if _DEBUG:
         print("Advanced Clustering")
     # NOTE(review): this branch reads the 'classical' configuration and runs no
     # compute_* step before the scoring below -- confirm that this is intended.
     clustering = AdvancedClustering(CONFIG.get('clustering')['classical'], pictures_id, intermediate_output)
 elif _CLUSTERING_TYPE == 'n2d':
     if _DEBUG:
         print("Not2Deep Clustering")
     clustering = N2DClustering(CONFIG.get('clustering')['n2d'], pictures_id, intermediate_output)
     clustering.compute_umap()
     clustering.compute_kmeans()
     clustering.compute_final_labels()
     clustering.compute_colors()
 else:
     # Fail here with an explicit message rather than with a NameError on
     # `clustering` a few lines further down.
     raise ValueError("Unsupported clustering type: %r" % (_CLUSTERING_TYPE,))
 # Quality scores and per-picture results, both consumed by the reporting code.
 silhouettes = clustering.compute_silhouette_score()
 clustering_res = clustering.get_results()
 if _DEBUG:
     print(clustering_res[:2])
     print(silhouettes)
 def _save_cluster_scatter(embedding, filename):
     """Save a 2-D scatter of `embedding`, coloured by cluster, as `filename`.

     The first two columns of `embedding` are used as x and y, the point
     colours come from `clustering.colors`, and the image is written to the
     'save_directory' configured for the active clustering type. The figure
     is closed once saved so that it does not stay registered in pyplot.
     """
     fig, ax = plt.subplots(figsize=(24, 14))
     scatter = ax.scatter(embedding[:, 0], embedding[:, 1], c = clustering.colors)
     legend = ax.legend(*scatter.legend_elements(), loc="lower left", title="Classes")
     ax.add_artist(legend)
     fig.savefig(os.path.join(CONFIG.get('clustering')[_CLUSTERING_TYPE]['save_directory'], filename))
     plt.close(fig)

 if _CLUSTERING_TYPE == 'classical':
     ## Graphs of the PCA reduction and of the t-SNE embedding with final clusters
     _save_cluster_scatter(clustering.pca_reduction, 'pca_clusters.png')
     _save_cluster_scatter(clustering.tsne_embedding, 'tsne_clusters.png')
 elif _CLUSTERING_TYPE == 'n2d':
     ## Graph of the UMAP embedding with final clusters
     _save_cluster_scatter(clustering.umap_embedding, 'umap_clusters.png')
 if _CLUSTERING_TYPE == 'n2d':
     ## Interactive bokeh scatter of the UMAP embedding with picture tooltips
     thumbnail_filenames = [os.path.join(CONFIG.get('directory')['collections'], "%s.jpg" % one_res[0]) for one_res in clustering_res]
     # Thumbnails reuse the output-size constants instead of a second hard-coded (54, 96).
     images_array = [Tools.read_np_picture(img_filename, target_size = (_OUTPUT_IMAGE_HEIGHT, _OUTPUT_IMAGE_WIDTH)) for img_filename in thumbnail_filenames]
     base64_images = [Tools.base64_image(img) for img in images_array]
     if _DEBUG:
         # Diagnostics follow the same switch as every other print in this script.
         print(clustering.umap_embedding)
         print(clustering.umap_embedding.shape)
     # NOTE(review): rows of umap_embedding, final_labels and clustering_res are
     # combined positionally -- assumes all three share the same ordering; confirm.
     x = clustering.umap_embedding[:, 0]
     y = clustering.umap_embedding[:, 1]
     df = pd.DataFrame({'x': x, 'y': y})
     df['image'] = base64_images
     df['label'] = clustering.final_labels.astype(str)
     df['color'] = df['label'].apply(Tools.get_color_from_label)
     datasource = ColumnDataSource(df)
     output_file(os.path.join(CONFIG.get('clustering')[_CLUSTERING_TYPE]['save_directory'], 'umap_bokeh.html'))
     plot_figure = figure(
         title='UMAP projection of iss clusters',
         tools=('pan, wheel_zoom, reset')
     )
     # The hover tooltip shows the embedded picture next to its cluster label.
     plot_figure.add_tools(HoverTool(tooltips="""
    <div>
        <div>
            <img src='@image' style='float: left; margin: 5px 5px 5px 5px'/>
        </div>
        <div>
            <span style='font-size: 16px'>Cluster:</span>
            <span style='font-size: 18px'>@label</span>
        </div>
    </div>
    """))
     plot_figure.circle(
         'x',
         'y',
         source=datasource,
         color=dict(field='color'),
         line_alpha=0.6,
         fill_alpha=0.6,
         size=4
     )
     show(plot_figure)
 if _CLUSTERING_TYPE == 'classical':
     ## Dendrogram of the hierarchical clustering step
     fig, ax = plt.subplots(figsize=(24, 14))
     ax.set_title('Hierarchical Clustering Dendrogram')
     # Drawn on the axes that were just created (they are the current axes).
     Tools.plot_dendrogram(clustering.cah_fit, labels=clustering.cah_labels)
     dendrogram_directory = CONFIG.get('clustering')[_CLUSTERING_TYPE]['save_directory']
     fig.savefig(os.path.join(dendrogram_directory, 'dendograms.png'))
 ## Silhouette
 # Bar chart of the silhouette scores, one bar per key of `silhouettes`.
 # The dict views are materialised once so that the bar positions, the ticks
 # and the tick labels all use the same plain lists.
 silhouette_keys = list(silhouettes.keys())
 silhouette_scores = list(silhouettes.values())
 fig, ax = plt.subplots(figsize=(12, 7))
 ax.bar(silhouette_keys, silhouette_scores, align='center')
 ax.set_xticks(silhouette_keys)
 ax.set_xticklabels(silhouette_keys)
 fig.savefig(os.path.join(CONFIG.get('clustering')[_CLUSTERING_TYPE]['save_directory'], 'silhouettes_score.png'))
 # Release the figure once it is on disk.
 plt.close(fig)
 ## Mosaic of each cluster
 # One PNG per distinct final label, built from the pictures assigned to it.
 clusters_id = np.unique(clustering.final_labels)
 for cluster_id in clusters_id:
     # Field 0 of a result row names the picture file, field 1 holds its cluster.
     member_ids = [one_res[0] for one_res in clustering_res if one_res[1] == cluster_id]
     cluster_image_filenames = [os.path.join(CONFIG.get('directory')['collections'], "%s.jpg" % member_id) for member_id in member_ids]
     images_array = [
         Tools.read_np_picture(img_filename, target_size = (_OUTPUT_IMAGE_HEIGHT, _OUTPUT_IMAGE_WIDTH))
         for img_filename in cluster_image_filenames
     ]
     mosaic = Tools.display_mosaic(images_array, nrow = _MOSAIC_NROW, ncol_max = _MOSAIC_NCOL_MAX)
     mosaic_name = "cluster_%s.png" % str(cluster_id).zfill(2)
     mosaic.save(os.path.join(CONFIG.get('clustering')[_CLUSTERING_TYPE]['save_directory'], mosaic_name), "PNG")
--- a/iss/tools/tools.py
+++ b/iss/tools/tools.py
@ -3,7 +3,11 @@
 import PIL
 import os
 import numpy as np
 from io import BytesIO
 import base64
 from scipy.cluster.hierarchy import dendrogram
 from keras_preprocessing.image.utils import load_img
 import matplotlib as plt
 class Tools:
@ -26,6 +30,26 @@ class Tools:
 	def display_index_picture(array, index = 0):
 		"""Display the picture stored at position `index` of `array`.

 		The selected element is handed to Tools.display_one_picture and that
 		call's result is returned unchanged.
 		"""
 		selected_picture = array[index]
 		return Tools.display_one_picture(selected_picture)
 	@staticmethod
 	def display_mosaic(array, nrow = 5, ncol_max = 10):
 		"""Assemble pictures into a single mosaic and display it.

 		Pictures are taken in order, `nrow` at a time. Each group is joined
 		with np.concatenate along axis 0 to form one column, and the columns
 		are then joined along axis 1. At most `ncol_max` columns are built; a
 		value of zero or less puts no limit on the column count.

 		Trailing pictures that do not fill a whole column are left out, except
 		when there are fewer than `nrow` pictures in total: they then form the
 		single column of the mosaic.

 		array -- sequence of picture arrays supporting len() and slicing
 		         (presumably all of the same shape -- TODO confirm with callers)
 		nrow -- number of pictures per column, at least 1
 		ncol_max -- maximum number of columns

 		Returns whatever Tools.display_one_picture returns for the mosaic.
 		Raises ValueError when `nrow` is below 1 or `array` is empty.
 		"""
 		if nrow < 1:
 			raise ValueError("nrow must be at least 1, got %r" % (nrow,))
 		n_pictures = len(array)
 		if n_pictures == 0:
 			raise ValueError("cannot build a mosaic from an empty collection of pictures")
 		# Only complete columns are kept, capped by ncol_max when it is positive.
 		n_columns = n_pictures // nrow
 		if ncol_max > 0:
 			n_columns = min(n_columns, ncol_max)
 		if n_columns == 0:
 			# Not enough pictures for one full column: show them all in one.
 			image_col = [np.concatenate(tuple(array))]
 		else:
 			# Slicing by exact multiples of nrow keeps every column the same
 			# height, including the first one when nrow is 1.
 			image_col = [
 				np.concatenate(tuple(array[start:start + nrow]))
 				for start in range(0, n_columns * nrow, nrow)
 			]
 		image = np.concatenate(tuple(image_col), axis = 1)
 		return Tools.display_one_picture(image)
 	@staticmethod
 	def create_dir_if_not_exists(path):
 		if not os.path.exists(path):
@ -35,10 +59,16 @@ class Tools:
 	@staticmethod
 	def encoded_pictures_from_generator(generator, model):
 		# Encode every batch yielded by `generator` and return (ids, predictions).
 		# Each yielded item is indexed as a pair: item[0] is a collection of
 		# file names, item[1] is what gets passed to model.get_encoded_prediction.
-		predictions_tuple = tuple([model.get_encoded_prediction(imgs) for imgs in generator])
+		predictions_list = []
-		predictions = np.concatenate(predictions_tuple, axis = 0)
+		predictions_id = []
 		# Collect, batch by batch, the file names and the encoded predictions.
 		for imgs in generator:
 			predictions_id.append(imgs[0])
 			predictions_list.append(model.get_encoded_prediction(imgs[1]))
 		# Join the per-batch predictions into one array along axis 0.
 		predictions = np.concatenate(tuple(predictions_list), axis = 0)
 		# Flatten the per-batch name lists and reduce each name to its base name
 		# without extension. NOTE(review): the loop variable `id` shadows the
 		# builtin of the same name inside this comprehension.
 		predictions_id = [os.path.splitext(os.path.basename(id))[0] for sub_id in predictions_id for id in sub_id]
-		return predictions
+		return predictions_id, predictions
 	@staticmethod
 	def read_np_picture(path, target_size = None, scale = 1):
@ -51,7 +81,9 @@ class Tools:
 	@staticmethod
 	def list_directory_filenames(path):
 		# Return the entries of directory `path`, each joined to `path`, in a
 		# shuffled order. Entries are taken as os.listdir returns them; no
 		# filtering is applied here.
 		filenames = os.listdir(path)
-		filenames = [path + f for f in filenames]
+		np.random.seed(33213)
 		# The fixed seed above resets NumPy's global random state, so the
 		# in-place shuffle below uses the same permutation on every call.
 		np.random.shuffle(filenames)
 		filenames = [os.path.join(path,f) for f in filenames]
 		return filenames
@ -65,15 +97,45 @@ class Tools:
 			nb_batch = div[0] + 1 * (div[1] != 0)
 		for i in range(nb_batch):
 		# for i in [75, 76]:
 			i_debut = i*batch
 			i_fin = min(i_debut + batch, max_n)
-			print("i_debut:" + str(i_debut))
+			yield (filenames[i_debut:i_fin], np.array([Tools.read_np_picture(f, target_size, scale) for f in filenames[i_debut:i_fin]]))
-			print("i_fin:" + str(i_fin))
+
-			yield np.array([Tools.read_np_picture(f, target_size, scale) for f in filenames[i_debut:i_fin]])
+	@staticmethod
 	def bytes_image(array):
 		"""Return the bytes of the picture built from `array`, saved as PNG.

 		The picture object comes from Tools.display_one_picture; it is saved
 		with format='png' into an in-memory buffer whose content is returned.
 		"""
 		picture = Tools.display_one_picture(array)
 		with BytesIO() as png_buffer:
 			picture.save(png_buffer, format='png')
 			return png_buffer.getvalue()
 	@staticmethod
 	def base64_image(array):
 		"""Return the picture built from `array` as a base64 PNG data URI.

 		The bytes produced by Tools.bytes_image are base64-encoded and
 		prefixed with 'data:image/png;base64,'.
 		"""
 		encoded = base64.b64encode(Tools.bytes_image(array))
 		return 'data:image/png;base64,' + encoded.decode()
 	@staticmethod
 	def get_color_from_label(label, n_labels = 50, palette = 'viridis'):
 		"""Return the hex colour string assigned to a cluster label.

 		The colormap named `palette` is resampled to `n_labels` entries and
 		indexed with int(label), so `label` may be an integer or its string
 		form; the resulting colour is converted with matplotlib.colors.to_hex.

 		label -- cluster label, convertible with int()
 		n_labels -- number of entries the colormap is resampled to
 		palette -- name of a registered matplotlib colormap
 		"""
 		# In this module `plt` is the top-level matplotlib package (it is
 		# imported as `matplotlib as plt`), not matplotlib.pyplot.
 		if hasattr(plt.cm, 'get_cmap'):
 			cmap = plt.cm.get_cmap(palette, n_labels)
 		else:
 			# matplotlib.cm.get_cmap is absent from recent releases; the
 			# colormap registry provides the equivalent lookup and resampling.
 			cmap = plt.colormaps[palette].resampled(n_labels)
 		return plt.colors.to_hex(cmap(int(label)))
 	@staticmethod
 	def plot_dendrogram(model, **kwargs):
 		"""Draw a dendrogram for a fitted hierarchical clustering model.

 		A linkage matrix is built from `model.children_` and handed to
 		scipy's dendrogram together with `kwargs`. Merge distances and cluster
 		sizes are not read from the model: evenly increasing placeholder
 		values are used for both, so only the merge structure is meaningful.

 		model -- object exposing a `children_` array with one row per merge
 		kwargs -- forwarded unchanged to scipy.cluster.hierarchy.dendrogram
 		"""
 		merges = model.children_
 		n_merges = merges.shape[0]
 		# Placeholder distance for each merge: 0, 1, 2, ...
 		placeholder_distances = np.arange(n_merges)
 		# Placeholder observation count for each merge: 2, 3, 4, ...
 		placeholder_counts = np.arange(2, n_merges + 2)
 		# Linkage rows are [child_a, child_b, distance, count] as floats.
 		linkage_matrix = np.column_stack((merges, placeholder_distances, placeholder_counts)).astype(float)
 		dendrogram(linkage_matrix, **kwargs)