mirror of
https://github.com/prise6/smart-iss-posts
synced 2024-05-03 14:13:10 +02:00
execution clustering
This commit is contained in:
parent
e7f6206a40
commit
8592ee01ab
|
@ -1,2 +1,4 @@
|
||||||
from .AbstractClustering import AbstractClustering
|
from .AbstractClustering import AbstractClustering
|
||||||
from .ClassicalClustering import ClassicalClustering
|
from .ClassicalClustering import ClassicalClustering
|
||||||
|
from .AdvancedClustering import AdvancedClustering
|
||||||
|
from .N2DClustering import N2DClustering
|
186
iss/exec/clustering.py
Normal file
186
iss/exec/clustering.py
Normal file
|
@ -0,0 +1,186 @@
|
||||||
|
import os
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib
|
||||||
|
matplotlib.use('Agg')
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from bokeh.plotting import figure, output_file, show
|
||||||
|
from bokeh.models import HoverTool, ColumnDataSource, CategoricalColorMapper
|
||||||
|
|
||||||
|
from iss.init_config import CONFIG
|
||||||
|
from iss.tools import Tools
|
||||||
|
from iss.models import SimpleConvAutoEncoder
|
||||||
|
from iss.clustering import ClassicalClustering, AdvancedClustering, N2DClustering
|
||||||
|
|
||||||
|
## Global settings for this run

_MODEL_TYPE = 'simple_conv'
_MODEL_NAME = 'model_colab'
_BATCH_SIZE = 496
_N_BATCH = 10
_DEBUG = True
_CLUSTERING_TYPE = 'n2d'
_OUTPUT_IMAGE_WIDTH = 96
_OUTPUT_IMAGE_HEIGHT = 54
_MOSAIC_NROW = 10
_MOSAIC_NCOL_MAX = 10


## Load the model
# The model name is written into the shared configuration before the
# auto-encoder is built from that same configuration entry.
CONFIG.get('models')[_MODEL_TYPE]['model_name'] = _MODEL_NAME
model = SimpleConvAutoEncoder(CONFIG.get('models')[_MODEL_TYPE])
model_config = CONFIG.get('models')[_MODEL_TYPE]

## Load the images
train_directory = os.path.join(CONFIG.get('directory')['autoencoder']['train'])
filenames = Tools.list_directory_filenames(train_directory)

input_size = (model_config['input_height'], model_config['input_width'])
generator_imgs = Tools.generator_np_picture_from_filenames(
    filenames,
    target_size = input_size,
    batch = _BATCH_SIZE,
    nb_batch = _N_BATCH
)

## Encode the pictures, then flatten each encoding into a single row
pictures_id, pictures_preds = Tools.encoded_pictures_from_generator(generator_imgs, model)

latent_size = model_config['latent_width']*model_config['latent_height']*model_config['latent_channel']
intermediate_output = pictures_preds.reshape((pictures_preds.shape[0], latent_size))


if _DEBUG:
    # Show the first two identifiers with their encoding, then the totals.
    for p_id, p_pred in zip(pictures_id[:2], pictures_preds):
        print("%s: %s" % (p_id, p_pred))
    print(len(pictures_id))
    print(len(intermediate_output))
|
||||||
|
|
||||||
|
|
||||||
|
## Clustering
# Build the clustering object selected by _CLUSTERING_TYPE and run the
# computation steps that type needs before results can be read.
if _CLUSTERING_TYPE == 'classical':
    if _DEBUG:
        print("Classical Clustering")
    clustering = ClassicalClustering(CONFIG.get('clustering')['classical'], pictures_id, intermediate_output)
    clustering.compute_pca()
    clustering.compute_kmeans()
    clustering.compute_kmeans_centers()
    clustering.compute_cah()
    clustering.compute_final_labels()
    clustering.compute_tsne()
    clustering.compute_colors()
elif _CLUSTERING_TYPE == 'advanced':
    if _DEBUG:
        print("Advanced Clustering")
    # NOTE(review): this branch reads the 'classical' configuration section and
    # runs no compute_* step before the silhouette/result calls below --
    # confirm that this is intended.
    clustering = AdvancedClustering(CONFIG.get('clustering')['classical'], pictures_id, intermediate_output)
elif _CLUSTERING_TYPE == 'n2d':
    if _DEBUG:
        print("Not2Deep Clustering")
    clustering = N2DClustering(CONFIG.get('clustering')['n2d'], pictures_id, intermediate_output)
    clustering.compute_umap()
    clustering.compute_kmeans()
    clustering.compute_final_labels()
    clustering.compute_colors()
else:
    # Fail here with a clear message instead of a NameError on `clustering`
    # a few lines further down.
    raise ValueError("Unknown clustering type: %r" % (_CLUSTERING_TYPE,))

silhouettes = clustering.compute_silhouette_score()
clustering_res = clustering.get_results()

if _DEBUG:
    print(clustering_res[:2])
    print(silhouettes)
|
||||||
|
|
||||||
|
|
||||||
|
## Figures
# Every figure of this section is written to the save directory configured
# for the selected clustering type.
_save_directory = CONFIG.get('clustering')[_CLUSTERING_TYPE]['save_directory']


def _save_cluster_scatter(embedding, colors, filename):
    """Save a scatter plot of the first two columns of *embedding*.

    Points are coloured with *colors*, a "Classes" legend is added, and the
    PNG is written as *filename* inside the save directory. The figure is
    closed afterwards so that successive plots do not accumulate in memory.
    """
    fig, ax = plt.subplots(figsize=(24, 14))
    scatter = ax.scatter(embedding[:, 0], embedding[:, 1], c = colors)
    legend = ax.legend(*scatter.legend_elements(), loc="lower left", title="Classes")
    ax.add_artist(legend)
    fig.savefig(os.path.join(_save_directory, filename))
    plt.close(fig)


if _CLUSTERING_TYPE in ['classical']:
    ## Graphs of PCA and final clusters
    _save_cluster_scatter(clustering.pca_reduction, clustering.colors, 'pca_clusters.png')

    ## Graphs of TSNE and final clusters
    _save_cluster_scatter(clustering.tsne_embedding, clustering.colors, 'tsne_clusters.png')

if _CLUSTERING_TYPE in ['n2d']:
    ## Graphs of UMAP and final clusters
    _save_cluster_scatter(clustering.umap_embedding, clustering.colors, 'umap_clusters.png')

    ## Interactive UMAP projection with picture thumbnails in the hover tooltip
    filenames = [os.path.join(CONFIG.get('directory')['collections'], "%s.jpg" % one_res[0]) for one_res in clustering_res]
    # Thumbnails use the output size declared in the global settings rather
    # than a second hard-coded copy of the same numbers.
    images_array = [Tools.read_np_picture(img_filename, target_size = (_OUTPUT_IMAGE_HEIGHT, _OUTPUT_IMAGE_WIDTH)) for img_filename in filenames]
    base64_images = [Tools.base64_image(img) for img in images_array]

    if _DEBUG:
        print(clustering.umap_embedding)
        print(clustering.umap_embedding.shape)

    df = pd.DataFrame({
        'x': clustering.umap_embedding[:, 0],
        'y': clustering.umap_embedding[:, 1],
    })
    df['image'] = base64_images
    df['label'] = clustering.final_labels.astype(str)
    df['color'] = df['label'].apply(Tools.get_color_from_label)

    datasource = ColumnDataSource(df)

    output_file(os.path.join(_save_directory, 'umap_bokeh.html'))

    plot_figure = figure(
        title='UMAP projection of iss clusters',
        tools=('pan, wheel_zoom, reset')
    )

    plot_figure.add_tools(HoverTool(tooltips="""
<div>
<div>
<img src='@image' style='float: left; margin: 5px 5px 5px 5px'/>
</div>
<div>
<span style='font-size: 16px'>Cluster:</span>
<span style='font-size: 18px'>@label</span>
</div>
</div>
"""))

    plot_figure.circle(
        'x',
        'y',
        source=datasource,
        color=dict(field='color'),
        line_alpha=0.6,
        fill_alpha=0.6,
        size=4
    )

    show(plot_figure)

if _CLUSTERING_TYPE in ['classical']:
    ## Dendrogram
    fig, ax = plt.subplots(figsize=(24, 14))
    plt.title('Hierarchical Clustering Dendrogram')
    Tools.plot_dendrogram(clustering.cah_fit, labels=clustering.cah_labels)
    fig.savefig(os.path.join(_save_directory, 'dendograms.png'))
    plt.close(fig)

## Silhouette
fig, ax = plt.subplots(figsize=(12, 7))
# Materialise the dict views once so plotting receives plain lists.
silhouette_keys = list(silhouettes.keys())
silhouette_values = list(silhouettes.values())
ax.bar(silhouette_keys, silhouette_values, align='center')
ax.set_xticks(silhouette_keys)
ax.set_xticklabels(silhouette_keys)
fig.savefig(os.path.join(_save_directory, 'silhouettes_score.png'))
plt.close(fig)
|
||||||
|
|
||||||
|
|
||||||
|
## Mosaic of each cluster
# One PNG per final label, named cluster_<label padded to two characters>.png
clusters_id = np.unique(clustering.final_labels)
mosaic_directory = CONFIG.get('clustering')[_CLUSTERING_TYPE]['save_directory']
mosaic_tile_size = (_OUTPUT_IMAGE_HEIGHT, _OUTPUT_IMAGE_WIDTH)

for cluster_id in clusters_id:
    # Each result row holds the picture identifier first and its label second.
    cluster_image_filenames = []
    for one_res in clustering_res:
        if one_res[1] == cluster_id:
            cluster_image_filenames.append(os.path.join(CONFIG.get('directory')['collections'], "%s.jpg" % one_res[0]))

    images_array = []
    for img_filename in cluster_image_filenames:
        images_array.append(Tools.read_np_picture(img_filename, target_size = mosaic_tile_size))

    img = Tools.display_mosaic(images_array, nrow = _MOSAIC_NROW, ncol_max = _MOSAIC_NCOL_MAX)
    mosaic_name = "cluster_%s.png" % str(cluster_id).zfill(2)
    img.save(os.path.join(mosaic_directory, mosaic_name), "PNG")
|
|
@ -3,7 +3,11 @@
|
||||||
import PIL
|
import PIL
|
||||||
import os
|
import os
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
from io import BytesIO
|
||||||
|
import base64
|
||||||
|
from scipy.cluster.hierarchy import dendrogram
|
||||||
from keras_preprocessing.image.utils import load_img
|
from keras_preprocessing.image.utils import load_img
|
||||||
|
import matplotlib as plt
|
||||||
|
|
||||||
|
|
||||||
class Tools:
|
class Tools:
|
||||||
|
@ -26,6 +30,26 @@ class Tools:
|
||||||
def display_index_picture(array, index = 0):
|
def display_index_picture(array, index = 0):
|
||||||
return Tools.display_one_picture(array[index])
|
return Tools.display_one_picture(array[index])
|
||||||
|
|
||||||
|
@staticmethod
def display_mosaic(array, nrow = 5, ncol_max = 10):
    """Assemble pictures into a mosaic and render it.

    Pictures are stacked along axis 0 into columns of ``nrow`` pictures, and
    the columns are then joined along axis 1. At most ``ncol_max`` columns
    are built. A trailing incomplete column is left out, because columns of
    different heights cannot be joined. The only exception is an input with
    fewer than ``nrow`` pictures, which yields a single shorter column.

    :param array: sequence of picture arrays that can be concatenated
        (presumably all of the same shape -- confirm against callers)
    :param nrow: number of pictures per column, at least 1
    :param ncol_max: maximum number of columns, at least 1
    :return: the result of ``Tools.display_one_picture`` on the mosaic array
    :raises ValueError: if ``array`` is empty, or ``nrow`` / ``ncol_max`` is
        smaller than 1
    """
    if nrow < 1 or ncol_max < 1:
        raise ValueError("nrow and ncol_max must both be at least 1")
    if len(array) == 0:
        raise ValueError("display_mosaic needs at least one picture")

    # Number of complete columns available, capped by ncol_max.
    n_full_columns = min(len(array) // nrow, ncol_max)

    if n_full_columns == 0:
        # Fewer pictures than one full column: show them all in one column.
        image_col = [np.concatenate(tuple(array[:nrow]))]
    else:
        image_col = [
            np.concatenate(tuple(array[col * nrow:(col + 1) * nrow]))
            for col in range(n_full_columns)
        ]

    image = np.concatenate(tuple(image_col), axis = 1)
    return Tools.display_one_picture(image)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def create_dir_if_not_exists(path):
|
def create_dir_if_not_exists(path):
|
||||||
if not os.path.exists(path):
|
if not os.path.exists(path):
|
||||||
|
@ -35,10 +59,16 @@ class Tools:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def encoded_pictures_from_generator(generator, model):
|
def encoded_pictures_from_generator(generator, model):
|
||||||
|
|
||||||
predictions_tuple = tuple([model.get_encoded_prediction(imgs) for imgs in generator])
|
predictions_list = []
|
||||||
predictions = np.concatenate(predictions_tuple, axis = 0)
|
predictions_id = []
|
||||||
|
for imgs in generator:
|
||||||
|
predictions_id.append(imgs[0])
|
||||||
|
predictions_list.append(model.get_encoded_prediction(imgs[1]))
|
||||||
|
|
||||||
|
predictions = np.concatenate(tuple(predictions_list), axis = 0)
|
||||||
|
predictions_id = [os.path.splitext(os.path.basename(id))[0] for sub_id in predictions_id for id in sub_id]
|
||||||
|
|
||||||
return predictions
|
return predictions_id, predictions
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def read_np_picture(path, target_size = None, scale = 1):
|
def read_np_picture(path, target_size = None, scale = 1):
|
||||||
|
@ -51,7 +81,9 @@ class Tools:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def list_directory_filenames(path):
|
def list_directory_filenames(path):
|
||||||
filenames = os.listdir(path)
|
filenames = os.listdir(path)
|
||||||
filenames = [path + f for f in filenames]
|
np.random.seed(33213)
|
||||||
|
np.random.shuffle(filenames)
|
||||||
|
filenames = [os.path.join(path,f) for f in filenames]
|
||||||
|
|
||||||
return filenames
|
return filenames
|
||||||
|
|
||||||
|
@ -65,15 +97,45 @@ class Tools:
|
||||||
nb_batch = div[0] + 1 * (div[1] != 0)
|
nb_batch = div[0] + 1 * (div[1] != 0)
|
||||||
|
|
||||||
for i in range(nb_batch):
|
for i in range(nb_batch):
|
||||||
# for i in [75, 76]:
|
|
||||||
i_debut = i*batch
|
i_debut = i*batch
|
||||||
i_fin = min(i_debut + batch, max_n)
|
i_fin = min(i_debut + batch, max_n)
|
||||||
print("i_debut:" + str(i_debut))
|
yield (filenames[i_debut:i_fin], np.array([Tools.read_np_picture(f, target_size, scale) for f in filenames[i_debut:i_fin]]))
|
||||||
print("i_fin:" + str(i_fin))
|
|
||||||
yield np.array([Tools.read_np_picture(f, target_size, scale) for f in filenames[i_debut:i_fin]])
|
@staticmethod
def bytes_image(array):
    """Return the PNG encoding of a picture array as bytes.

    The array is first rendered with ``Tools.display_one_picture``; the
    resulting image is saved in PNG format into an in-memory buffer whose
    content is returned.
    """
    with BytesIO() as png_buffer:
        Tools.display_one_picture(array).save(png_buffer, format='png')
        return png_buffer.getvalue()
|
||||||
|
|
||||||
|
@staticmethod
def base64_image(array):
    """Return a picture array as a base64 PNG data URI string."""
    png_bytes = Tools.bytes_image(array)
    encoded = base64.b64encode(png_bytes).decode()
    return 'data:image/png;base64,' + encoded
|
||||||
|
|
||||||
|
@staticmethod
def get_color_from_label(label, n_labels = 50, palette = 'viridis'):
    """Map a cluster label to a hexadecimal colour string.

    The colormap named ``palette`` is resampled to ``n_labels`` entries and
    indexed with ``int(label)``.

    :param label: cluster label, anything accepted by ``int()``
    :param n_labels: number of entries of the resampled colormap
    :param palette: name of a registered matplotlib colormap
    :return: colour as a hexadecimal string
    """
    # Explicit local imports: do not rely on the top-level package alias
    # happening to expose its `cm` and `colors` submodules.
    import matplotlib
    from matplotlib import cm, colors

    try:
        cmap = matplotlib.colormaps[palette].resampled(n_labels)
    except AttributeError:
        # Older matplotlib without the colormap registry / `resampled`.
        cmap = cm.get_cmap(palette, n_labels)

    return colors.to_hex(cmap(int(label)))
|
||||||
|
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def plot_dendrogram(model, **kwargs):
|
||||||
|
|
||||||
|
# Children of hierarchical clustering
|
||||||
|
children = model.children_
|
||||||
|
|
||||||
|
# Distances between each pair of children
|
||||||
|
# Since we don't have this information, we can use a uniform one for plotting
|
||||||
|
distance = np.arange(children.shape[0])
|
||||||
|
|
||||||
|
# The number of observations contained in each cluster level
|
||||||
|
no_of_observations = np.arange(2, children.shape[0]+2)
|
||||||
|
|
||||||
|
# Create linkage matrix and then plot the dendrogram
|
||||||
|
linkage_matrix = np.column_stack([children, distance, no_of_observations]).astype(float)
|
||||||
|
|
||||||
|
# Plot the corresponding dendrogram
|
||||||
|
dendrogram(linkage_matrix, **kwargs)
|
Loading…
Reference in a new issue