1
0
Fork 0
mirror of https://github.com/prise6/smart-iss-posts synced 2024-04-26 11:10:28 +02:00

test du clustering simple

This commit is contained in:
Francois Vieille 2019-04-16 21:54:11 +02:00
parent 6a45ee0b04
commit 4c5a826ec7
5 changed files with 982 additions and 12 deletions

View file

@ -2,7 +2,7 @@
class AbstractClustering:
def __init__(config, pictures_id, pictures_np):
def __init__(self, config, pictures_id, pictures_np):
self.config = config
self.pictures_id = pictures_id

View file

@ -1,53 +1,63 @@
# -*- coding: utf-8 -*-
import os
import numpy as np
from iss.clustering import AbstractClustering
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from iss.tools import Tools
from sklearn.externals import joblib
class ClassicalClustering(AbstractClustering):
    """Cluster pictures in three chained steps: PCA, KMeans, then CAH.

    The pictures are projected with a PCA, the projections are grouped with
    KMeans, and the mean point of each KMeans cluster is grouped again with an
    agglomerative clustering (CAH). The final label of a picture is the CAH
    label of the KMeans cluster it was assigned to.

    Args:
        config: mapping read for the keys 'PCA', 'kmeans' and 'CAH' (keyword
            arguments unpacked into the matching scikit-learn estimators;
            'kmeans' must contain 'n_clusters'), 'version' (inserted in the
            saved file names) and 'save_directory'.
        pictures_id: picture identifiers, forwarded to AbstractClustering.
        pictures_np: data the PCA is fitted on, forwarded to
            AbstractClustering (presumably one row per picture; confirm
            against the caller).
    """

    def __init__(self, config, pictures_id=None, pictures_np=None):
        # The parent constructor is called exactly once, before self.config
        # is read below.
        super().__init__(config, pictures_id, pictures_np)

        self.pca_fit = None
        self.pca_args = self.config['PCA']
        self.pca_reduction = None
        self.pca_save_name = "PCA_model_v%s.pkl" % (self.config['version'])

        self.kmeans_fit = None
        self.kmeans_args = self.config['kmeans']
        self.kmeans_labels = None
        self.kmeans_centers = []
        self.kmeans_save_name = "kmeans_model_v%s.pkl" % (self.config['version'])

        self.cah_fit = None
        self.cah_args = self.config['CAH']
        self.cah_labels = None
        self.cah_save_name = "cah_model_v%s.pkl" % (self.config['version'])

        self.final_labels = None

    def _model_path(self, file_name):
        """Return the path of `file_name` inside the configured save directory."""
        return os.path.join(self.config['save_directory'], file_name)

    def compute_pca(self):
        """Fit the PCA on `self.pictures_np` and store the projected data.

        Returns:
            self, so that the compute_* calls can be chained.
        """
        self.pca_fit = PCA(**self.pca_args)
        self.pca_fit.fit(self.pictures_np)
        self.pca_reduction = self.pca_fit.transform(self.pictures_np)
        return self

    def compute_kmeans(self):
        """Fit KMeans on the PCA projection and store one label per picture.

        Requires compute_pca() to have been run first.

        Returns:
            self, so that the compute_* calls can be chained.
        """
        self.kmeans_fit = KMeans(**self.kmeans_args)
        self.kmeans_fit.fit(self.pca_reduction)
        self.kmeans_labels = self.kmeans_fit.labels_
        return self

    def compute_kmeans_centers(self):
        """Compute the mean PCA point of every KMeans cluster.

        The list is rebuilt from scratch on every call: position `i` must hold
        the center of cluster `i`, because compute_cah_labels() indexes the
        CAH labels by KMeans label. Appending to the previous list would
        duplicate centers when this method is called more than once.

        Returns:
            self, so that the compute_* calls can be chained.
        """
        labels = np.asarray(self.kmeans_labels)
        self.kmeans_centers = [
            np.mean(self.pca_reduction[labels == cl], axis=0)
            for cl in range(self.kmeans_args['n_clusters'])
        ]
        return self

    def compute_cah(self):
        """Fit the agglomerative clustering on the KMeans cluster centers.

        Requires compute_kmeans_centers() to have been run first.

        Returns:
            self, so that the compute_* calls can be chained.
        """
        self.cah_fit = AgglomerativeClustering(**self.cah_args)
        self.cah_fit.fit(self.kmeans_centers)
        self.cah_labels = self.cah_fit.labels_
        return self

    def compute_cah_labels(self):
        """Give each picture the CAH label of its KMeans cluster.

        Returns:
            self, like the other compute_* methods.
        """
        self.final_labels = [self.cah_labels[old_cl] for old_cl in self.kmeans_labels]
        return self

    def get_zip_results(self):
        """Return an iterator of (picture id, final label, KMeans label, picture data)."""
        return zip(self.pictures_id, self.final_labels, self.kmeans_labels, self.pictures_np)

    def save(self):
        """Dump the PCA, KMeans and CAH estimators into the save directory."""
        # NOTE(review): joblib is imported from `sklearn.externals` in this
        # file; recent scikit-learn releases no longer ship it - confirm the
        # pinned version or switch to the standalone `joblib` package.
        Tools.create_dir_if_not_exists(self.config['save_directory'])
        joblib.dump(self.pca_fit, self._model_path(self.pca_save_name))
        joblib.dump(self.kmeans_fit, self._model_path(self.kmeans_save_name))
        joblib.dump(self.cah_fit, self._model_path(self.cah_save_name))

    def load(self):
        """Reload the three estimators written by save().

        Only the estimator attributes are replaced; labels, centers and the
        PCA projection are left untouched.
        """
        self.pca_fit = joblib.load(self._model_path(self.pca_save_name))
        self.kmeans_fit = joblib.load(self._model_path(self.kmeans_save_name))
        self.cah_fit = joblib.load(self._model_path(self.cah_save_name))

View file

@ -1 +1,2 @@
from .AbstractClustering import AbstractClustering
from .AbstractClustering import AbstractClustering
from .ClassicalClustering import ClassicalClustering

View file

@ -0,0 +1,155 @@
#%% [markdown]
# # Classical clustering
#%% [markdown]
# ## Standard imports
import os
#%%
# IPython magics: this file is a notebook-style script made of `#%%` cells and
# cannot be run by a plain Python interpreter.
%load_ext autoreload
%autoreload 2
# The relative `data/...` paths used below depend on this working directory
# (presumably the project root inside the Jupyter container - confirm).
os.chdir('/home/jovyan/work')
#%% [markdown]
# ## Import iss
#%%
from iss.tools import Config
from iss.tools import Tools
from iss.models import SimpleConvAutoEncoder
from iss.clustering import ClassicalClustering
from dotenv import find_dotenv, load_dotenv
import numpy as np
# NOTE(review): `plt` is used by the plotting cells below but is never imported
# in this script - presumably `import matplotlib.pyplot as plt` is missing; confirm.
#%% [markdown]
# ## Loading the config
#%%
load_dotenv(find_dotenv())
cfg = Config(project_dir = os.getenv("PROJECT_DIR"), mode = os.getenv("MODE"))
#%% [markdown]
# ## Loading the model
#%%
## load the model
model_type = 'simple_conv'
cfg.get('models')[model_type]['model_name'] = 'model_colab'
model = SimpleConvAutoEncoder(cfg.get('models')[model_type])
#%% [markdown]
# ## Loading the images
#%%
filenames = Tools.list_directory_filenames('data/processed/models/autoencoder/train/k/')
generator_imgs = Tools.generator_np_picture_from_filenames(filenames, target_size = (27, 48), batch = 496, nb_batch = 10)
#%%
pictures_id, pictures_preds = Tools.encoded_pictures_from_generator(generator_imgs, model)
#%%
# Flatten every encoded picture into one vector of 3*6*16 = 288 values.
intermediate_output = pictures_preds.reshape((pictures_preds.shape[0], 3*6*16))
#%% [markdown]
# ## PCA
# Dimensionality reduction
#%%
clustering = ClassicalClustering(cfg.get('clustering')['classical'], pictures_id, intermediate_output)
#%%
clustering.compute_pca()
#%% [markdown]
# ## Kmeans
# First-level clusters
#%%
clustering.compute_kmeans()
clustering.compute_kmeans_centers()
#%% [markdown]
# ## CAH (agglomerative clustering)
# Second-level clusters
#%%
clustering.compute_cah()
clustering.compute_cah_labels()
#%% [markdown]
# ## Results
#%% [markdown]
# ### Intermediate clusters
#%%
# First two PCA components, coloured by KMeans label.
fig = plt.figure(1, figsize=(12, 7))
plt.scatter(clustering.pca_reduction[:, 0], clustering.pca_reduction[:, 1], c = clustering.kmeans_labels)
#%% [markdown]
# ### Final clusters
#%%
# Same projection, coloured by the final (CAH) label.
plt.scatter(clustering.pca_reduction[:, 0], clustering.pca_reduction[:, 1], c = clustering.final_labels)
#%% [markdown]
# ### Saving the models
#%%
clustering.save()
#%%
# Optionally start from a fresh instance before reloading the saved models:
# clustering = ClassicalClustering(cfg.get('clustering')['classical'])
clustering.load()
#%% [markdown]
# ## Visualising the clusters
#%%
def select_cluster(clustering, id_cluster, directory='data/processed/models/autoencoder/train/k/'):
    """Return the JPEG paths of the pictures belonging to one cluster.

    Args:
        clustering: object whose get_zip_results() yields indexable results
            with the picture id at position 0 and the cluster label to filter
            on at position 2.
        id_cluster: cluster label to keep.
        directory: folder the pictures are read from; defaults to the training
            folder this script works on.

    Returns:
        A list of `<directory>/<picture id>.jpg` paths, in result order.
    """
    paths = []
    for res in clustering.get_zip_results():
        picture_id, label = res[0], res[2]
        if label == id_cluster:
            paths.append(os.path.join(directory, picture_id + '.jpg'))
    return paths
#%%
from IPython.display import Image
#%%
# Print the size of each of the first 19 clusters (labels 0 to 18).
for cl in range(0, 19):
    print("Cluster %s" % (cl))
    res_tmp = select_cluster(clustering, cl)
    print(len(res_tmp))
    # At most 100 pictures per cluster are read; they are only needed by the
    # mosaic display below, which is currently disabled.
    image_array = [Tools.read_np_picture(f, target_size = (54, 96)) for f in res_tmp[:100]]
    # img = Tools.display_mosaic(image_array, nrow = 10)
    # fig = plt.figure(1, figsize=(12, 7))
    # plt.imshow(img, aspect = 'auto')
    # plt.show()
#%% [markdown]
# ## Zoom on a single cluster
#%%
# Cluster inspected by every cell below; change it here only.
zoom_cluster = 1
res_tmp = select_cluster(clustering, zoom_cluster)
#%%
print(len(res_tmp))
image_array = [Tools.read_np_picture(f, target_size = (54, 96)) for f in res_tmp]
#%%
Tools.display_mosaic(image_array, nrow = 18)
#%%
# 1 for the pictures of the inspected cluster, 0 for all the others.
col = [1 if label == zoom_cluster else 0 for label in clustering.kmeans_labels]
plt.scatter(clustering.pca_reduction[:, 0], clustering.pca_reduction[:, 1], c = col)
#%%
# Same projection restricted to the inspected cluster; the mask is built once.
in_cluster = np.array(col) == 1
plt.scatter(clustering.pca_reduction[in_cluster, 0], clustering.pca_reduction[in_cluster, 1])
#%%

File diff suppressed because one or more lines are too long