tests clustering + mosaic: failure + test facets

2024-04-25 10:40:26 +02:00 · 2019-12-12 01:08:04 +01:00 · 2019-12-12 01:08:04 +01:00 · 4a95eb9d92
parent ac1f75d28e
commit 4a95eb9d92
10 changed files with 172 additions and 35 deletions
--- a/9
+++ b/9
@ -69,6 +69,15 @@ exec_clustering:
 	$(PYTHON_INTERPRETER) -m iss.exec.clustering


+#################################################################################
+# OUTSIDE CONTAINER                                                             #
+#################################################################################
+
+# Copy one raw picture into the ISR sample input directory, then run the `isr`
+# docker image with the project's ISR data, weights and config file mounted.
+maximize_test:
+	cp $(PROJECT_DIR)/data/raw/collections/20180211-130001.jpg $(PROJECT_DIR)/data/isr/input/sample/
+	docker run -v "$(PROJECT_DIR)/data/isr:/home/isr/data" -v "$(PROJECT_DIR)/../image-super-resolution/weights:/home/isr/weights" -v "$(PROJECT_DIR)/config/config_isr.yml:/home/isr/config.yml" -it isr -d -p -c config.yml
+
+
 #################################################################################
 # FLOYDHUB                                                                      #
 #################################################################################
--- a/iss/clustering/AbstractClustering.py
+++ b/iss/clustering/AbstractClustering.py
@ -31,6 +31,9 @@ class AbstractClustering:

    def predict_embedding(self):
        raise NotImplementedError
+
+    def predict_label(self):
+        """Abstract hook: the base version always raises NotImplementedError; subclasses override it."""
+        raise NotImplementedError
           
    def save(self):
        raise NotImplementedError
--- a/iss/clustering/DBScanClustering.py
+++ b/iss/clustering/DBScanClustering.py
@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+
+import os
+import numpy as np
+import umap
+import hdbscan
+from iss.tools import Tools
+from iss.clustering import AbstractClustering
+from sklearn.cluster import KMeans
+from sklearn.metrics import silhouette_samples
+from sklearn.externals import joblib
+
+class DBScanClustering(AbstractClustering):
+    """
+    UMAP embedding of the pictures followed by HDBSCAN clustering of that embedding.
+
+    Cf: https://umap-learn.readthedocs.io/en/latest/clustering.html
+    """
+
+    def __init__(self, config, pictures_id = None, pictures_np = None):
+        """
+        Read the UMAP and HDBSCAN keyword arguments from the clustering config.
+
+        :param config: clustering configuration, forwarded to AbstractClustering;
+            the 'umap' and 'dbscan' entries are read from `self.config` afterwards
+        :param pictures_id: forwarded to AbstractClustering
+        :param pictures_np: forwarded to AbstractClustering
+        """
+
+        super().__init__(config, pictures_id, pictures_np)
+
+        self.umap_args = self.config['umap']
+        self.umap_fit = None
+        self.umap_embedding = None
+        self.umap_save_name = 'UMAP_model.pkl'
+
+        self.dbscan_fit = None
+        self.dbscan_args = self.config['dbscan']
+        self.dbscan_labels = None
+        self.dbscan_centers = []
+        self.dbscan_save_name = "dbscan_model.pkl"
+
+
+    def compute_umap(self):
+        """Fit UMAP on `self.pictures_np` and keep the resulting embedding. Returns self."""
+        self.umap_fit = umap.UMAP(**self.umap_args)
+        self.umap_embedding = self.umap_fit.fit_transform(self.pictures_np)
+        return self
+
+    def compute_dbscan(self):
+        """Fit HDBSCAN on the UMAP embedding and keep its labels. Returns self."""
+        # `predict_label` relies on hdbscan.approximate_predict, which needs the
+        # prediction data generated at fit time; an explicit 'prediction_data'
+        # entry in the config still takes precedence over this default.
+        dbscan_args = dict(self.dbscan_args)
+        dbscan_args.setdefault('prediction_data', True)
+        self.dbscan_fit = hdbscan.HDBSCAN(**dbscan_args)
+        self.dbscan_fit.fit(self.umap_embedding)
+        self.dbscan_labels = self.dbscan_fit.labels_
+        return self
+
+    def compute_final_labels(self):
+        """Use the HDBSCAN labels as final labels. Returns self."""
+        self.final_labels  = self.dbscan_labels
+        return self
+
+    def compute_silhouette_score(self):
+        """
+        Compute the per-sample silhouette on `self.pictures_np` with the final labels.
+
+        :return: dict mapping each distinct final label to its mean silhouette value
+        """
+        self.silhouette_score = silhouette_samples(self.pictures_np, self.final_labels)
+        self.silhouette_score_labels = {cluster: np.mean(self.silhouette_score[self.final_labels == cluster]) for
+        cluster in np.unique(self.final_labels)}
+        return self.silhouette_score_labels
+
+    def predict_embedding(self, pictures_np):
+        """Project new pictures with the fitted UMAP model."""
+        return self.umap_fit.transform(pictures_np)
+
+    def predict_label(self, pictures_embedding):
+        """
+        Predict HDBSCAN labels for already-embedded pictures.
+
+        The fitted / loaded model must carry prediction data (see `compute_dbscan`).
+
+        :param pictures_embedding: output of `predict_embedding`
+        :return: array of labels, one per row of `pictures_embedding`
+        """
+        # approximate_predict returns (labels, membership strengths); only labels are needed
+        labels, _strengths = hdbscan.approximate_predict(self.dbscan_fit, pictures_embedding)
+        return labels
+
+    def save(self):
+        """Dump the fitted UMAP and HDBSCAN models into `self.save_directory`."""
+        Tools.create_dir_if_not_exists(self.save_directory)
+
+        joblib.dump(self.umap_fit, os.path.join(self.save_directory, self.umap_save_name))
+        joblib.dump(self.dbscan_fit, os.path.join(self.save_directory, self.dbscan_save_name))
+
+    def load(self):
+        """Reload the UMAP and HDBSCAN models previously written by `save`."""
+        self.umap_fit = joblib.load(os.path.join(self.save_directory, self.umap_save_name))
+        self.dbscan_fit = joblib.load(os.path.join(self.save_directory, self.dbscan_save_name))
--- a/iss/clustering/N2DClustering.py
+++ b/iss/clustering/N2DClustering.py
@ -53,6 +53,9 @@ class N2DClustering(AbstractClustering):

    def predict_embedding(self, pictures_np):
        return self.umap_fit.transform(pictures_np)
+    
+    def predict_label(self, pictures_embedding):
+        """Return `self.kmeans_fit.predict(...)` for pictures already projected by `predict_embedding`."""
+        return self.kmeans_fit.predict(pictures_embedding)

    def save(self):
        Tools.create_dir_if_not_exists(self.save_directory)
--- a/iss/clustering/__init__.py
+++ b/iss/clustering/__init__.py
@ -1,4 +1,5 @@
 from .AbstractClustering import AbstractClustering
 from .ClassicalClustering import ClassicalClustering
 from .AdvancedClustering import AdvancedClustering
-from .N2DClustering import N2DClustering
+from .N2DClustering import N2DClustering
+from .DBScanClustering import DBScanClustering
--- a/iss/data/DataBaseManager.py
+++ b/iss/data/DataBaseManager.py
@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+import numpy as np


 class MysqlDataBaseManager:
@ -45,6 +46,7 @@ CREATE TABLE IF NOT EXISTS `iss`.`pictures_embedding` (
  `pictures_id` VARCHAR( 15 ) ,
  `pictures_x` FLOAT(8, 4),
  `pictures_y` FLOAT(8, 4),
+  `label` INT NULL,
  `clustering_type` VARCHAR(15),
  `clustering_version` VARCHAR(5),
  `clustering_model_type` VARCHAR(15),
@ -68,9 +70,17 @@ CREATE TABLE IF NOT EXISTS `iss`.`pictures_embedding` (

 	def insert_row_pictures_embedding(self, array):
+		"""
+		Insert several rows into `iss`.`pictures_embedding` with one executemany, then commit.
+
+		:param array: sequence of rows; each row holds, in order: pictures_id, pictures_x,
+			pictures_y, label, clustering_type, clustering_version, clustering_model_type,
+			clustering_model_name
+		:return: the cursor's rowcount after the insert
+		"""

-		sql_insert_template = "INSERT INTO `iss`.`pictures_embedding` (pictures_id, pictures_x, pictures_y, clustering_type, clustering_version, clustering_model_type, clustering_model_name) VALUES (%s, %s, %s, %s, %s, %s, %s);"
+		sql_insert_template = "INSERT INTO `iss`.`pictures_embedding` (pictures_id, pictures_x, pictures_y, label, clustering_type, clustering_version, clustering_model_type, clustering_model_name) VALUES (%s, %s, %s, %s, %s, %s, %s, %s);"

 		self.cursor.executemany(sql_insert_template, array)
 		self.conn.commit()

 		return self.cursor.rowcount
+
+	def select_close_embedding(self, x, y, limit):
+		"""
+		Return the stored embeddings closest to the point (x, y).
+
+		The distance is the Euclidean distance between (pictures_x, pictures_y) and
+		(x, y), both coordinates being rounded to 4 decimals before the query.
+
+		:param x: first coordinate of the reference point
+		:param y: second coordinate of the reference point
+		:param limit: maximum number of rows returned
+		:return: result of cursor.fetchall(): rows (pictures_id, distance), nearest first
+		"""
+		# NOTE(review): the query does not filter on the clustering_* columns, so rows
+		# of every stored clustering are compared together - confirm this is intended.
+		sql_req = "SELECT pictures_id, SQRT(POWER(pictures_x - %s, 2) + POWER(pictures_y - %s, 2)) as distance FROM iss.pictures_embedding ORDER BY distance ASC LIMIT %s"
+
+		self.cursor.execute(sql_req, (float(np.round(x, 4)), float(np.round(y, 4)), limit))
+
+		return self.cursor.fetchall()
+
--- a/iss/exec/bdd.py
+++ b/iss/exec/bdd.py
@ -6,24 +6,10 @@ import datetime as dt
 import numpy as np

 from iss.init_config import CONFIG
-from iss.data.DataBaseManager import MysqlDataBaseManager
-from iss.clustering import ClassicalClustering, AdvancedClustering, N2DClustering
+from iss.clustering import ClassicalClustering, AdvancedClustering, N2DClustering, DBScanClustering
 from iss.tools import Tools


-def create_db_manager(config):
-
-    CON_MYSQL = mysql.connector.connect(
-        host = config.get('mysql')['database']['server'],
-        user = config.get('mysql')['database']['user'],
-        passwd = config.get('mysql')['database']['password'],
-        database = config.get('mysql')['database']['name'],
-        port = config.get('mysql')['database']['port']
-    )
-
-    return MysqlDataBaseManager(CON_MYSQL, config)
-
-
 def populate_locations(config, db_manager):

    history = pd.read_csv(os.path.join(CONFIG.get("directory")['data_dir'], "raw", "history", "history.txt"), sep=";", names=['latitude', 'longitude', 'id', 'location'])
@ -40,11 +26,7 @@ def populate_locations(config, db_manager):

 def populate_embedding(config, db_manager, clustering_type, clustering_version, clustering_model_type, clustering_model_name, drop=False):

-    db_manager.create_pictures_embedding_table()
-    clustering_config = config.get('clustering')[clustering_type]
-    clustering_config['version'] = clustering_version
-    clustering_config['model']['type'] = clustering_model_type
-    clustering_config['model']['name'] = clustering_model_name
+    clustering, clustering_config = Tools.load_clustering(CONFIG, clustering_type, clustering_version, clustering_model_type, clustering_model_name)

    if drop:
        db_manager.drop_embedding_partition(clustering_type, clustering_version, clustering_model_type, clustering_model_name)
@ -53,6 +35,8 @@ def populate_embedding(config, db_manager, clustering_type, clustering_version,
        clustering = N2DClustering(clustering_config)
    elif clustering_type == 'classical':
        clustering = ClassicalClustering(clustering_config)
+    elif clustering_type == 'dbscan':
+        clustering = DBScanClustering(clustering_config)
    else:
        raise Exception

@ -64,12 +48,14 @@ def populate_embedding(config, db_manager, clustering_type, clustering_version,
    count = 0
    for ids, latents in generator:
        pictures_embedding = clustering.predict_embedding(latents)
+        pictures_label = clustering.predict_label(pictures_embedding)
        rows = []
        for i, id in enumerate(ids):
            rows.append((
                id,
                float(np.round(pictures_embedding[i][0], 4)),
                float(np.round(pictures_embedding[i][1], 4)),
+                int(pictures_label[i]),
                clustering_type,
                clustering_version,
                clustering_model_type,
@ -84,12 +70,24 @@ def populate_embedding(config, db_manager, clustering_type, clustering_version,

 def main(action = 'populate_embedding'):

-    db_manager = create_db_manager(CONFIG)
+    db_manager = Tools.create_db_manager(CONFIG)

    if action == 'population_locations':
        populate_locations(CONFIG, db_manager)
    elif action == 'populate_embedding':
-        populate_embedding(CONFIG, db_manager, 'n2d', 1, 'simple_conv', 'model_colab')
+        db_manager.create_pictures_embedding_table(False)
+        to_load = [
+            {'clustering_type': 'n2d', 'clustering_version': 1, 'clustering_model_type': 'simple_conv', 'clustering_model_name': 'model_colab', 'drop': False},
+            {'clustering_type': 'n2d', 'clustering_version': 2, 'clustering_model_type': 'simple_conv', 'clustering_model_name': 'model_colab', 'drop': False},
+            {'clustering_type': 'n2d', 'clustering_version': 3, 'clustering_model_type': 'simple_conv', 'clustering_model_name': 'model_colab', 'drop': False},
+            ]
+        for kwargs in to_load:
+            try:
+                populate_embedding(CONFIG, db_manager, **kwargs)
+            except Exception as err:
+                print(err)
+                pass
+            
    else:
        pass

--- a/iss/exec/clustering.py
+++ b/iss/exec/clustering.py
@ -11,7 +11,7 @@ from bokeh.models import HoverTool, ColumnDataSource, CategoricalColorMapper
 from iss.init_config import CONFIG
 from iss.tools import Tools
 from iss.models import SimpleConvAutoEncoder, SimpleAutoEncoder
-from iss.clustering import ClassicalClustering, AdvancedClustering, N2DClustering
+from iss.clustering import ClassicalClustering, AdvancedClustering, N2DClustering, DBScanClustering


 _DEBUG = True
@ -45,6 +45,14 @@ def run_clustering(config, clustering_type, pictures_id, intermediate_output):
        clustering.compute_kmeans()
        clustering.compute_final_labels()
        clustering.compute_colors()
+    elif clustering_type == 'dbscan':
+        if _DEBUG:
+            print("HDBSCAN Clustering")
+        clustering = DBScanClustering(config.get('clustering')['dbscan'], pictures_id, intermediate_output)
+        clustering.compute_umap()
+        clustering.compute_dbscan()
+        clustering.compute_final_labels()
+        clustering.compute_colors()

    return clustering

@ -71,7 +79,7 @@ def run_plots(config, clustering_type, clustering):
        ax.add_artist(legend1)
        plt.savefig(os.path.join(clustering.save_directory, 'tsne_clusters.png'))

-    if clustering_type in ['n2d']:
+    if clustering_type in ['n2d', 'dbscan']:
        ## Graphs of TSNE and final clusters
        fig, ax = plt.subplots(figsize=(24, 14))
        classes = clustering.final_labels
@ -80,12 +88,12 @@ def run_plots(config, clustering_type, clustering):
        ax.add_artist(legend1)
        plt.savefig(os.path.join(clustering.save_directory, 'umap_clusters.png'))

-    if clustering_type in ['n2d', 'classical']:
+    if clustering_type in ['n2d', 'classical', 'dbscan']:
        filenames = [os.path.join(config.get('directory')['collections'], "%s.jpg" % one_res[0]) for one_res in clustering.get_results()]
        images_array = [Tools.read_np_picture(img_filename, target_size = (54, 96)) for img_filename in filenames]
        base64_images = [Tools.base64_image(img) for img in images_array]

-        if clustering_type == 'n2d':
+        if clustering_type in ['n2d', 'dbscan']:
            x = clustering.umap_embedding[:, 0]
            y = clustering.umap_embedding[:, 1]
            html_file = 'umap_bokeh.html'
@ -181,7 +189,7 @@ def plot_mosaics(config, clustering_type, clustering, output_image_width, output
 def main():
    _CLUSTERING_TYPE = 'n2d'
    _BATCH_SIZE = 496
-    _N_BATCH = 10
+    _N_BATCH = 5
    _PLOTS = True
    _MOSAICS = True
    _SILHOUETTE = True
--- a/iss/tools/tools.py
+++ b/iss/tools/tools.py
@ -4,12 +4,15 @@ import PIL
 import os
 import re
 import numpy as np
+import mysql.connector
 from io import BytesIO
 import base64
 from scipy.cluster.hierarchy import dendrogram
 from keras_preprocessing.image.utils import load_img
 import matplotlib as plt

+from iss.data.DataBaseManager import MysqlDataBaseManager
+

 class Tools:

@ -172,16 +175,35 @@ class Tools:
 		model_config = config.get('models')[model_type]

 		return model, model_config
+	
+	@staticmethod
+	def load_clustering(config, clustering_type, clustering_version, clustering_model_type, clustering_model_name):
+		"""
+		Build the clustering object matching `clustering_type` and load its saved models.
+
+		The clustering entry read from `config` is updated with the requested version
+		and model type / name before the clustering object is created.
+
+		:param config: project configuration exposing `get('clustering')`
+		:param clustering_type: 'n2d', 'classical' or 'dbscan'
+		:param clustering_version: stored under clustering_config['version']
+		:param clustering_model_type: stored under clustering_config['model']['type']
+		:param clustering_model_name: stored under clustering_config['model']['name']
+		:return: tuple (clustering, clustering_config)
+		:raises ValueError: if `clustering_type` is not one of the supported types
+		"""
+		# local import: the iss.clustering modules import iss.tools themselves
+		from iss.clustering import ClassicalClustering, N2DClustering, DBScanClustering
+
+		clustering_classes = {
+			'n2d': N2DClustering,
+			'classical': ClassicalClustering,
+			'dbscan': DBScanClustering,
+		}
+		# validate first so an unsupported type does not touch the configuration
+		if clustering_type not in clustering_classes:
+			raise ValueError("Unsupported clustering type: %r (expected one of %s)" % (clustering_type, sorted(clustering_classes)))
+
+		clustering_config = config.get('clustering')[clustering_type]
+		clustering_config['version'] = clustering_version
+		clustering_config['model']['type'] = clustering_model_type
+		clustering_config['model']['name'] = clustering_model_name
+
+		clustering = clustering_classes[clustering_type](clustering_config)
+
+		clustering.load()
+		return clustering, clustering_config

 	@staticmethod
-	def load_latent_representation(config, model, model_config, filenames, batch_size, n_batch, by_step):
+	def load_latent_representation(config, model, model_config, filenames, batch_size, n_batch, by_step, scale=1./255):
 		"""
 		load images and predictions
+
+		:param model_config: provides 'input_height' and 'input_width' used as target size
+		:param by_step: when true, return the generator built by `load_latent_representation_by_step`
+		:param scale: forwarded to `Tools.generator_np_picture_from_filenames` on both paths
+		:return: (pictures_id, intermediate_output), predictions being flattened per picture
 		"""
 		if by_step:
-			return Tools.load_latent_representation_by_step(config, model, model_config, filenames, batch_size, n_batch)
+			# forward `scale` so the by-step path does not silently fall back to its default
+			return Tools.load_latent_representation_by_step(config, model, model_config, filenames, batch_size, n_batch, scale=scale)
 		
-		generator_imgs = Tools.generator_np_picture_from_filenames(filenames, target_size = (model_config['input_height'], model_config['input_width']), batch = batch_size, nb_batch = n_batch)
+		generator_imgs = Tools.generator_np_picture_from_filenames(filenames, target_size = (model_config['input_height'], model_config['input_width']), scale=scale, batch = batch_size, nb_batch = n_batch)

 		pictures_id, pictures_preds = Tools.encoded_pictures_from_generator(generator_imgs, model, by_step)
 		intermediate_output = pictures_preds.reshape((pictures_preds.shape[0], -1))
@ -189,9 +211,22 @@ class Tools:
 		return pictures_id, intermediate_output

 	@staticmethod
-	def load_latent_representation_by_step(config, model, model_config, filenames, batch_size, n_batch):
-		generator_imgs = Tools.generator_np_picture_from_filenames(filenames, target_size = (model_config['input_height'], model_config['input_width']), batch = batch_size, nb_batch = n_batch)
+	def load_latent_representation_by_step(config, model, model_config, filenames, batch_size, n_batch, scale=1./255):
+		"""
+		Generator: for each (pictures_id, pictures_preds) pair produced by
+		`Tools.encoded_pictures_from_generator`, yield the ids together with the
+		predictions reshaped to (pictures_preds.shape[0], -1).
+
+		:param scale: forwarded to `Tools.generator_np_picture_from_filenames`
+		"""
+		generator_imgs = Tools.generator_np_picture_from_filenames(filenames, target_size = (model_config['input_height'], model_config['input_width']), scale=scale, batch = batch_size, nb_batch = n_batch)

 		for pictures_id, pictures_preds in Tools.encoded_pictures_from_generator(generator_imgs, model, True):
 			intermediate_output = pictures_preds.reshape((pictures_preds.shape[0], -1))
 			yield pictures_id, intermediate_output
+
+	@staticmethod
+	def create_db_manager(config):
+		"""
+		Open a MySQL connection and wrap it in a MysqlDataBaseManager.
+
+		:param config: configuration whose `get('mysql')['database']` entry provides
+			the 'server', 'user', 'password', 'name' and 'port' values
+		:return: MysqlDataBaseManager built from the connection and `config`
+		"""
+
+		CON_MYSQL = mysql.connector.connect(
+			host = config.get('mysql')['database']['server'],
+			user = config.get('mysql')['database']['user'],
+			passwd = config.get('mysql')['database']['password'],
+			database = config.get('mysql')['database']['name'],
+			port = config.get('mysql')['database']['port']
+		)
+
+		return MysqlDataBaseManager(CON_MYSQL, config)
--- a/requirements.txt
+++ b/requirements.txt
@ -1,6 +1,7 @@
 setuptools==40.8.0
 Click==7.0
-numpy==1.13.3
+# numpy==1.13.3
+numpy==1.17.4
 pandas==0.23.4
 tensorflow==1.12.0
 Keras==2.2.4
@ -10,4 +11,7 @@ python-dotenv==0.10.1
 PyYAML==3.13
 matplotlib>=3.1.0
 umap-learn==0.3.10
-bokeh==0.13.0
+bokeh==0.13.0
+mysql-connector-python==8.0.18
+hdbscan==0.8.24
+facets_overview==1.0.0