From ac1f75d28e4d4a6bf744346dd412e43d38af45ee Mon Sep 17 00:00:00 2001 From: Francois Vieille Date: Wed, 11 Dec 2019 03:04:58 +0100 Subject: [PATCH] predict picture embedding and save it --- iss/clustering/AbstractClustering.py | 5 +- iss/clustering/N2DClustering.py | 5 +- iss/data/DataBaseManager.py | 55 ++++++++++++-- iss/exec/bdd.py | 106 ++++++++++++++++++++++----- iss/exec/clustering.py | 47 ++---------- iss/tools/tools.py | 70 ++++++++++++++++-- 6 files changed, 211 insertions(+), 77 deletions(-) diff --git a/iss/clustering/AbstractClustering.py b/iss/clustering/AbstractClustering.py index c1b9706..a70c518 100644 --- a/iss/clustering/AbstractClustering.py +++ b/iss/clustering/AbstractClustering.py @@ -29,8 +29,11 @@ class AbstractClustering: self.colors = [Tools.get_color_from_label(label, n_classes) for label in self.final_labels] return self + def predict_embedding(self): + raise NotImplementedError + def save(self): raise NotImplementedError def load(self): - raise NotImplementedError \ No newline at end of file + raise NotImplementedError diff --git a/iss/clustering/N2DClustering.py b/iss/clustering/N2DClustering.py index 9e28f73..c9d5889 100644 --- a/iss/clustering/N2DClustering.py +++ b/iss/clustering/N2DClustering.py @@ -51,6 +51,9 @@ class N2DClustering(AbstractClustering): cluster in np.unique(self.final_labels)} return self.silhouette_score_labels + def predict_embedding(self, pictures_np): + return self.umap_fit.transform(pictures_np) + def save(self): Tools.create_dir_if_not_exists(self.save_directory) @@ -58,5 +61,5 @@ class N2DClustering(AbstractClustering): joblib.dump(self.kmeans_fit, os.path.join(self.save_directory, self.kmeans_save_name)) def load(self): - self.umap_fit = joblib.load(os.path.join(self.save_directory, self.pca_save_name)) + self.umap_fit = joblib.load(os.path.join(self.save_directory, self.umap_save_name)) self.kmeans_fit = joblib.load(os.path.join(self.save_directory, self.kmeans_save_name)) \ No newline at end of file diff 
--git a/iss/data/DataBaseManager.py b/iss/data/DataBaseManager.py index 6d64964..1465585 100644 --- a/iss/data/DataBaseManager.py +++ b/iss/data/DataBaseManager.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- -class DataBaseManager: +class MysqlDataBaseManager: def __init__(self, connexion, config): self.conn = connexion @@ -9,25 +9,66 @@ class DataBaseManager: self.cursor = self.conn.cursor() - def createPicturesTable(self, force = False): + def create_pictures_location_table(self, force = False): if force: - self.cursor.execute("DROP TABLE IF EXISTS `iss`.`pictures`;") + self.cursor.execute("DROP TABLE IF EXISTS `iss`.`pictures_location`;") self.cursor.execute(""" -CREATE TABLE `iss`.`pictures` ( +CREATE TABLE IF NOT EXISTS `iss`.`pictures_location` ( `pictures_latitude` FLOAT(10, 6) NULL, `pictures_longitude` FLOAT(10, 6 ) NULL , `pictures_id` VARCHAR( 15 ) PRIMARY KEY , `pictures_timestamp` TIMESTAMP NULL , - `pictures_location` TEXT NULL + `pictures_location_text` TEXT NULL ) ENGINE = MYISAM ; """) - def insertRowPictures(self, array): + def insert_row_pictures_location(self, array): - sql_insert_template = "INSERT INTO `iss`.`pictures` (pictures_latitude, pictures_longitude, pictures_id, pictures_timestamp, pictures_location) VALUES (%s, %s, %s, %s, %s);" + sql_insert_template = "INSERT INTO `iss`.`pictures_location` (pictures_latitude, pictures_longitude, pictures_id, pictures_timestamp, pictures_location_text) VALUES (%s, %s, %s, %s, %s);" + + self.cursor.executemany(sql_insert_template, array) + self.conn.commit() + + return self.cursor.rowcount + + + def create_pictures_embedding_table(self, force = False): + + if force: + self.cursor.execute("DROP TABLE IF EXISTS `iss`.`pictures_embedding`;") + + self.cursor.execute(""" +CREATE TABLE IF NOT EXISTS `iss`.`pictures_embedding` ( + `pictures_id` VARCHAR( 15 ) , + `pictures_x` FLOAT(8, 4), + `pictures_y` FLOAT(8, 4), + `clustering_type` VARCHAR(15), + `clustering_version` VARCHAR(5), + `clustering_model_type` 
VARCHAR(15), + `clustering_model_name` VARCHAR(15), + UNIQUE KEY `unique_key` (`pictures_id`,`clustering_type`, `clustering_version`, `clustering_model_type`,`clustering_model_name`), + KEY `index_key_1` (`pictures_id`) +) ENGINE = MYISAM ; + """) + + + def drop_embedding_partition(self, clustering_type, clustering_version, clustering_model_type, clustering_model_name): + + req = "DELETE FROM `iss`.`pictures_embedding` WHERE clustering_type = %s AND clustering_version = %s AND clustering_model_type = %s AND clustering_model_name = %s" + + self.cursor.execute(req, (clustering_type, clustering_version, clustering_model_type, clustering_model_name)) + + self.conn.commit() + + return self.cursor.rowcount + + + def insert_row_pictures_embedding(self, array): + + sql_insert_template = "INSERT INTO `iss`.`pictures_embedding` (pictures_id, pictures_x, pictures_y, clustering_type, clustering_version, clustering_model_type, clustering_model_name) VALUES (%s, %s, %s, %s, %s, %s, %s);" self.cursor.executemany(sql_insert_template, array) self.conn.commit() diff --git a/iss/exec/bdd.py b/iss/exec/bdd.py index 33d4721..10bd893 100644 --- a/iss/exec/bdd.py +++ b/iss/exec/bdd.py @@ -3,28 +3,96 @@ import time import mysql.connector import pandas as pd import datetime as dt +import numpy as np from iss.init_config import CONFIG -from iss.data.DataBaseManager import DataBaseManager - -CON_MYSQL = mysql.connector.connect( - host = CONFIG.get('mysql')['database']['server'], - user = CONFIG.get('mysql')['database']['user'], - passwd = CONFIG.get('mysql')['database']['password'], - database = CONFIG.get('mysql')['database']['name'], - port = CONFIG.get('mysql')['database']['port'] -) - -dbm = DataBaseManager(CON_MYSQL, CONFIG) +from iss.data.DataBaseManager import MysqlDataBaseManager +from iss.clustering import ClassicalClustering, AdvancedClustering, N2DClustering +from iss.tools import Tools -history = pd.read_csv(os.path.join(CONFIG.get("directory")['data_dir'], "raw", "history", 
"history.txt"), sep=";", names=['latitude', 'longitude', 'id', 'location']) -history['timestamp'] = pd.to_datetime(history.id, format="%Y%m%d-%H%M%S").dt.strftime("%Y-%m-%d %H:%M:%S") -history.fillna('NULL', inplace=True) -history = history[['latitude', 'longitude', 'id', 'timestamp', 'location']] -history_tuple = [tuple(x) for x in history.values] +def create_db_manager(config): -dbm.createPicturesTable(force=True) -count = dbm.insertRowPictures(history_tuple) + CON_MYSQL = mysql.connector.connect( + host = config.get('mysql')['database']['server'], + user = config.get('mysql')['database']['user'], + passwd = config.get('mysql')['database']['password'], + database = config.get('mysql')['database']['name'], + port = config.get('mysql')['database']['port'] + ) -print(count) \ No newline at end of file + return MysqlDataBaseManager(CON_MYSQL, config) + + +def populate_locations(config, db_manager): + + history = pd.read_csv(os.path.join(CONFIG.get("directory")['data_dir'], "raw", "history", "history.txt"), sep=";", names=['latitude', 'longitude', 'id', 'location']) + history['timestamp'] = pd.to_datetime(history.id, format="%Y%m%d-%H%M%S").dt.strftime("%Y-%m-%d %H:%M:%S") + history.fillna('NULL', inplace=True) + history = history[['latitude', 'longitude', 'id', 'timestamp', 'location']] + history_tuple = [tuple(x) for x in history.values] + + db_manager.create_pictures_location_table(force=True) + count = db_manager.insert_row_pictures_location(history_tuple) + + print("Nombre d'insertion: %s" % count) + + +def populate_embedding(config, db_manager, clustering_type, clustering_version, clustering_model_type, clustering_model_name, drop=False): + + db_manager.create_pictures_embedding_table() + clustering_config = config.get('clustering')[clustering_type] + clustering_config['version'] = clustering_version + clustering_config['model']['type'] = clustering_model_type + clustering_config['model']['name'] = clustering_model_name + + if drop: + 
db_manager.drop_embedding_partition(clustering_type, clustering_version, clustering_model_type, clustering_model_name) + + if clustering_type == 'n2d': + clustering = N2DClustering(clustering_config) + elif clustering_type == 'classical': + clustering = ClassicalClustering(clustering_config) + else: + raise Exception + + clustering.load() + model, model_config = Tools.load_model(CONFIG, clustering_model_type, clustering_model_name) + filenames = Tools.list_directory_filenames(CONFIG.get('directory')['collections']) + generator = Tools.load_latent_representation(CONFIG, model, model_config, filenames, 496, None, True) + + count = 0 + for ids, latents in generator: + pictures_embedding = clustering.predict_embedding(latents) + rows = [] + for i, id in enumerate(ids): + rows.append(( + id, + float(np.round(pictures_embedding[i][0], 4)), + float(np.round(pictures_embedding[i][1], 4)), + clustering_type, + clustering_version, + clustering_model_type, + clustering_model_name + )) + count += db_manager.insert_row_pictures_embedding(rows) + print("Nombre d'insertion: %s / %s" % (count, len(filenames))) + + + return + + +def main(action = 'populate_embedding'): + + db_manager = create_db_manager(CONFIG) + + if action == 'populate_locations': + populate_locations(CONFIG, db_manager) + elif action == 'populate_embedding': + populate_embedding(CONFIG, db_manager, 'n2d', 1, 'simple_conv', 'model_colab') + else: + pass + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/iss/exec/clustering.py b/iss/exec/clustering.py index 3b3069a..7e484cb 100644 --- a/iss/exec/clustering.py +++ b/iss/exec/clustering.py @@ -17,44 +17,6 @@ from iss.clustering import ClassicalClustering, AdvancedClustering, N2DClusterin _DEBUG = True -def load_model(config, clustering_type): - """ - Load model according to config - """ - - model_type = config.get('clustering')[clustering_type]['model']['type'] - model_name = config.get('clustering')[clustering_type]['model']['name'] 
- config.get('models')[model_type]['model_name'] = model_name - - if model_type == 'simple_conv': - model = SimpleConvAutoEncoder(config.get('models')[model_type]) - elif model_type == 'simple': - model = SimpleAutoEncoder(config.get('models')[model_type]) - else: - raise Exception - - model_config = config.get('models')[model_type] - - return model, model_config - - -def load_images(config, clustering_type, model, model_config, batch_size, n_batch): - """ - load images and predictions - """ - model_type = config.get('clustering')[clustering_type]['model']['type'] - filenames = Tools.list_directory_filenames(os.path.join(config.get('sampling')['autoencoder']['directory']['train'])) - generator_imgs = Tools.generator_np_picture_from_filenames(filenames, target_size = (model_config['input_height'], model_config['input_width']), batch = batch_size, nb_batch = n_batch) - - pictures_id, pictures_preds = Tools.encoded_pictures_from_generator(generator_imgs, model) - if model_type in ['simple_conv']: - intermediate_output = pictures_preds.reshape((pictures_preds.shape[0], model_config['latent_width']*model_config['latent_height']*model_config['latent_channel'])) - else: - intermediate_output = pictures_preds - - return pictures_id, intermediate_output - - def run_clustering(config, clustering_type, pictures_id, intermediate_output): """ Apply clustering on images @@ -217,9 +179,9 @@ def plot_mosaics(config, clustering_type, clustering, output_image_width, output def main(): - _CLUSTERING_TYPE = 'classical' + _CLUSTERING_TYPE = 'n2d' _BATCH_SIZE = 496 - _N_BATCH = 1 + _N_BATCH = 10 _PLOTS = True _MOSAICS = True _SILHOUETTE = True @@ -228,8 +190,9 @@ def main(): _MOSAIC_NROW = 10 _MOSAIC_NCOL_MAX = 10 - model, model_config = load_model(CONFIG, _CLUSTERING_TYPE) - pictures_id, intermediate_output = load_images(CONFIG, _CLUSTERING_TYPE, model, model_config, _BATCH_SIZE, _N_BATCH) + model, model_config = Tools.load_model(CONFIG, 
CONFIG.get('clustering')[_CLUSTERING_TYPE]['model']['type'], CONFIG.get('clustering')[_CLUSTERING_TYPE]['model']['name']) + filenames = Tools.list_directory_filenames(CONFIG.get('sampling')['autoencoder']['directory']['train']) + pictures_id, intermediate_output = Tools.load_latent_representation(CONFIG, model, model_config, filenames, _BATCH_SIZE, _N_BATCH, False) clustering = run_clustering(CONFIG, _CLUSTERING_TYPE, pictures_id, intermediate_output) diff --git a/iss/tools/tools.py b/iss/tools/tools.py index 9d65049..c2d1a8a 100644 --- a/iss/tools/tools.py +++ b/iss/tools/tools.py @@ -2,6 +2,7 @@ import PIL import os +import re import numpy as np from io import BytesIO import base64 @@ -57,19 +58,30 @@ class Tools: return path @staticmethod - def encoded_pictures_from_generator(generator, model): + def encoded_pictures_from_generator(generator, model, by_step=False): + if by_step: + return Tools.encoded_pictures_from_generator_by_step(generator, model) predictions_list = [] predictions_id = [] for imgs in generator: - predictions_id.append(imgs[0]) - predictions_list.append(model.get_encoded_prediction(imgs[1])) + tmp_id = [os.path.splitext(os.path.basename(id))[0] for id in imgs[0]] + tmp_pred = model.get_encoded_prediction(imgs[1]) + predictions_id += tmp_id + predictions_list.append(tmp_pred) predictions = np.concatenate(tuple(predictions_list), axis = 0) - predictions_id = [os.path.splitext(os.path.basename(id))[0] for sub_id in predictions_id for id in sub_id] return predictions_id, predictions + @staticmethod + def encoded_pictures_from_generator_by_step(generator, model): + for imgs in generator: + # tmp_id = [os.path.splitext(os.path.basename(id))[0] for sub_id in imgs[0] for id in sub_id] + tmp_id = [os.path.splitext(os.path.basename(id))[0] for id in imgs[0]] + tmp_pred = model.get_encoded_prediction(imgs[1]) + yield (tmp_id, tmp_pred) + @staticmethod def read_np_picture(path, target_size = None, scale = 1): # img = PIL.Image.open(filename) @@ -79,11 
+91,12 @@ class Tools: return img_np @staticmethod - def list_directory_filenames(path): + def list_directory_filenames(path, pattern = ".*jpg$"): filenames = os.listdir(path) np.random.seed(33213) np.random.shuffle(filenames) - filenames = [os.path.join(path,f) for f in filenames] + pattern_regex = re.compile(pattern) + filenames = [os.path.join(path,f) for f in filenames if pattern_regex.match(f)] return filenames @@ -138,4 +151,47 @@ class Tools: linkage_matrix = np.column_stack([children, distance, no_of_observations]).astype(float) # Plot the corresponding dendrogram - dendrogram(linkage_matrix, **kwargs) \ No newline at end of file + dendrogram(linkage_matrix, **kwargs) + + @staticmethod + def load_model(config, model_type, model_name): + """ + Load model according to config + """ + from iss.models import SimpleConvAutoEncoder, SimpleAutoEncoder + + config.get('models')[model_type]['model_name'] = model_name + + if model_type == 'simple_conv': + model = SimpleConvAutoEncoder(config.get('models')[model_type]) + elif model_type == 'simple': + model = SimpleAutoEncoder(config.get('models')[model_type]) + else: + raise Exception + + model_config = config.get('models')[model_type] + + return model, model_config + + @staticmethod + def load_latent_representation(config, model, model_config, filenames, batch_size, n_batch, by_step): + """ + load images and predictions + """ + if by_step: + return Tools.load_latent_representation_by_step(config, model, model_config, filenames, batch_size, n_batch) + + generator_imgs = Tools.generator_np_picture_from_filenames(filenames, target_size = (model_config['input_height'], model_config['input_width']), batch = batch_size, nb_batch = n_batch) + + pictures_id, pictures_preds = Tools.encoded_pictures_from_generator(generator_imgs, model, by_step) + intermediate_output = pictures_preds.reshape((pictures_preds.shape[0], -1)) + + return pictures_id, intermediate_output + + @staticmethod + def 
load_latent_representation_by_step(config, model, model_config, filenames, batch_size, n_batch): + generator_imgs = Tools.generator_np_picture_from_filenames(filenames, target_size = (model_config['input_height'], model_config['input_width']), batch = batch_size, nb_batch = n_batch) + + for pictures_id, pictures_preds in Tools.encoded_pictures_from_generator(generator_imgs, model, True): + intermediate_output = pictures_preds.reshape((pictures_preds.shape[0], -1)) + yield pictures_id, intermediate_output