diff --git a/Makefile b/Makefile index 3db5089..a49b833 100644 --- a/Makefile +++ b/Makefile @@ -45,7 +45,7 @@ debug: ## Write config template config_template: - $(PYTHON_INTERPRETER) iss/tools/config_template.py + $(PYTHON_INTERPRETER) -m iss.tools.config_template ## start docker docker_start: @@ -80,6 +80,9 @@ training: exec_clustering: $(PYTHON_INTERPRETER) -m iss.exec.clustering +facets: + $(PYTHON_INTERPRETER) -m iss.exec.facets + posters: $(PYTHON_INTERPRETER) -m iss.exec.posters --config-id=1 --generate=1 --poster-id='test' diff --git a/README.md b/README.md index 94f5720..cfb836e 100644 --- a/README.md +++ b/README.md @@ -227,6 +227,10 @@ i use a special config file for floydhub so i provide a different `.env` file. Training dashboard and dataset are public and available [here](https://www.floydhub.com/prise6/projects/smart-iss-posts/22) +``` +make floyd_training_prod +``` + I tested google colab and train the final model with it, but result are really similar to the floydhub model. ### Clustering @@ -310,7 +314,27 @@ A bit messy. #### Facets -*WIP* +Let's try [facets](https://pair-code.github.io/facets/) on this dataset ! Thanks to mysql db i can compare different clustering and visualize it with facets-dive. + +``` +make facets +``` + +Two html page are created in the directory `reports/`. + +You can manipulate all your images: + +![facets_dive_0](data/facets_dive_0.png) + +Bin by cluster: + +![facets_dive_0](data/facets_dive_1.png) + +And zoom on it: + +![facets_dive_0](data/facets_dive_2.png) + +It's a bit messy because you cannot filter your data ... but the sprite trick make it fast! 
### Posters diff --git a/config/config.template.yaml b/config/config.template.yaml index c0ab64b..05a6d0e 100644 --- a/config/config.template.yaml +++ b/config/config.template.yaml @@ -1,12 +1,75 @@ +clustering: + advanced: + PCA: + n_components: XXX + random_state: XXX + dbscan: + eps: XXX + min_samples: XXX + kmeans: XXX + save_directory: XXX + strong_kmeans: + high: XXX + iter: XXX + low: XXX + seed: XXX + threshold: XXX + version: XXX + classical: + CAH: + n_clusters: XXX + PCA: + n_components: XXX + random_state: XXX + TSNE: + n_components: XXX + kmeans: + n_clusters: XXX + random_state: XXX + model: + name: XXX + type: XXX + save_directory: XXX + version: XXX + dbscan: + dbscan: + min_cluster_size: XXX + min_samples: XXX + model: + name: XXX + type: XXX + save_directory: XXX + umap: + metric: XXX + min_dist: XXX + n_components: XXX + n_neighbors: XXX + random_state: XXX + version: XXX + n2d: + kmeans: + n_clusters: XXX + random_state: XXX + model: + name: XXX + type: XXX + save_directory: XXX + umap: + metric: XXX + min_dist: XXX + n_components: XXX + n_neighbors: XXX + random_state: XXX + version: XXX directory: - autoencoder: - base: XXX - test: XXX - train: XXX - valid: XXX collections: XXX + data_dir: XXX + isr_dir: XXX + project_dir: XXX + reports: XXX models: simple: + activation: XXX batch_size: XXX callbacks: checkpoint: @@ -23,6 +86,97 @@ models: input_channel: XXX input_height: XXX input_width: XXX + latent_shape: XXX + learning_rate: XXX + model_name: XXX + sampling: XXX + save_directory: XXX + steps_per_epoch: XXX + use_multiprocessing: XXX + validation_freq: XXX + validation_steps: XXX + verbose: XXX + workers: XXX + simple_conv: + activation: XXX + batch_size: XXX + callbacks: + checkpoint: + directory: XXX + period: XXX + verbose: XXX + csv_logger: + append: XXX + directory: XXX + floyd: XXX + tensorboard: + limit_image: XXX + log_dir: XXX + epochs: XXX + initial_epoch: XXX + input_channel: XXX + input_height: XXX + input_width: XXX + 
latent_channel: XXX + latent_height: XXX + latent_width: XXX + learning_rate: XXX + model_name: XXX + sampling: XXX + save_directory: XXX + steps_per_epoch: XXX + use_multiprocessing: XXX + validation_freq: XXX + validation_steps: XXX + verbose: XXX + workers: XXX + variational: + activation: XXX + batch_size: XXX + callbacks: + checkpoint: + directory: XXX + period: XXX + verbose: XXX + csv_logger: + append: XXX + directory: XXX + display_picture: + epoch_laps: XXX + epochs: XXX + initial_epoch: XXX + input_channel: XXX + input_height: XXX + input_width: XXX + latent_shape: XXX + learning_rate: XXX + model_name: XXX + save_directory: XXX + steps_per_epoch: XXX + use_multiprocessing: XXX + validation_freq: XXX + validation_steps: XXX + verbose: XXX + workers: XXX + variational_conv: + activation: XXX + batch_size: XXX + callbacks: + checkpoint: + directory: XXX + period: XXX + verbose: XXX + csv_logger: + append: XXX + directory: XXX + display_picture: + epoch_laps: XXX + epochs: XXX + initial_epoch: XXX + input_channel: XXX + input_height: XXX + input_width: XXX + latent_shape: XXX learning_rate: XXX model_name: XXX save_directory: XXX @@ -39,10 +193,17 @@ mysql: port: XXX server: XXX user: XXX -training: - proportions: - test: XXX - train: XXX - valid: XXX - seed: XXX +sampling: + autoencoder: + directory: + base: XXX + from: XXX + test: XXX + train: XXX + valid: XXX + proportions: + test: XXX + train: XXX + valid: XXX + seed: XXX version: XXX diff --git a/data/facets_dive_0.png b/data/facets_dive_0.png new file mode 100644 index 0000000..b7b161e Binary files /dev/null and b/data/facets_dive_0.png differ diff --git a/data/facets_dive_1.png b/data/facets_dive_1.png new file mode 100644 index 0000000..8c1d1d1 Binary files /dev/null and b/data/facets_dive_1.png differ diff --git a/data/facets_dive_2.png b/data/facets_dive_2.png new file mode 100644 index 0000000..217bf8a Binary files /dev/null and b/data/facets_dive_2.png differ diff --git a/iss/exec/facets.py 
# iss/exec/facets.py
#
# Build facets-overview / facets-dive HTML reports for the clustering
# results stored in MySQL, plus a sprite atlas image of all pictures.
# Run as a module: `python -m iss.exec.facets` (see Makefile target `facets`).
import os
import base64

import pandas as pd
import numpy as np
from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator

from iss.init_config import CONFIG
from iss.tools import Tools


# Sprite atlas grid: 145 * 100 = 14500 cells for at most LIMIT = 14499
# images — the last cell intentionally stays black.
SPRITE_NB_LIGNE = 145
SPRITE_NB_COLONNE = 100
# Each thumbnail is a 2x-scaled 48x27 (16:9) crop.
TARGET_SIZE_WIDTH = 48 * 2
TARGET_SIZE_HEIGHT = 27 * 2
LIMIT = 14499


def request_data(config, db_manager):
    """Fetch one row per picture, joining clustering versions 1, 2 and 3.

    The query anchors on clustering version 1 (`v1`) and self-joins the
    `pictures_embedding` table to pull the (x, y, label) triple of versions
    2 and 3 for the *same* clustering type / model type / model name, plus
    optional location metadata.

    :param config: application config (unused here, kept for signature
        consistency with the other helpers).
    :param db_manager: project DB manager exposing a live MySQL `cursor`.
    :return: pandas DataFrame with one row per picture, ordered by id,
        capped at LIMIT rows.
    """
    sql = """
    SELECT
        v1.pictures_id,

        v1.pictures_x as v1_x,
        v1.pictures_y as v1_y,
        CAST(v1.label AS CHAR) as v1_label,

        v2.pictures_x as v2_x,
        v2.pictures_y as v2_y,
        CAST(v2.label AS CHAR) as v2_label,

        v3.pictures_x as v3_x,
        v3.pictures_y as v3_y,
        CAST(v3.label AS CHAR) as v3_label,

        loc.pictures_timestamp,
        loc.pictures_location_text,
        loc.pictures_latitude,
        loc.pictures_longitude

    FROM iss.pictures_embedding AS v1

    INNER JOIN iss.pictures_embedding v2
        ON v1.pictures_id = v2.pictures_id
        AND v2.clustering_type = v1.clustering_type
        AND v2.clustering_model_type = v1.clustering_model_type
        -- BUGFIX: was `v2.clustering_model_name = v2.clustering_model_name`
        -- (a tautological self-comparison); must match v1's model name,
        -- exactly as the v3 join below does.
        AND v2.clustering_model_name = v1.clustering_model_name
        AND v2.clustering_version = 2

    INNER JOIN iss.pictures_embedding v3
        ON v1.pictures_id = v3.pictures_id
        AND v3.clustering_type = v1.clustering_type
        AND v3.clustering_model_type = v1.clustering_model_type
        AND v3.clustering_model_name = v1.clustering_model_name
        AND v3.clustering_version = 3

    LEFT JOIN iss.pictures_location loc
        ON loc.pictures_id = v1.pictures_id

    WHERE v1.clustering_version = %s
    ORDER BY pictures_id ASC LIMIT %s"""

    # Parameterized execution (no string interpolation into SQL).
    db_manager.cursor.execute(sql, (1, LIMIT))
    results = db_manager.cursor.fetchall()

    return pd.DataFrame(results, columns=db_manager.cursor.column_names)


def create_sprite(config, df):
    """Assemble a single sprite-atlas image from all pictures in `df`.

    Thumbnails are read from the collections directory as `<id>.jpg`,
    resized to (TARGET_SIZE_HEIGHT, TARGET_SIZE_WIDTH), and laid out
    row-major on a SPRITE_NB_LIGNE x SPRITE_NB_COLONNE grid. Unused cells
    (when fewer than 14500 images) remain black.

    :param config: application config providing `directory.collections`.
    :param df: DataFrame with a `pictures_id` column (output of
        `request_data`).
    :return: PIL image of the sprite (whatever `Tools.display_one_picture`
        returns for a numpy array).
    """
    images_array = [
        Tools.read_np_picture(
            os.path.join(config.get('directory')['collections'], "%s.jpg" % picture_id),
            target_size=(TARGET_SIZE_HEIGHT, TARGET_SIZE_WIDTH),
        )
        for picture_id in df['pictures_id']
    ]
    sprite = np.zeros((TARGET_SIZE_HEIGHT * SPRITE_NB_LIGNE, TARGET_SIZE_WIDTH * SPRITE_NB_COLONNE, 3))
    index = 0
    for i in range(SPRITE_NB_LIGNE):
        for j in range(SPRITE_NB_COLONNE):
            sprite[(i * TARGET_SIZE_HEIGHT):(i + 1) * TARGET_SIZE_HEIGHT,
                   (j * TARGET_SIZE_WIDTH):(j + 1) * TARGET_SIZE_WIDTH, :] = images_array[index]
            index += 1
            if index >= len(images_array):
                break
        if index >= len(images_array):
            break

    img = Tools.display_one_picture(sprite)
    return img


def generate_facets(config, df):
    """Render a standalone facets-overview HTML page for `df`.

    The feature-statistics proto is serialized and base64-embedded so the
    page is self-contained (assets are loaded from CDNs).
    """
    proto = GenericFeatureStatisticsGenerator().ProtoFromDataFrames([{'name': 'facets-iss', 'table': df}])
    protostr = base64.b64encode(proto.SerializeToString()).decode("utf-8")

    # NOTE(review): the original template markup was lost in extraction;
    # this is the canonical facets-overview embedding from the PAIR-code
    # facets documentation — confirm against the original file.
    HTML_TEMPLATE = """<script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
<link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html">
<facets-overview id="elem"></facets-overview>
<script>
  document.querySelector("#elem").protoInput = "{protostr}";
</script>"""
    html = HTML_TEMPLATE.format(protostr=protostr)

    return html


def generate_facets_dive(config, df, relative_sprite_path):
    """Render a standalone facets-dive HTML page for `df`.

    :param relative_sprite_path: path of the sprite atlas relative to the
        generated HTML file (used as the dive atlas URL).
    """
    jsonstr = df.to_json(orient='records')
    # NOTE(review): template reconstructed from the facets-dive docs (the
    # original markup was lost in extraction) — confirm against the
    # original file.
    HTML_TEMPLATE = """<script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
<link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html">
<facets-dive id="elem" height="600" sprite-image-width="{sprite_width}" sprite-image-height="{sprite_height}"></facets-dive>
<script>
  var elem = document.querySelector("#elem");
  elem.data = {jsonstr};
  elem.atlasUrl = "{atlas_url}";
</script>"""
    html = HTML_TEMPLATE.format(
        jsonstr=jsonstr,
        atlas_url=relative_sprite_path,
        sprite_width=TARGET_SIZE_WIDTH,
        sprite_height=TARGET_SIZE_HEIGHT,
    )

    return html


def main():
    """Query the DB, build the sprite atlas and write both HTML reports."""
    ## db manager
    db_manager = Tools.create_db_manager(CONFIG)

    ## request data
    df = request_data(CONFIG, db_manager)

    ## create sprite
    sprite = create_sprite(CONFIG, df)

    ## save sprite
    # NOTE(review): 'altas' is a typo for 'atlas', but the name is used
    # consistently by writer and reader below, so it is kept to preserve
    # the published artifact name.
    sprite.save(os.path.join(CONFIG.get('directory')['reports'], 'figures', 'sprite_altas.png'), "PNG")

    ## generate facets
    html_facets = generate_facets(CONFIG, df)
    with open(os.path.join(CONFIG.get('directory')['reports'], 'facets.html'), 'w') as f:
        f.write(html_facets)

    ## generate facets-dive
    html_facets_dive = generate_facets_dive(CONFIG, df, './figures/sprite_altas.png')
    with open(os.path.join(CONFIG.get('directory')['reports'], 'facets-dive.html'), 'w') as f:
        f.write(html_facets_dive)


if __name__ == '__main__':
    main()