Browse Source

clean facets code + doc + config template

master
Francois Vieille 3 months ago
parent
commit
beb5b5107e
7 changed files with 348 additions and 13 deletions
  1. +4
    -1
      Makefile
  2. +25
    -1
      README.md
  3. +172
    -11
      config/config.template.yaml
  4. BIN
      data/facets_dive_0.png
  5. BIN
      data/facets_dive_1.png
  6. BIN
      data/facets_dive_2.png
  7. +147
    -0
      iss/exec/facets.py

+ 4
- 1
Makefile View File

@@ -45,7 +45,7 @@ debug:

## Write config template
config_template:
$(PYTHON_INTERPRETER) iss/tools/config_template.py
$(PYTHON_INTERPRETER) -m iss.tools.config_template

## start docker
docker_start:
@@ -80,6 +80,9 @@ training:
exec_clustering:
$(PYTHON_INTERPRETER) -m iss.exec.clustering

## generate facets + facets-dive html reports
facets:
$(PYTHON_INTERPRETER) -m iss.exec.facets

posters:
$(PYTHON_INTERPRETER) -m iss.exec.posters --config-id=1 --generate=1 --poster-id='test'



+ 25
- 1
README.md View File

@@ -227,6 +227,10 @@ i use a special config file for floydhub so i provide a different `.env` file.

Training dashboard and dataset are public and available [here](https://www.floydhub.com/prise6/projects/smart-iss-posts/22)

```
make floyd_training_prod
```

I tested Google Colab and trained the final model with it, but the results are really similar to the floydhub model.

### Clustering
@@ -310,7 +314,27 @@ A bit messy.

#### Facets

*WIP*
Let's try [facets](https://pair-code.github.io/facets/) on this dataset! Thanks to the MySQL db, I can compare different clusterings and visualize them with facets-dive.

```
make facets
```

Two HTML pages are created in the directory `reports/`.

You can manipulate all your images:

![facets_dive_0](data/facets_dive_0.png)

Bin by cluster:

![facets_dive_1](data/facets_dive_1.png)

And zoom on it:

![facets_dive_2](data/facets_dive_2.png)

It's a bit messy because you cannot filter your data... but the sprite trick makes it fast!


### Posters


+ 172
- 11
config/config.template.yaml View File

@@ -1,12 +1,75 @@
clustering:
advanced:
PCA:
n_components: XXX
random_state: XXX
dbscan:
eps: XXX
min_samples: XXX
kmeans: XXX
save_directory: XXX
strong_kmeans:
high: XXX
iter: XXX
low: XXX
seed: XXX
threshold: XXX
version: XXX
classical:
CAH:
n_clusters: XXX
PCA:
n_components: XXX
random_state: XXX
TSNE:
n_components: XXX
kmeans:
n_clusters: XXX
random_state: XXX
model:
name: XXX
type: XXX
save_directory: XXX
version: XXX
dbscan:
dbscan:
min_cluster_size: XXX
min_samples: XXX
model:
name: XXX
type: XXX
save_directory: XXX
umap:
metric: XXX
min_dist: XXX
n_components: XXX
n_neighbors: XXX
random_state: XXX
version: XXX
n2d:
kmeans:
n_clusters: XXX
random_state: XXX
model:
name: XXX
type: XXX
save_directory: XXX
umap:
metric: XXX
min_dist: XXX
n_components: XXX
n_neighbors: XXX
random_state: XXX
version: XXX
directory:
autoencoder:
base: XXX
test: XXX
train: XXX
valid: XXX
collections: XXX
data_dir: XXX
isr_dir: XXX
project_dir: XXX
reports: XXX
models:
simple:
activation: XXX
batch_size: XXX
callbacks:
checkpoint:
@@ -23,6 +86,97 @@ models:
input_channel: XXX
input_height: XXX
input_width: XXX
latent_shape: XXX
learning_rate: XXX
model_name: XXX
sampling: XXX
save_directory: XXX
steps_per_epoch: XXX
use_multiprocessing: XXX
validation_freq: XXX
validation_steps: XXX
verbose: XXX
workers: XXX
simple_conv:
activation: XXX
batch_size: XXX
callbacks:
checkpoint:
directory: XXX
period: XXX
verbose: XXX
csv_logger:
append: XXX
directory: XXX
floyd: XXX
tensorboard:
limit_image: XXX
log_dir: XXX
epochs: XXX
initial_epoch: XXX
input_channel: XXX
input_height: XXX
input_width: XXX
latent_channel: XXX
latent_height: XXX
latent_width: XXX
learning_rate: XXX
model_name: XXX
sampling: XXX
save_directory: XXX
steps_per_epoch: XXX
use_multiprocessing: XXX
validation_freq: XXX
validation_steps: XXX
verbose: XXX
workers: XXX
variational:
activation: XXX
batch_size: XXX
callbacks:
checkpoint:
directory: XXX
period: XXX
verbose: XXX
csv_logger:
append: XXX
directory: XXX
display_picture:
epoch_laps: XXX
epochs: XXX
initial_epoch: XXX
input_channel: XXX
input_height: XXX
input_width: XXX
latent_shape: XXX
learning_rate: XXX
model_name: XXX
save_directory: XXX
steps_per_epoch: XXX
use_multiprocessing: XXX
validation_freq: XXX
validation_steps: XXX
verbose: XXX
workers: XXX
variational_conv:
activation: XXX
batch_size: XXX
callbacks:
checkpoint:
directory: XXX
period: XXX
verbose: XXX
csv_logger:
append: XXX
directory: XXX
display_picture:
epoch_laps: XXX
epochs: XXX
initial_epoch: XXX
input_channel: XXX
input_height: XXX
input_width: XXX
latent_shape: XXX
learning_rate: XXX
model_name: XXX
save_directory: XXX
@@ -39,10 +193,17 @@ mysql:
port: XXX
server: XXX
user: XXX
training:
proportions:
test: XXX
train: XXX
valid: XXX
seed: XXX
sampling:
autoencoder:
directory:
base: XXX
from: XXX
test: XXX
train: XXX
valid: XXX
proportions:
test: XXX
train: XXX
valid: XXX
seed: XXX
version: XXX

BIN
data/facets_dive_0.png View File

Before After
Width: 1472  |  Height: 696  |  Size: 1.7MB

BIN
data/facets_dive_1.png View File

Before After
Width: 1472  |  Height: 696  |  Size: 398KB

BIN
data/facets_dive_2.png View File

Before After
Width: 1472  |  Height: 696  |  Size: 590KB

+ 147
- 0
iss/exec/facets.py View File

@@ -0,0 +1,147 @@
import os
import base64
import pandas as pd
import numpy as np
from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator

from iss.init_config import CONFIG
from iss.tools import Tools


# Sprite atlas layout: 145 rows x 100 columns = 14500 thumbnail cells.
SPRITE_NB_LIGNE = 145
SPRITE_NB_COLONNE = 100
# Per-thumbnail cell size in pixels (96x54, a 16:9 aspect ratio).
TARGET_SIZE_WIDTH = 48*2
TARGET_SIZE_HEIGHT = 27*2
# Max rows fetched from the DB; kept just under the 14500-cell atlas capacity.
LIMIT = 14499

def request_data(config, db_manager):
    """Fetch one row per picture, joining three clustering versions plus location.

    Self-joins `pictures_embedding` so that clustering versions 1, 2 and 3
    (same clustering type / model type / model name) land on a single row,
    then LEFT JOINs location metadata (may be NULL).

    Args:
        config: unused here; kept for signature consistency with the other steps.
        db_manager: project DB manager exposing a MySQL-style `cursor`.

    Returns:
        pandas.DataFrame with columns named after the SELECT aliases,
        ordered by pictures_id, capped at LIMIT rows.
    """

    # NOTE: the v2 join previously compared v2.clustering_model_name with
    # itself (always true), which could multiply rows across model names.
    # Fixed to match v1, mirroring the v3 join below.
    sql = """
    SELECT
        v1.pictures_id,

        v1.pictures_x as v1_x,
        v1.pictures_y as v1_y,
        CAST(v1.label AS CHAR) as v1_label,

        v2.pictures_x as v2_x,
        v2.pictures_y as v2_y,
        CAST(v2.label AS CHAR) as v2_label,

        v3.pictures_x as v3_x,
        v3.pictures_y as v3_y,
        CAST(v3.label AS CHAR) as v3_label,

        loc.pictures_timestamp,
        loc.pictures_location_text,
        loc.pictures_latitude,
        loc.pictures_longitude

    FROM iss.pictures_embedding AS v1

    INNER JOIN iss.pictures_embedding v2
        ON v1.pictures_id = v2.pictures_id
        AND v2.clustering_type = v1.clustering_type
        AND v2.clustering_model_type = v1.clustering_model_type
        AND v2.clustering_model_name = v1.clustering_model_name
        AND v2.clustering_version = 2

    INNER JOIN iss.pictures_embedding v3
        ON v1.pictures_id = v3.pictures_id
        AND v3.clustering_type = v1.clustering_type
        AND v3.clustering_model_type = v1.clustering_model_type
        AND v3.clustering_model_name = v1.clustering_model_name
        AND v3.clustering_version = 3

    LEFT JOIN iss.pictures_location loc
        ON loc.pictures_id = v1.pictures_id

    WHERE v1.clustering_version = %s
    ORDER BY pictures_id ASC LIMIT %s"""

    # Parameterized query: version 1 anchors the self-join; LIMIT caps the
    # result so it fits inside the sprite atlas (see module constants).
    db_manager.cursor.execute(sql, (1, LIMIT))
    results = db_manager.cursor.fetchall()
    return pd.DataFrame(results, columns=db_manager.cursor.column_names)


def create_sprite(config, df):
    """Assemble every picture thumbnail into one sprite-atlas image.

    Reads each `<pictures_id>.jpg` from the collections directory at the
    TARGET_SIZE cell resolution and tiles them row-major into a
    SPRITE_NB_LIGNE x SPRITE_NB_COLONNE grid. Cells beyond the last image
    stay black (zeros); images beyond the grid capacity are ignored.

    Returns the atlas as produced by Tools.display_one_picture.
    """

    collections_dir = config.get('directory')['collections']
    thumbnails = [
        Tools.read_np_picture(
            os.path.join(collections_dir, "%s.jpg" % picture_id),
            target_size=(TARGET_SIZE_HEIGHT, TARGET_SIZE_WIDTH),
        )
        for picture_id in df['pictures_id']
    ]

    atlas = np.zeros((
        TARGET_SIZE_HEIGHT * SPRITE_NB_LIGNE,
        TARGET_SIZE_WIDTH * SPRITE_NB_COLONNE,
        3,
    ))

    # Row-major placement: cell index -> (row, col) via divmod.
    for cell, thumbnail in enumerate(thumbnails):
        row, col = divmod(cell, SPRITE_NB_COLONNE)
        if row >= SPRITE_NB_LIGNE:
            break
        top = row * TARGET_SIZE_HEIGHT
        left = col * TARGET_SIZE_WIDTH
        atlas[top:top + TARGET_SIZE_HEIGHT, left:left + TARGET_SIZE_WIDTH, :] = thumbnail

    return Tools.display_one_picture(atlas)


def generate_facets(config, df):
    """Build a standalone facets-overview HTML page for the dataframe.

    Serializes the dataframe's feature statistics to a base64-encoded proto
    and embeds it into the facets-overview web component template.

    Returns the HTML document as a string.
    """

    stats_proto = GenericFeatureStatisticsGenerator().ProtoFromDataFrames(
        [{'name': 'facets-iss', 'table': df}]
    )
    encoded_proto = base64.b64encode(stats_proto.SerializeToString()).decode("utf-8")

    template = """
<script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
<link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html" >
<facets-overview id="elem"></facets-overview>
<script>
document.querySelector("#elem").protoInput = "{protostr}";
</script>"""

    return template.format(protostr=encoded_proto)

def generate_facets_dive(config, df, relative_sprite_path, sprite_width=None, sprite_height=None):
    """Build a standalone facets-dive HTML page for the dataframe.

    The dataframe is serialized to JSON records and embedded in the
    facets-dive web component template, together with the sprite atlas URL
    so each data point renders as its image thumbnail.

    Args:
        config: unused here; kept for signature consistency with generate_facets.
        df: pandas DataFrame, one row per picture.
        relative_sprite_path: atlas URL relative to the emitted HTML file.
        sprite_width: per-thumbnail cell width in the atlas; defaults to
            the module-level TARGET_SIZE_WIDTH.
        sprite_height: per-thumbnail cell height; defaults to TARGET_SIZE_HEIGHT.

    Returns:
        The HTML document as a string.
    """

    # Fall back to the module constants so existing callers are unchanged,
    # while allowing other atlas geometries.
    if sprite_width is None:
        sprite_width = TARGET_SIZE_WIDTH
    if sprite_height is None:
        sprite_height = TARGET_SIZE_HEIGHT

    jsonstr = df.to_json(orient='records')
    HTML_TEMPLATE = """
<script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
<link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html">
<facets-dive id="elem" height="600" cross-origin="anonymous" sprite-image-width="{sprite_width}" sprite-image-height="{sprite_height}">
</facets-dive>
<script>
var data = {jsonstr};
var atlas_url = "{atlas_url}";
document.querySelector("#elem").data = data;
document.querySelector("#elem").atlasUrl = atlas_url;
</script>"""
    html = HTML_TEMPLATE.format(jsonstr=jsonstr, atlas_url=relative_sprite_path,
                                sprite_width=sprite_width, sprite_height=sprite_height)
    return html


def main():
    """Entry point: export facets-overview and facets-dive HTML reports.

    Pipeline: fetch clustering rows from MySQL, build the sprite atlas from
    the picture thumbnails, save it under reports/figures/, then write
    reports/facets.html and reports/facets-dive.html referencing it.
    """

    ## db manager
    db_manager = Tools.create_db_manager(CONFIG)

    ## request data
    df = request_data(CONFIG, db_manager)

    ## create sprite
    sprite = create_sprite(CONFIG, df)

    ## save sprite (fixed filename typo: "altas" -> "atlas"; the atlasUrl
    ## passed to facets-dive below must match this file name)
    reports_dir = CONFIG.get('directory')['reports']
    sprite.save(os.path.join(reports_dir, 'figures', 'sprite_atlas.png'), "PNG")

    ## generate facets overview page
    html_facets = generate_facets(CONFIG, df)
    with open(os.path.join(reports_dir, 'facets.html'), 'w') as f:
        f.write(html_facets)

    ## generate facets-dive page (atlas path is relative to reports/)
    html_facets_dive = generate_facets_dive(CONFIG, df, './figures/sprite_atlas.png')
    with open(os.path.join(reports_dir, 'facets-dive.html'), 'w') as f:
        f.write(html_facets_dive)


if __name__ == '__main__':
    main()

Loading…
Cancel
Save