clean facets code + doc + config template

2020-02-09 20:17:02 +01:00 · 2020-02-09 20:17:02 +01:00 · beb5b5107e
parent e0a87c0f3d
commit beb5b5107e
7 changed files with 348 additions and 13 deletions
--- a/5
+++ b/5
@ -45,7 +45,7 @@ debug:
 ## Write config template
 config_template:
-	$(PYTHON_INTERPRETER) iss/tools/config_template.py
+	$(PYTHON_INTERPRETER) -m iss.tools.config_template
 ## start docker
 docker_start:
@ -80,6 +80,9 @@ training:
 exec_clustering:
 	$(PYTHON_INTERPRETER) -m iss.exec.clustering
 facets:
 	$(PYTHON_INTERPRETER) -m iss.exec.facets
 posters:
 	$(PYTHON_INTERPRETER) -m iss.exec.posters --config-id=1 --generate=1 --poster-id='test'
--- a/README.md
+++ b/README.md
@ -227,6 +227,10 @@ i use a special config file for floydhub so i provide a different `.env` file.
 Training dashboard and dataset are public and available [here](https://www.floydhub.com/prise6/projects/smart-iss-posts/22) 
 ```
 make floyd_training_prod
 ```
 I tested Google Colab and trained the final model with it, but the results are really similar to the FloydHub model.
 ### Clustering
@ -310,7 +314,27 @@ A bit messy.
 #### Facets
-*WIP*
+Let's try [facets](https://pair-code.github.io/facets/) on this dataset! Thanks to the MySQL database, I can compare different clusterings and visualize them with facets-dive.
 ```
 make facets
 ```
 Two HTML pages are created in the `reports/` directory.
 You can manipulate all your images:
 ![facets_dive_0](data/facets_dive_0.png)
 Bin by cluster:
 ![facets_dive_1](data/facets_dive_1.png)
 And zoom on it:
 ![facets_dive_2](data/facets_dive_2.png)
 It's a bit messy because you cannot filter your data... but the sprite trick makes it fast!
 ### Posters
--- a/config/config.template.yaml
+++ b/config/config.template.yaml
@ -1,12 +1,75 @@
 clustering:
  advanced:
    PCA:
      n_components: XXX
      random_state: XXX
    dbscan:
      eps: XXX
      min_samples: XXX
    kmeans: XXX
    save_directory: XXX
    strong_kmeans:
      high: XXX
      iter: XXX
      low: XXX
      seed: XXX
      threshold: XXX
    version: XXX
  classical:
    CAH:
      n_clusters: XXX
    PCA:
      n_components: XXX
      random_state: XXX
    TSNE:
      n_components: XXX
    kmeans:
      n_clusters: XXX
      random_state: XXX
    model:
      name: XXX
      type: XXX
    save_directory: XXX
    version: XXX
  dbscan:
    dbscan:
      min_cluster_size: XXX
      min_samples: XXX
    model:
      name: XXX
      type: XXX
    save_directory: XXX
    umap:
      metric: XXX
      min_dist: XXX
      n_components: XXX
      n_neighbors: XXX
      random_state: XXX
    version: XXX
  n2d:
    kmeans:
      n_clusters: XXX
      random_state: XXX
    model:
      name: XXX
      type: XXX
    save_directory: XXX
    umap:
      metric: XXX
      min_dist: XXX
      n_components: XXX
      n_neighbors: XXX
      random_state: XXX
    version: XXX
 directory:
  autoencoder:
    base: XXX
    test: XXX
    train: XXX
    valid: XXX
  collections: XXX
  data_dir: XXX
  isr_dir: XXX
  project_dir: XXX
  reports: XXX
 models:
  simple:
    activation: XXX
    batch_size: XXX
    callbacks:
      checkpoint:
@ -23,6 +86,97 @@ models:
    input_channel: XXX
    input_height: XXX
    input_width: XXX
    latent_shape: XXX
    learning_rate: XXX
    model_name: XXX
    sampling: XXX
    save_directory: XXX
    steps_per_epoch: XXX
    use_multiprocessing: XXX
    validation_freq: XXX
    validation_steps: XXX
    verbose: XXX
    workers: XXX
  simple_conv:
    activation: XXX
    batch_size: XXX
    callbacks:
      checkpoint:
        directory: XXX
        period: XXX
        verbose: XXX
      csv_logger:
        append: XXX
        directory: XXX
      floyd: XXX
      tensorboard:
        limit_image: XXX
        log_dir: XXX
    epochs: XXX
    initial_epoch: XXX
    input_channel: XXX
    input_height: XXX
    input_width: XXX
    latent_channel: XXX
    latent_height: XXX
    latent_width: XXX
    learning_rate: XXX
    model_name: XXX
    sampling: XXX
    save_directory: XXX
    steps_per_epoch: XXX
    use_multiprocessing: XXX
    validation_freq: XXX
    validation_steps: XXX
    verbose: XXX
    workers: XXX
  variational:
    activation: XXX
    batch_size: XXX
    callbacks:
      checkpoint:
        directory: XXX
        period: XXX
        verbose: XXX
      csv_logger:
        append: XXX
        directory: XXX
      display_picture:
        epoch_laps: XXX
    epochs: XXX
    initial_epoch: XXX
    input_channel: XXX
    input_height: XXX
    input_width: XXX
    latent_shape: XXX
    learning_rate: XXX
    model_name: XXX
    save_directory: XXX
    steps_per_epoch: XXX
    use_multiprocessing: XXX
    validation_freq: XXX
    validation_steps: XXX
    verbose: XXX
    workers: XXX
  variational_conv:
    activation: XXX
    batch_size: XXX
    callbacks:
      checkpoint:
        directory: XXX
        period: XXX
        verbose: XXX
      csv_logger:
        append: XXX
        directory: XXX
      display_picture:
        epoch_laps: XXX
    epochs: XXX
    initial_epoch: XXX
    input_channel: XXX
    input_height: XXX
    input_width: XXX
    latent_shape: XXX
    learning_rate: XXX
    model_name: XXX
    save_directory: XXX
@ -39,10 +193,17 @@ mysql:
    port: XXX
    server: XXX
    user: XXX
-training:
+sampling:
-  proportions:
+  autoencoder:
-    test: XXX
+    directory:
-    train: XXX
+      base: XXX
-    valid: XXX
+      from: XXX
-  seed: XXX
+      test: XXX
      train: XXX
      valid: XXX
    proportions:
      test: XXX
      train: XXX
      valid: XXX
    seed: XXX
 version: XXX
--- a/data/facets_dive_0.png
+++ b/data/facets_dive_0.png
--- a/data/facets_dive_1.png
+++ b/data/facets_dive_1.png
--- a/data/facets_dive_2.png
+++ b/data/facets_dive_2.png
--- a/iss/exec/facets.py
+++ b/iss/exec/facets.py
@ -0,0 +1,147 @@
 import os
 import base64
 import pandas as pd
 import numpy as np
 from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator
 from iss.init_config import CONFIG
 from iss.tools import Tools
 # Sprite atlas grid: number of thumbnail rows ("ligne") and columns
 # ("colonne") laid out by create_sprite().
 SPRITE_NB_LIGNE = 145
 SPRITE_NB_COLONNE = 100
 # Width and height of each thumbnail placed in the sprite atlas; also passed
 # to facets-dive as sprite-image-width / sprite-image-height.
 TARGET_SIZE_WIDTH = 48*2
 TARGET_SIZE_HEIGHT = 27*2
 # Maximum number of rows fetched by request_data(); one less than the
 # 145 * 100 = 14500 cells available in the sprite grid.
 LIMIT = 14499
 def request_data(config, db_manager):
    """Load the embeddings of three clustering versions side by side.

    ``iss.pictures_embedding`` is joined with itself on ``pictures_id`` so
    that each returned row carries the x/y coordinates and the label of a
    picture for clustering versions 1 (``v1_*``), 2 (``v2_*``) and 3
    (``v3_*``). Versions 2 and 3 must share the clustering type, model type
    and model name of the version 1 row. Location metadata is attached with a
    LEFT JOIN, so pictures without a location row are kept.

    Args:
        config: unused here; kept so the signature matches the other helpers
            of this module.
        db_manager: object exposing a DB-API style ``cursor`` with
            ``execute``, ``fetchall`` and ``column_names`` (presumably a
            mysql-connector cursor -- confirm against
            ``Tools.create_db_manager``).

    Returns:
        pandas.DataFrame with one row per picture, at most ``LIMIT`` rows,
        ordered by ``pictures_id``; columns are named after the cursor's
        ``column_names``.
    """
    # NOTE: the v2 join used to compare v2.clustering_model_name with itself,
    # which never filtered on the model name; it now mirrors the v3 join.
    sql = """
    SELECT
        v1.pictures_id,
        v1.pictures_x as v1_x,
        v1.pictures_y as v1_y,
        CAST(v1.label AS CHAR) as v1_label,
        v2.pictures_x as v2_x,
        v2.pictures_y as v2_y,
        CAST(v2.label AS CHAR) as v2_label,
        v3.pictures_x as v3_x,
        v3.pictures_y as v3_y,
        CAST(v3.label AS CHAR) as v3_label,
        loc.pictures_timestamp,
        loc.pictures_location_text,
        loc.pictures_latitude,
        loc.pictures_longitude
    FROM iss.pictures_embedding AS v1
    INNER JOIN iss.pictures_embedding v2
    ON v1.pictures_id = v2.pictures_id
    AND v2.clustering_type = v1.clustering_type
    AND v2.clustering_model_type = v1.clustering_model_type
    AND v2.clustering_model_name = v1.clustering_model_name
    AND v2.clustering_version = 2
    INNER JOIN iss.pictures_embedding v3
    ON v1.pictures_id = v3.pictures_id
    AND v3.clustering_type = v1.clustering_type
    AND v3.clustering_model_type = v1.clustering_model_type
    AND v3.clustering_model_name = v1.clustering_model_name
    AND v3.clustering_version = 3
    LEFT JOIN iss.pictures_location loc
    ON loc.pictures_id = v1.pictures_id
    WHERE v1.clustering_version = %s
    ORDER BY pictures_id ASC LIMIT %s"""
    cursor = db_manager.cursor
    # Values are bound by the driver: reference clustering version, row cap.
    cursor.execute(sql, (1, LIMIT))
    rows = cursor.fetchall()
    return pd.DataFrame(rows, columns=cursor.column_names)
 def create_sprite(config, df):
    """Assemble the thumbnails of ``df['pictures_id']`` into one sprite atlas.

    Thumbnails are placed row by row, left to right, in a grid of
    ``SPRITE_NB_LIGNE`` x ``SPRITE_NB_COLONNE`` cells, each
    ``TARGET_SIZE_HEIGHT`` x ``TARGET_SIZE_WIDTH``. Cells without a picture
    stay zero-filled. An empty ``df`` yields a fully blank atlas instead of
    raising ``IndexError``, and pictures beyond the grid capacity are ignored.

    Args:
        config: configuration object; ``config.get('directory')['collections']``
            is the directory holding the ``<pictures_id>.jpg`` files.
        df: DataFrame with a ``pictures_id`` column; row order defines the
            position of each thumbnail in the grid.

    Returns:
        Whatever ``Tools.display_one_picture`` returns for the atlas array
        (the caller saves it with ``.save(path, "PNG")``, so presumably a PIL
        image -- confirm in ``iss.tools``).
    """
    collections_dir = config.get('directory')['collections']
    thumbnail_size = (TARGET_SIZE_HEIGHT, TARGET_SIZE_WIDTH)
    capacity = SPRITE_NB_LIGNE * SPRITE_NB_COLONNE
    sprite = np.zeros((TARGET_SIZE_HEIGHT*SPRITE_NB_LIGNE, TARGET_SIZE_WIDTH*SPRITE_NB_COLONNE, 3))

    # Pictures are read one at a time so the whole collection never has to
    # sit in memory next to the atlas.
    for index, picture_id in enumerate(df['pictures_id']):
        if index >= capacity:
            break
        row, column = divmod(index, SPRITE_NB_COLONNE)
        top = row * TARGET_SIZE_HEIGHT
        left = column * TARGET_SIZE_WIDTH
        picture_path = os.path.join(collections_dir, "%s.jpg" % picture_id)
        # assumes read_np_picture returns a (height, width, 3) array -- TODO confirm
        sprite[top:top + TARGET_SIZE_HEIGHT, left:left + TARGET_SIZE_WIDTH, :] = Tools.read_np_picture(picture_path, target_size = thumbnail_size)

    return Tools.display_one_picture(sprite)
 def generate_facets(config, df):
    """Build the facets-overview HTML snippet for ``df``.

    Feature statistics are computed for ``df`` under the dataset name
    ``facets-iss``, serialized, base64-encoded and injected into the
    ``protoInput`` property of a ``<facets-overview>`` element.

    Args:
        config: unused here; kept so the signature matches the other helpers
            of this module.
        df: DataFrame to summarize.

    Returns:
        The HTML markup as a string.
    """
    page_template = """
            <script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
            <link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html" >
            <facets-overview id="elem"></facets-overview>
            <script>
            document.querySelector("#elem").protoInput = "{protostr}";
            </script>"""
    datasets = [{'name': 'facets-iss', 'table': df}]
    statistics = GenericFeatureStatisticsGenerator().ProtoFromDataFrames(datasets)
    # base64 output is plain ASCII, so it can sit inside the JS string literal
    encoded_statistics = base64.b64encode(statistics.SerializeToString()).decode("utf-8")
    return page_template.format(protostr=encoded_statistics)
 def generate_facets_dive(config, df, relative_sprite_path):
    jsonstr = df.to_json(orient = 'records')
    HTML_TEMPLATE = """
            <script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
            <link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html">
            <facets-dive id="elem" height="600" cross-origin="anonymous" sprite-image-width="{sprite_width}" sprite-image-height="{sprite_height}">
            </facets-dive>
            <script>
            var data = {jsonstr};
            var atlas_url = "{atlas_url}";
            document.querySelector("#elem").data = data;
            document.querySelector("#elem").atlasUrl = atlas_url;
            </script>"""
    html = HTML_TEMPLATE.format(jsonstr=jsonstr, atlas_url = relative_sprite_path, sprite_width=TARGET_SIZE_WIDTH, sprite_height=TARGET_SIZE_HEIGHT)
    return html
 def main():
    """Generate the facets reports under the configured reports directory.

    Steps: query the embeddings, build the sprite atlas and save it as
    ``figures/sprite_altas.png``, then write ``facets.html`` (facets-overview)
    and ``facets-dive.html`` (facets-dive, pointing at the atlas through a
    path relative to the reports directory).
    """
    reports_dir = CONFIG.get('directory')['reports']
    figures_dir = os.path.join(reports_dir, 'figures')
    # NOTE: the 'altas' spelling is kept so the file name stays the same as
    # in previously generated reports.
    sprite_filename = 'sprite_altas.png'

    ## db manager
    db_manager = Tools.create_db_manager(CONFIG)

    ## request data
    df = request_data(CONFIG, db_manager)

    ## create sprite
    sprite = create_sprite(CONFIG, df)

    ## save sprite
    # The figures directory may not exist yet on a fresh checkout.
    os.makedirs(figures_dir, exist_ok=True)
    sprite.save(os.path.join(figures_dir, sprite_filename), "PNG")

    ## generate facets
    html_facets = generate_facets(CONFIG, df)
    with open(os.path.join(reports_dir, 'facets.html'), 'w') as f:
        f.write(html_facets)

    ## generate facets-dive
    # The atlas URL is relative to facets-dive.html, which lives in reports_dir.
    html_facets_dive = generate_facets_dive(CONFIG, df, './figures/' + sprite_filename)
    with open(os.path.join(reports_dir, 'facets-dive.html'), 'w') as f:
        f.write(html_facets_dive)
 # Run the report generation when the module is executed directly.
 if __name__ == '__main__':
    main()