Browse Source

clean facets code + doc + config template

master
Francois Vieille 3 months ago
parent
commit
beb5b5107e
7 changed files with 348 additions and 13 deletions
  1. +4
    -1
      Makefile
  2. +25
    -1
      README.md
  3. +172
    -11
      config/config.template.yaml
  4. BIN
      data/facets_dive_0.png
  5. BIN
      data/facets_dive_1.png
  6. BIN
      data/facets_dive_2.png
  7. +147
    -0
      iss/exec/facets.py

+ 4
- 1
Makefile View File

@@ -45,7 +45,7 @@ debug:

## Write config template
config_template:
$(PYTHON_INTERPRETER) iss/tools/config_template.py
$(PYTHON_INTERPRETER) -m iss.tools.config_template

## start docker
docker_start:
@@ -80,6 +80,9 @@ training:
exec_clustering:
$(PYTHON_INTERPRETER) -m iss.exec.clustering

## generate facets + facets-dive html reports
facets:
$(PYTHON_INTERPRETER) -m iss.exec.facets

posters:
$(PYTHON_INTERPRETER) -m iss.exec.posters --config-id=1 --generate=1 --poster-id='test'



+ 25
- 1
README.md View File

@@ -227,6 +227,10 @@ i use a special config file for floydhub so i provide a different `.env` file.

Training dashboard and dataset are public and available [here](https://www.floydhub.com/prise6/projects/smart-iss-posts/22)

```
make floyd_training_prod
```

I tested Google Colab and trained the final model with it, but the results are really similar to the floydhub model.

### Clustering
@@ -310,7 +314,27 @@ A bit messy.

#### Facets

*WIP*
Let's try [facets](https://pair-code.github.io/facets/) on this dataset! Thanks to the MySQL db, I can compare different clusterings and visualize them with facets-dive.

```
make facets
```

Two HTML pages are created in the directory `reports/`.

You can manipulate all your images:

![facets_dive_0](data/facets_dive_0.png)

Bin by cluster:

![facets_dive_1](data/facets_dive_1.png)

And zoom on it:

![facets_dive_2](data/facets_dive_2.png)

It's a bit messy because you cannot filter your data... but the sprite trick makes it fast!


### Posters


+ 172
- 11
config/config.template.yaml View File

@@ -1,12 +1,75 @@
clustering:
advanced:
PCA:
n_components: XXX
random_state: XXX
dbscan:
eps: XXX
min_samples: XXX
kmeans: XXX
save_directory: XXX
strong_kmeans:
high: XXX
iter: XXX
low: XXX
seed: XXX
threshold: XXX
version: XXX
classical:
CAH:
n_clusters: XXX
PCA:
n_components: XXX
random_state: XXX
TSNE:
n_components: XXX
kmeans:
n_clusters: XXX
random_state: XXX
model:
name: XXX
type: XXX
save_directory: XXX
version: XXX
dbscan:
dbscan:
min_cluster_size: XXX
min_samples: XXX
model:
name: XXX
type: XXX
save_directory: XXX
umap:
metric: XXX
min_dist: XXX
n_components: XXX
n_neighbors: XXX
random_state: XXX
version: XXX
n2d:
kmeans:
n_clusters: XXX
random_state: XXX
model:
name: XXX
type: XXX
save_directory: XXX
umap:
metric: XXX
min_dist: XXX
n_components: XXX
n_neighbors: XXX
random_state: XXX
version: XXX
directory:
autoencoder:
base: XXX
test: XXX
train: XXX
valid: XXX
collections: XXX
data_dir: XXX
isr_dir: XXX
project_dir: XXX
reports: XXX
models:
simple:
activation: XXX
batch_size: XXX
callbacks:
checkpoint:
@@ -23,6 +86,97 @@ models:
input_channel: XXX
input_height: XXX
input_width: XXX
latent_shape: XXX
learning_rate: XXX
model_name: XXX
sampling: XXX
save_directory: XXX
steps_per_epoch: XXX
use_multiprocessing: XXX
validation_freq: XXX
validation_steps: XXX
verbose: XXX
workers: XXX
simple_conv:
activation: XXX
batch_size: XXX
callbacks:
checkpoint:
directory: XXX
period: XXX
verbose: XXX
csv_logger:
append: XXX
directory: XXX
floyd: XXX
tensorboard:
limit_image: XXX
log_dir: XXX
epochs: XXX
initial_epoch: XXX
input_channel: XXX
input_height: XXX
input_width: XXX
latent_channel: XXX
latent_height: XXX
latent_width: XXX
learning_rate: XXX
model_name: XXX
sampling: XXX
save_directory: XXX
steps_per_epoch: XXX
use_multiprocessing: XXX
validation_freq: XXX
validation_steps: XXX
verbose: XXX
workers: XXX
variational:
activation: XXX
batch_size: XXX
callbacks:
checkpoint:
directory: XXX
period: XXX
verbose: XXX
csv_logger:
append: XXX
directory: XXX
display_picture:
epoch_laps: XXX
epochs: XXX
initial_epoch: XXX
input_channel: XXX
input_height: XXX
input_width: XXX
latent_shape: XXX
learning_rate: XXX
model_name: XXX
save_directory: XXX
steps_per_epoch: XXX
use_multiprocessing: XXX
validation_freq: XXX
validation_steps: XXX
verbose: XXX
workers: XXX
variational_conv:
activation: XXX
batch_size: XXX
callbacks:
checkpoint:
directory: XXX
period: XXX
verbose: XXX
csv_logger:
append: XXX
directory: XXX
display_picture:
epoch_laps: XXX
epochs: XXX
initial_epoch: XXX
input_channel: XXX
input_height: XXX
input_width: XXX
latent_shape: XXX
learning_rate: XXX
model_name: XXX
save_directory: XXX
@@ -39,10 +193,17 @@ mysql:
port: XXX
server: XXX
user: XXX
training:
proportions:
test: XXX
train: XXX
valid: XXX
seed: XXX
sampling:
autoencoder:
directory:
base: XXX
from: XXX
test: XXX
train: XXX
valid: XXX
proportions:
test: XXX
train: XXX
valid: XXX
seed: XXX
version: XXX

BIN
data/facets_dive_0.png View File

Before After
Width: 1472  |  Height: 696  |  Size: 1.7MB

BIN
data/facets_dive_1.png View File

Before After
Width: 1472  |  Height: 696  |  Size: 398KB

BIN
data/facets_dive_2.png View File

Before After
Width: 1472  |  Height: 696  |  Size: 590KB

+ 147
- 0
iss/exec/facets.py View File

@@ -0,0 +1,147 @@
import os
import base64
import pandas as pd
import numpy as np
from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator

from iss.init_config import CONFIG
from iss.tools import Tools


# Sprite atlas layout: 145 rows x 100 columns = 14500 thumbnail cells.
SPRITE_NB_LIGNE = 145
SPRITE_NB_COLONNE = 100
# Per-thumbnail cell size in pixels (96x54, a 16:9 aspect ratio).
TARGET_SIZE_WIDTH = 48*2
TARGET_SIZE_HEIGHT = 27*2
# Max rows fetched from the DB; kept just under the 14500-cell atlas capacity.
LIMIT = 14499

def request_data(config, db_manager):
    """Fetch one row per picture, joining three clustering versions plus location.

    Self-joins `pictures_embedding` so that clustering versions 1, 2 and 3
    (same clustering type / model type / model name) land on a single row,
    then LEFT JOINs location metadata (may be NULL).

    Args:
        config: unused here; kept for signature consistency with the other steps.
        db_manager: project DB manager exposing a MySQL-style `cursor`.

    Returns:
        pandas.DataFrame with columns named after the SELECT aliases,
        ordered by pictures_id, capped at LIMIT rows.
    """

    # NOTE: the v2 join previously compared v2.clustering_model_name with
    # itself (always true), which could multiply rows across model names.
    # Fixed to match v1, mirroring the v3 join below.
    sql = """
    SELECT
        v1.pictures_id,

        v1.pictures_x as v1_x,
        v1.pictures_y as v1_y,
        CAST(v1.label AS CHAR) as v1_label,

        v2.pictures_x as v2_x,
        v2.pictures_y as v2_y,
        CAST(v2.label AS CHAR) as v2_label,

        v3.pictures_x as v3_x,
        v3.pictures_y as v3_y,
        CAST(v3.label AS CHAR) as v3_label,

        loc.pictures_timestamp,
        loc.pictures_location_text,
        loc.pictures_latitude,
        loc.pictures_longitude

    FROM iss.pictures_embedding AS v1

    INNER JOIN iss.pictures_embedding v2
        ON v1.pictures_id = v2.pictures_id
        AND v2.clustering_type = v1.clustering_type
        AND v2.clustering_model_type = v1.clustering_model_type
        AND v2.clustering_model_name = v1.clustering_model_name
        AND v2.clustering_version = 2

    INNER JOIN iss.pictures_embedding v3
        ON v1.pictures_id = v3.pictures_id
        AND v3.clustering_type = v1.clustering_type
        AND v3.clustering_model_type = v1.clustering_model_type
        AND v3.clustering_model_name = v1.clustering_model_name
        AND v3.clustering_version = 3

    LEFT JOIN iss.pictures_location loc
        ON loc.pictures_id = v1.pictures_id

    WHERE v1.clustering_version = %s
    ORDER BY pictures_id ASC LIMIT %s"""

    # Parameterized query: version 1 anchors the self-join; LIMIT caps the
    # result so it fits inside the sprite atlas (see module constants).
    db_manager.cursor.execute(sql, (1, LIMIT))
    results = db_manager.cursor.fetchall()
    return pd.DataFrame(results, columns=db_manager.cursor.column_names)


def create_sprite(config, df):
    """Assemble every picture thumbnail into one sprite-atlas image.

    Reads each `<pictures_id>.jpg` from the collections directory at the
    TARGET_SIZE cell resolution and tiles them row-major into a
    SPRITE_NB_LIGNE x SPRITE_NB_COLONNE grid. Cells beyond the last image
    stay black (zeros); images beyond the grid capacity are ignored.

    Returns the atlas as produced by Tools.display_one_picture.
    """

    collections_dir = config.get('directory')['collections']
    thumbnails = [
        Tools.read_np_picture(
            os.path.join(collections_dir, "%s.jpg" % picture_id),
            target_size=(TARGET_SIZE_HEIGHT, TARGET_SIZE_WIDTH),
        )
        for picture_id in df['pictures_id']
    ]

    atlas = np.zeros((
        TARGET_SIZE_HEIGHT * SPRITE_NB_LIGNE,
        TARGET_SIZE_WIDTH * SPRITE_NB_COLONNE,
        3,
    ))

    # Row-major placement: cell index -> (row, col) via divmod.
    for cell, thumbnail in enumerate(thumbnails):
        row, col = divmod(cell, SPRITE_NB_COLONNE)
        if row >= SPRITE_NB_LIGNE:
            break
        top = row * TARGET_SIZE_HEIGHT
        left = col * TARGET_SIZE_WIDTH
        atlas[top:top + TARGET_SIZE_HEIGHT, left:left + TARGET_SIZE_WIDTH, :] = thumbnail

    return Tools.display_one_picture(atlas)


def generate_facets(config, df):
    """Build a standalone facets-overview HTML page for the dataframe.

    Serializes the dataframe's feature statistics to a base64-encoded proto
    and embeds it into the facets-overview web component template.

    Returns the HTML document as a string.
    """

    stats_proto = GenericFeatureStatisticsGenerator().ProtoFromDataFrames(
        [{'name': 'facets-iss', 'table': df}]
    )
    encoded_proto = base64.b64encode(stats_proto.SerializeToString()).decode("utf-8")

    template = """
<script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
<link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html" >
<facets-overview id="elem"></facets-overview>
<script>
document.querySelector("#elem").protoInput = "{protostr}";
</script>"""

    return template.format(protostr=encoded_proto)

def generate_facets_dive(config, df, relative_sprite_path, sprite_width=None, sprite_height=None):
    """Build a standalone facets-dive HTML page for the dataframe.

    The dataframe is serialized to JSON records and embedded in the
    facets-dive web component template, together with the sprite atlas URL
    so each data point renders as its image thumbnail.

    Args:
        config: unused here; kept for signature consistency with generate_facets.
        df: pandas DataFrame, one row per picture.
        relative_sprite_path: atlas URL relative to the emitted HTML file.
        sprite_width: per-thumbnail cell width in the atlas; defaults to
            the module-level TARGET_SIZE_WIDTH.
        sprite_height: per-thumbnail cell height; defaults to TARGET_SIZE_HEIGHT.

    Returns:
        The HTML document as a string.
    """

    # Fall back to the module constants so existing callers are unchanged,
    # while allowing other atlas geometries.
    if sprite_width is None:
        sprite_width = TARGET_SIZE_WIDTH
    if sprite_height is None:
        sprite_height = TARGET_SIZE_HEIGHT

    jsonstr = df.to_json(orient='records')
    HTML_TEMPLATE = """
<script src="https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js"></script>
<link rel="import" href="https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html">
<facets-dive id="elem" height="600" cross-origin="anonymous" sprite-image-width="{sprite_width}" sprite-image-height="{sprite_height}">
</facets-dive>
<script>
var data = {jsonstr};
var atlas_url = "{atlas_url}";
document.querySelector("#elem").data = data;
document.querySelector("#elem").atlasUrl = atlas_url;
</script>"""
    html = HTML_TEMPLATE.format(jsonstr=jsonstr, atlas_url=relative_sprite_path,
                                sprite_width=sprite_width, sprite_height=sprite_height)
    return html


def main():
    """Entry point: export facets-overview and facets-dive HTML reports.

    Pipeline: fetch clustering rows from MySQL, build the sprite atlas from
    the picture thumbnails, save it under reports/figures/, then write
    reports/facets.html and reports/facets-dive.html referencing it.
    """

    ## db manager
    db_manager = Tools.create_db_manager(CONFIG)

    ## request data
    df = request_data(CONFIG, db_manager)

    ## create sprite
    sprite = create_sprite(CONFIG, df)

    ## save sprite (fixed filename typo: "altas" -> "atlas"; the atlasUrl
    ## passed to facets-dive below must match this file name)
    reports_dir = CONFIG.get('directory')['reports']
    sprite.save(os.path.join(reports_dir, 'figures', 'sprite_atlas.png'), "PNG")

    ## generate facets overview page
    html_facets = generate_facets(CONFIG, df)
    with open(os.path.join(reports_dir, 'facets.html'), 'w') as f:
        f.write(html_facets)

    ## generate facets-dive page (atlas path is relative to reports/)
    html_facets_dive = generate_facets_dive(CONFIG, df, './figures/sprite_atlas.png')
    with open(os.path.join(reports_dir, 'facets-dive.html'), 'w') as f:
        f.write(html_facets_dive)


if __name__ == '__main__':
    main()

Loading…
Cancel
Save