# Mirror of https://github.com/chidiwilliams/buzz.git
# Synced 2026-03-14 14:45:46 +01:00
# 304 lines, 6.2 KiB, YAML
# Defaults list: `_self_`, the dset/svd/variant selections, and colorlog
# overrides for hydra/hydra_logging and hydra/job_logging.
# NOTE(review): list structure reconstructed from a flattened copy — confirm.
defaults:
  - _self_
  - dset: musdb44
  - svd: default
  - variant: default
  - override hydra/hydra_logging: colorlog
  - override hydra/job_logging: colorlog

# Key with no value; written as an explicit null so it cannot be mistaken
# for the parent of the section that follows.
dummy: null
# Dataset settings: MusDB location, optional custom wav datasets, segmenting,
# audio format and source names.
# NOTE(review): indentation reconstructed from a flattened copy — confirm nesting.
dset:
  musdb: /checkpoint/defossez/datasets/musdbhq
  musdb_samplerate: 44100
  use_musdb: true  # set to false to not use musdb as training data.
  wav: null  # path to custom wav dataset
  wav2: null  # second custom wav dataset
  segment: 11
  shift: 1
  train_valid: false
  full_cv: true
  samplerate: 44100
  channels: 2
  normalize: true
  metadata: ./metadata
  sources: ['drums', 'bass', 'other', 'vocals']
  valid_samples: null  # valid dataset size
  backend: null  # if provided select torchaudio backend.

# Evaluation settings.
# NOTE(review): indentation reconstructed from a flattened copy — confirm nesting.
test:
  save: false
  best: true
  workers: 2
  every: 20
  split: true
  shifts: 1
  overlap: 0.25
  sdr: true
  # Metric used for best model selection on the valid set, can also be nsdr.
  metric: 'loss'
  nonhq: null  # path to non hq MusDB for evaluation

# Training length and batching.
# NOTE(review): assumed top-level; indentation was lost in this copy — confirm.
epochs: 360
batch_size: 64
# Limit the number of batches per epoch, useful for debugging
# or if your dataset is gigantic.
max_batches: null
# Optimizer and loss settings.
# NOTE(review): indentation reconstructed from a flattened copy — confirm nesting.
optim:
  # NOTE(review): `3e-4` has no decimal point; a plain YAML 1.1 loader such as
  # PyYAML's default resolves it as a string. Confirm the consuming loader
  # reads it as a float before relying on this file elsewhere.
  lr: 3e-4
  momentum: 0.9
  beta2: 0.999
  loss: l1  # l1 or mse
  optim: adam
  weight_decay: 0
  clip_grad: 0

# Miscellaneous run-level options.
# NOTE(review): assumed top-level; indentation was lost in this copy — confirm.
seed: 42
debug: false
valid_apply: true
flag: null
save_every: null
weights: [1., 1., 1., 1.]  # weights over each source for the training/valid loss.

# Data augmentation settings. `repitch`, `remix` and `scale` each carry their
# own `proba`, so they are separate sub-mappings.
# NOTE(review): indentation reconstructed from a flattened copy; `flip` is
# placed directly under `augment` rather than under `scale` — confirm.
augment:
  shift_same: false
  repitch:
    proba: 0.2
    max_tempo: 12
  remix:
    proba: 1
    group_size: 4
  scale:
    proba: 1
    min: 0.25
    max: 1.25
  flip: true

# Options for continuing from an existing or pretrained experiment (XP).
# NOTE(review): assumed top-level; indentation was lost in this copy — confirm.
continue_from: null  # continue from other XP, give the XP Dora signature.
continue_pretrained: null  # signature of a pretrained XP, this cannot be a bag of models.
pretrained_repo: null  # repo for pretrained model (default is official AWS)
continue_best: true
continue_opt: false

# Miscellaneous settings (worker count, printing, verbosity).
# NOTE(review): indentation reconstructed from a flattened copy — confirm nesting.
misc:
  num_workers: 10
  num_prints: 4
  show: false
  verbose: false

# List of decay for EMA at batch or epoch level, e.g. 0.999.
# Batch level EMA are kept on GPU for speed.
# Both lists are empty here.
# NOTE(review): indentation reconstructed from a flattened copy — confirm nesting.
ema:
  epoch: []
  batch: []

# Model selection. The per-model sections follow this block.
# NOTE(review): assumed top-level; indentation was lost in this copy — confirm.
use_train_segment: true  # to remove
# Override the segment parameter for the model, usually 4 times the training segment.
model_segment: null
# See demucs/train.py for the possibilities, and config for each model hereafter.
model: demucs
# Hyper-parameters for the `demucs` model.
# NOTE(review): indentation reconstructed from a flattened copy — confirm nesting.
demucs:  # see demucs/demucs.py for a detailed description
  # Channels
  channels: 64
  growth: 2
  # Main structure
  depth: 6
  rewrite: true
  lstm_layers: 0
  # Convolutions
  kernel_size: 8
  stride: 4
  context: 1
  # Activations
  gelu: true
  glu: true
  # Normalization
  norm_groups: 4
  norm_starts: 4
  # DConv residual branch
  dconv_depth: 2
  dconv_mode: 1  # 1 = branch in encoder, 2 = in decoder, 3 = in both.
  dconv_comp: 4
  dconv_attn: 4
  dconv_lstm: 4
  dconv_init: 1e-4
  # Pre/post treatment
  resample: true
  normalize: false
  # Weight init
  rescale: 0.1

# Hyper-parameters for the `hdemucs` model.
# NOTE(review): indentation reconstructed from a flattened copy — confirm nesting.
hdemucs:  # see demucs/hdemucs.py for a detailed description
  # Channels
  channels: 48
  channels_time: null
  growth: 2
  # STFT
  nfft: 4096
  wiener_iters: 0
  end_iters: 0
  wiener_residual: false
  cac: true
  # Main structure
  depth: 6
  rewrite: true
  hybrid: true
  hybrid_old: false
  # Frequency Branch
  multi_freqs: []
  multi_freqs_depth: 3
  freq_emb: 0.2
  emb_scale: 10
  emb_smooth: true
  # Convolutions
  kernel_size: 8
  stride: 4
  time_stride: 2
  context: 1
  context_enc: 0
  # normalization
  norm_starts: 4
  norm_groups: 4
  # DConv residual branch
  dconv_mode: 1
  dconv_depth: 2
  dconv_comp: 4
  dconv_attn: 4
  dconv_lstm: 4
  dconv_init: 1e-3
  # Weight init
  rescale: 0.1

# Torchaudio implementation of HDemucs
# NOTE(review): indentation reconstructed from a flattened copy — confirm nesting.
torch_hdemucs:
  # Channels
  channels: 48
  growth: 2
  # STFT
  nfft: 4096
  # Main structure
  depth: 6
  freq_emb: 0.2
  emb_scale: 10
  emb_smooth: true
  # Convolutions
  kernel_size: 8
  stride: 4
  time_stride: 2
  context: 1
  context_enc: 0
  # normalization
  norm_starts: 4
  norm_groups: 4
  # DConv residual branch
  dconv_depth: 2
  dconv_comp: 4
  dconv_attn: 4
  dconv_lstm: 4
  dconv_init: 1e-3

# Hyper-parameters for the `htdemucs` model, including its `t_*` transformer
# options.
# NOTE(review): indentation reconstructed from a flattened copy — confirm nesting.
htdemucs:  # see demucs/htdemucs.py for a detailed description
  # Channels
  channels: 48
  channels_time: null
  growth: 2
  # STFT
  nfft: 4096
  wiener_iters: 0
  end_iters: 0
  wiener_residual: false
  cac: true
  # Main structure
  depth: 4
  rewrite: true
  # Frequency Branch
  multi_freqs: []
  multi_freqs_depth: 3
  freq_emb: 0.2
  emb_scale: 10
  emb_smooth: true
  # Convolutions
  kernel_size: 8
  stride: 4
  time_stride: 2
  context: 1
  context_enc: 0
  # normalization
  norm_starts: 4
  norm_groups: 4
  # DConv residual branch
  dconv_mode: 1
  dconv_depth: 2
  dconv_comp: 8
  dconv_init: 1e-3
  # Before the Transformer
  bottom_channels: 0
  # CrossTransformer
  # ------ Common to all
  # Regular parameters
  t_layers: 5
  t_hidden_scale: 4.0
  t_heads: 8
  t_dropout: 0.0
  t_layer_scale: true
  t_gelu: true
  # ------------- Positional Embedding
  t_emb: sin
  t_max_positions: 10000  # for the scaled embedding
  t_max_period: 10000.0
  t_weight_pos_embed: 1.0
  t_cape_mean_normalize: true
  t_cape_augment: true
  t_cape_glob_loc_scale: [5000.0, 1.0, 1.4]
  t_sin_random_shift: 0
  # ------------- norm before a transformer encoder
  t_norm_in: true
  t_norm_in_group: false
  # ------------- norm inside the encoder
  t_group_norm: false
  t_norm_first: true
  t_norm_out: true
  # ------------- optim
  t_weight_decay: 0.0
  t_lr: null
  # ------------- sparsity
  t_sparse_self_attn: false
  t_sparse_cross_attn: false
  t_mask_type: diag
  t_mask_random_seed: 42
  t_sparse_attn_window: 400
  t_global_window: 100
  t_sparsity: 0.95
  t_auto_sparsity: false
  # Cross Encoder First (False)
  t_cross_first: false
  # Weight init
  rescale: 0.1

# SVD settings.
# NOTE(review): indentation reconstructed from a flattened copy — confirm nesting.
svd:  # see svd.py for documentation
  penalty: 0
  min_size: 0.1
  dim: 1
  niters: 2
  powm: false
  proba: 1
  conv_only: false
  convtr: false
  bs: 1

# NOTE(review): indentation reconstructed from a flattened copy — confirm nesting.
quant:  # quantization hyper params
  diffq: null  # diffq penalty, typically 1e-4 or 3e-4
  qat: null  # use QAT with a fixed number of bits (not as good as diffq)
  min_size: 0.2
  group_size: 8

# Dora settings: output directory and a list of excluded key patterns.
# NOTE(review): indentation reconstructed from a flattened copy — confirm nesting.
dora:
  dir: outputs
  exclude: ["misc.*", "slurm.*", 'test.reval', 'flag', 'dset.backend']

# Slurm settings: time, node constraint and a setup command list.
# NOTE(review): indentation reconstructed from a flattened copy; the unit of
# `time` is not stated in this file — confirm.
slurm:
  time: 4320
  constraint: volta32gb
  setup: ['module load cudnn/v8.4.1.50-cuda.11.6 NCCL/2.11.4-6-cuda.11.6 cuda/11.6']

# Hydra config
# Sets the date format of the `colorlog` formatter under job_logging.
# NOTE(review): indentation reconstructed from a flattened copy — confirm nesting.
hydra:
  job_logging:
    formatters:
      colorlog:
        datefmt: "%m-%d %H:%M:%S"