config.json

{
    // Model to use; the documented options are "spiraconv_v1" and "spiraconv_v2".
    "model_name":"spiraconv_v2",
    // Random seed. NOTE(review): "train_config" also defines a "seed" key with the same value; confirm which one the code reads and keep the two in sync.
    "seed":42,
    "dataset":{
        // Overlapping-window technique: the window is shifted by "step" seconds at a time over the total audio duration, and one instance is extracted at each position.
        "split_wav_using_overlapping": true, // if true, use the overlapping-window technique described above
        "window_len": 4, // window length in seconds for the overlapping technique
        "step": 1, // step length in seconds for the overlapping technique
        "padding_with_max_lenght": false,// if true, the timestep axis is padded to the length of the longest sequence; otherwise the timestep is dynamic and the network predicts an output vector whose size follows the timestep (it can shrink if you use convolutions)
        "max_seq_len": null, // set automatically by the data loader and saved in train_config['logs_path']/config.json. It is the maximum time-dimension value in your training dataset. To convert this value to seconds use: value*hop_length/sample_rate

        // training dataset
        "train_csv":"../SPIRA_Dataset_V2/metadata_train.csv", // CSV format: path_wav, class
        "train_data_root_path": "../SPIRA_Dataset_V2/", // root path that complements the paths listed in the CSV
        // evaluation dataset
        "eval_csv":"../SPIRA_Dataset_V2/metadata_eval.csv", // CSV format: path_wav, class
        "eval_data_root_path": "../SPIRA_Dataset_V2/", // root path that complements the paths listed in the CSV
        // test dataset
        "test_csv":"../SPIRA_Dataset_V2/metadata_test.csv", // CSV format: path_wav, class
        "test_data_root_path": "../SPIRA_Dataset_V2/", // root path that complements the paths listed in the CSV
        // noise dataset used for data augmentation
        "noise_csv":"../Ruidos-Hospitalares/noise_data.csv",// CSV format: path_wav
        "noise_data_root_path":"../Ruidos-Hospitalares/", // root path that complements the paths listed in the CSV
        // class label definitions (values of the "class" column)
        "control_class": 0,
        "patient_class": 1
    },
    // Model hyperparameters.
    // NOTE(review): presumably the output sizes of the first and second fully connected layers; confirm against the model code.
    "model":{
        "fc1_dim":100,
        "fc2_dim":1
    },
    "train_config": {
        "early_stop_epochs": 40, // if 0, early stopping is disabled; otherwise training is stopped when the model has not improved after "early_stop_epochs" epochs
        "lr_decay": true, // enable/disable the Noam learning rate decay scheme
        "warmup_steps": 500, // default 4000; number of Noam warm-up steps over which the learning rate increases from 0 to "learning_rate"
        "epochs": 1000,
        "learning_rate": 1e-3, // initial learning rate; if Noam decay is active, this is the maximum learning rate
        "weight_decay": 0.01,  // weight decay rate for the optimizer
        "optimizer":"adam",
        "batch_size": 30,
        // NOTE(review): the top level of this file also defines "seed" with the same value; confirm which one the code reads.
        "seed": 42,
        "num_workers": 14,
        // The "max_seq_len" comment in "dataset" states that the resolved config.json is saved under this path.
        "logs_path": "../checkpoints/spira-chekpoints/",
        "reinit_layers": null,
        // NOTE(review): units of the two intervals below (steps vs. epochs) are not visible here; confirm against the training loop.
        "summary_interval": 10,
        "checkpoint_interval": 500
    },
    // Data augmentation settings (noise insertion).
    "data_aumentation":{
        "insert_noise": true, // enable/disable noise insertion
        "num_noise_control": 4, // number of different noise samples applied to control files
        "num_noise_patient": 3, // number of different noise samples applied to patient files; we recommend num_noise_control - 1
        // You need to set the two values below for your own data: they are the maximum and minimum noise amplitudes observed in patient recordings.
        // Cut the noise-only segments out of several patient files and compute the maximum amplitude of each segment;
        // "noise_max_amp" is the largest of those maxima and "noise_min_amp" is the smallest of those maxima.
        // An example script to get these values is available at: get_max_amp_using_noise_samples.py
        "noise_max_amp": 0.19233719,
        "noise_min_amp": 0.033474047
    },
    // Data loader settings shared by evaluation and test.
    "test_config": {
        "batch_size": 10, // used for both evaluation and test
        "num_workers": 10 // used for both evaluation and test
    },
    "audio": {
        "feature": "hpcp", // one of: spectrogram, melspectrogram, mfcc, hpcp
        // wave loading parameters
        "sample_rate": 16000,
        "normalize": true, // if true, the output is divided by 1 << 31 (assumes signed 32-bit audio) and normalized to [-1, 1]. If a number, the output is divided by that number. If a callable, the output is passed to the given function and then divided by the result.

        // mel spectrogram parameters
        "num_mels": 40, // number of mel coefficients
        "mel_fmin": 0.0, // minimum frequency for the mel spectrogram. ~50 for male and ~95 for female voices. Tune for your dataset!
        "mel_fmax": null, // maximum frequency for the mel spectrogram. Tune for your dataset! A good value is 8000.0

        // mfcc parameters
        "num_mfcc": 40, // number of MFC coefficients to retain
        "log_mels": false, //  whether to use log-mel spectrograms instead of dB-scaled ones

        // hpcp parameters
        "f_min": 100,
        "f_max": 20000, // default: 5000. NOTE(review): if this is in Hz, 20000 is above the Nyquist frequency (sample_rate/2 = 8000) for the "sample_rate" set above; confirm this is intended
        "global_thr": 80,  // in dB below the highest peak
        "local_thr": 30,  // in dB below 0
        "bins_per_octave": 12,
        "filter_width": 0.33,  // in octaves
        "harmonic_decay": 0.6,
        "harmonic_tolerance": 0.66,  // in semitones
        "final_thr": 0.0,
        //"num_hpcp": 120, # disabled: "bins_per_octave" is the more appropriate terminology

        // general config
        "n_fft": 1200,
        "num_freq": 601,// equals n_fft//2 + 1; update together with "n_fft"
        "hop_length": 160, // used as value*hop_length/sample_rate to convert frame counts to seconds (see "max_seq_len")
        "win_length": 400 // NOTE(review): presumably in samples, like "hop_length"; confirm
    }
}