# config.ini

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# +                              FILES AND DIRECTORIES SPECIFICATION                                     |
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Locations of the directories to be created and of the input data files.
#  - HLA-Vec_embedding : directory where the learned distributed vector representation object
#                        is to be saved.
#  - HLA-CNN_models : directory where HLA-CNN models are to be saved.
#  - results : directory where the predicted peptide scores are to be saved in "inference" mode.
#  - train_set : file of the training data set.
#  - test_set : file of the test data set. Only one test_set entry is active at a time; the
#               commented-out entries below are alternative test files. To switch, comment out
#               the active line and uncomment exactly one alternative.
# NOTE(review): all paths are relative -- presumably resolved against the working directory of
# the consuming script; confirm against the loader.
[FilesDirectories]
HLA-Vec_embedding = HLA-Vec_embedding
HLA-CNN_models = HLA-CNN_models
results = prediction_results
train_set = train_test_data/train_data/proteins.txt
#test_set = train_test_data/test_data/A0201_9mer_1028924
test_set = train_test_data/test_data/B2705
#test_set = train_test_data/test_data/B5701
#test_set = train_test_data/test_data/A0201_9mer_1028928
#test_set = train_test_data/test_data/B0702
#test_set = train_test_data/test_data/B2703
#test_set = train_test_data/test_data/A0201_9mer_1028790
#test_set = train_test_data/test_data/A0201_10mer_1028790
#test_set = train_test_data/test_data/A0202
#test_set = train_test_data/test_data/A0203_9mer
#test_set = train_test_data/test_data/A0203_10mer
#test_set = train_test_data/test_data/A0206_9mer
#test_set = train_test_data/test_data/A0206_10mer
#test_set = train_test_data/test_data/A6802_9mer
#test_set = train_test_data/test_data/A6802_10mer

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# +                         HLA-VEC: DISTRIBUTED REPRESENTATION SPECIFICATION                              |
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# List of parameters used by the embedding model (skip-gram or CBOW, see sg_model) to learn a
# distributed representation, HLA-Vec, for amino acids.
#  - vec_dim : dimension of the vector representation.
#  - min_count : ignore all amino acids with total counts lower than this threshold.
#  - window_size : maximum distance between the current and an adjacent amino acid within a
#                  peptide sequence.
#  - sg_model : selects the training algorithm. True = skip-gram is used; False = CBOW is used.
#  - iter : number of iterations (epochs) over the corpus.
[HLA-Vec]
vec_dim = 15
min_count = 2
window_size = 5
sg_model = True
iter = 10


# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# +                                      HLA-CNN SPECIFICATIONS                                             |
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# List of parameters used by the HLA-CNN model, implemented using the Keras API on top of the
# Theano backend. The optimizer is hard-coded to Adam.
#  - epochs : number of iterations over the training dataset.
#  - lr : learning rate for the optimizer.
#  - filter_size : the dimensionality of the output of the filter.
#  - filter_length : the length of the filter kernel.
#  - dropout : percentage of dropout. The value is written as a fraction (.25), presumably
#              meaning 25% -- confirm against the model code.
[HLA-CNN]
epochs = 100
lr = .004
filter_size = 32
filter_length = 7
dropout = .25


# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# +                                      PIPELINE SPECIFICATION                                            |
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Steps to perform in the learning pipeline. Each step is switched on or off with True / False.
#  - HLA_Vec : learns a distributed vector representation, HLA-Vec, with the Skip-gram model.
#  - train : trains the HLA-CNN classifier and saves the models to file.
#  - evaluate : makes predictions on 'test_set' and evaluates performance (SRCC and AUC).
#  - inference : makes predictions on 'test_set' and outputs the scores to file.
# NOTE(review): with HLA_Vec = False the later steps presumably rely on an embedding that was
# already saved in the HLA-Vec_embedding directory -- confirm against the pipeline code.
[Pipeline]
HLA_Vec = False
train = True
evaluate = True
inference = True