Snakemake
Example
This example runs validation to select the best hyperparameters.
from itertools import chain, combinations, product
# Base paths and information.
# Every path below is derived from BASE_DIR, which already ends in '/'.
BASE_DIR = '/home/share/hwkim/dti/'
VALIDATION_OUTPUT_DIR = '{}results/validation/'.format(BASE_DIR)
OUTPUT_DIR = '{}results/performance/'.format(BASE_DIR)
# Snakemake directive: use BASE_DIR as the workflow's working directory.
workdir: BASE_DIR

# Training file paths (DTI pairs, compounds, proteins).
TRAINING_DIR = '{}data/training/'.format(BASE_DIR)
TRAINING_DTIS, TRAINING_DRUGS, TRAINING_PROTS = (
    TRAINING_DIR + file_name
    for file_name in ('merged_dti.csv', 'merged_compound.csv', 'merged_protein.csv')
)

# Validation file paths (same three-file layout as the training set).
VALIDATION_NAMES = 'MATADOR'
VALIDATION_DIR = '{}data/validation/'.format(BASE_DIR)
VALIDATION_DTIS, VALIDATION_DRUGS, VALIDATION_PROTS = (
    VALIDATION_DIR + file_name
    for file_name in ('validation_dti.csv', 'validation_compound.csv', 'validation_protein.csv')
)
# Set hyperparameters
# Per-fingerprint drug settings, keyed by the name given in config['drug_type'].
# Every entry has the same three string-valued fields:
#   'drug_type'   - the fingerprint name (identical to the outer key)
#   'drug_length' - the value later passed as the drug length
#   'drug_layer'  - candidate layer sizes; non-empty subsets of this list are
#                   enumerated when DRUG_LAYER is built
# The inner key is 'drug_type' for every entry (previously all but 'maccs'
# carried a stray '=' in the key name).
DRUG_PARAMS = {
    'maccs': {
        'drug_type': 'maccs',
        'drug_length': '167',
        'drug_layer': ['128', '64', '32', '16'],
    },
    'pubchem': {
        'drug_type': 'pubchem',
        'drug_length': '881',
        'drug_layer': ['512', '256', '128', '64', '32'],
    },
    'ecfp4': {
        'drug_type': 'ecfp4',
        'drug_length': '2048',
        'drug_layer': ['1024', '512', '256', '128', '64', '32'],
    },
    'ecfp6': {
        'drug_type': 'ecfp6',
        'drug_length': '2048',
        'drug_layer': ['1024', '512', '256', '128', '64', '32'],
    },
    'pharm2': {
        'drug_type': 'pharm2',
        'drug_length': '135',
        'drug_layer': ['128', '64', '32', '16'],
    },
    'mol2vec': {
        'drug_type': 'mol2vec',
        'drug_length': '300',
        'drug_layer': ['256', '128', '64', '32', '16'],
    },
}
# Candidate values for each hyperparameter. Every value is a string because
# it is substituted verbatim into the DTI_deep.py command line; a value that
# contains spaces (e.g. WINDOW_SIZE) expands to several arguments.
PROTEIN_TYPE = ['Convolution']
PROTEIN_LENGTH = ['2500']
WINDOW_SIZE = ['10 15 20 25 30']
HIDDEN_LAYER = ['128']
DRUG_TYPE = [config['drug_type']]
DRUG_LENGTH = [DRUG_PARAMS[config['drug_type']]['drug_length']]
# Every non-empty subset of the candidate layer sizes (original order kept),
# each rendered as one space-separated string such as '128 64 16'.
DRUG_LAYER = [
    ' '.join(layers)
    for layers in chain.from_iterable(
        combinations(DRUG_PARAMS[config['drug_type']]['drug_layer'], n)
        for n in range(1, len(DRUG_PARAMS[config['drug_type']]['drug_layer']) + 1)
    )
]
# FC width = hidden-layer width + width of the LAST drug layer.
# FC_LAYER is index-aligned with DRUG_LAYER. Each DRUG_LAYER item is a joined
# string, so split it before taking the last element; indexing the string
# directly would yield only its final character ('4' for '128 64').
# Only HIDDEN_LAYER[0] is used, so this assumes a single hidden-layer value.
FC_LAYER = [
    str(int(HIDDEN_LAYER[0]) + int(layers.split(' ')[-1]))
    for layers in DRUG_LAYER
]
EPOCH = ['50']
LEARNING_RATE = ['0.0001']
DECAY = ['0.0001']
ACTIVATION = ['elu']
DROPOUT = ['0.00']
THRESHHOLD = ['0.2']
VALIDATION_NAME = ['MATADOR']
# Set parameter set dictionary
PARAM_KEYS = [
    'protein_type', 'protein_length', 'window_size', 'hidden_layer',
    'drug_type', 'drug_length', 'drug_layer', 'fc_layer',
    'epoch', 'learning_rate', 'decay', 'activation', 'dropout', 'threshold', 'validation_name'
]
# Maps '<drug_type>_<layer sizes joined by _>' to a {param name: value} dict.
# DRUG_LAYER and FC_LAYER are walked in lockstep (zip) rather than crossed in
# product(): each drug-layer choice has exactly one matching FC width, and a
# cross product would let the last FC_LAYER value overwrite all the others
# because the key does not include fc_layer.
# NOTE: the key encodes only drug_type and drug_layer, so giving any other
# list more than one value makes later combinations replace earlier ones.
paramset_dic = {
    shared[4] + '_' + drug_layer.replace(' ', '_'): dict(zip(
        PARAM_KEYS,
        # Re-insert the paired values at the positions PARAM_KEYS expects.
        shared[:6] + (drug_layer, fc_layer) + shared[6:]
    ))
    for shared in product(
        PROTEIN_TYPE, PROTEIN_LENGTH, WINDOW_SIZE, HIDDEN_LAYER,
        DRUG_TYPE, DRUG_LENGTH,
        EPOCH, LEARNING_RATE, DECAY, ACTIVATION, DROPOUT, THRESHHOLD, VALIDATION_NAME
    )
    for drug_layer, fc_layer in zip(DRUG_LAYER, FC_LAYER)
}
# Rule for validation.
# Runs DTI_deep.py once per entry of paramset_dic, sequentially, writing one
# CSV per hyperparameter set into VALIDATION_OUTPUT_DIR. The rule declares no
# `output:`, so Snakemake does not track the CSVs it produces.
rule validation:
    input:
        training_dtis = TRAINING_DTIS,
        training_drugs = TRAINING_DRUGS,
        training_prots = TRAINING_PROTS,
        validation_dtis = VALIDATION_DTIS,
        validation_drugs = VALIDATION_DRUGS,
        validation_prots = VALIDATION_PROTS
    run:
        for paramset_key, param_dic in paramset_dic.items():
            # Named result_csv so Snakemake's own `output` object is not rebound.
            result_csv = VALIDATION_OUTPUT_DIR + paramset_key + '.csv'
            # Fragments are joined with explicit spaces so that argument
            # separation never depends on how this file is indented.
            # The {input.*} placeholders are left for shell() to fill in;
            # hyperparameters are substituted here via str.format.
            # NOTE(review): param_dic['threshold'] is not passed on the
            # command line - confirm whether DTI_deep.py needs it.
            shell(
                ' '.join([
                    'python DTI_deep.py',
                    '{input.training_dtis} {input.training_drugs} {input.training_prots}',
                    '-i {input.validation_dtis} -d {input.validation_drugs} -t {input.validation_prots}',
                    '-V {drug_type} -L {drug_length} -c {drug_layer}'.format(**param_dic),
                    '-v {protein_type} -l {protein_length} -w {window_size} -p {hidden_layer} -f {fc_layer} -F 128'.format(**param_dic),
                    '-e {epoch} -r {learning_rate} -y {decay} -a {activation} -b 64 -D {dropout}'.format(**param_dic),
                    '--validation -n {validation_name}'.format(**param_dic),
                    '-o ' + result_csv,
                    '-g ' + str(config['gpu_num']),
                ])
            )
출처