Skip to content

Commit

Permalink
initial v2 fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
Kobren committed Aug 13, 2024
1 parent 360afe9 commit 0bcff99
Show file tree
Hide file tree
Showing 15 changed files with 1,833 additions and 996 deletions.
85 changes: 63 additions & 22 deletions cfg.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,32 @@
"""
cfg
cfg.py
config file for ramediesDN
Before using the program, specify the directory in which the tool is located.
Directory name must end with "/ramedies". Variable script_directory
Configuration file for RaMeDiES algorithms
"""

# Change this variable to the directory in which the program is located
script_directory = "/home/sk758/gitrepos/RaMeDiES"
import os

# Automatically extract current directory (where this script is)
script_directory = os.path.dirname(os.path.realpath(__file__))
script_directory = '/'.join(script_directory.split('/')[:-1])

# Paths to files containing mutation rate-score distributions
# Change the values in this dictionary to incorporate custom annotations.
# See the manual for details.
variant_scores_files = {'C': {'S': f"{script_directory}/data/score_lists_CS.txt.gz",
'I': f"{script_directory}/data/score_lists_CI.txt.gz"},
'I': {'S': f"{script_directory}/data/score_lists_IS.txt.gz",
'I': f"{script_directory}/data/score_lists_II.txt.gz"}}
# See the GitHub wiki for details.

variant_scores_files = {'C': {'S': {"CADD": f"{script_directory}/data/CADD_CS.txt.gz",
"AlphaMissense": f"{script_directory}/data/AlphaMissense_MS.txt.gz",
"REVEL": f"{script_directory}/data/REVEL_MS.txt.gz",
"PAI3D": f"{script_directory}/data/PAI3D_MS.txt.gz"},
'I': f"{script_directory}/data/CADD_CI.txt.gz"},
'I': {'S': f"{script_directory}/data/SpliceAI_IS.txt.gz",
'I': f"{script_directory}/data/SpliceAI_II.txt.gz"}}

# Path to pseudogene/RNA gene/overlapping gene list
pseudogenes = f"{script_directory}/data/pseudogenes.txt.gz"

# Path to the table containing s_het values
shet_table = f"{script_directory}/data/shet_table.txt.gz"
# Path to the table containing GeneBayes, s_het, and other gene constraint values
gene_score_table = f"{script_directory}/data/gene_constraint_scores.txt.gz"

# Path to the table with ENSEMBL ID-Gene ID matches
ens2gene = f"{script_directory}/data/ens2gene.txt.gz"
Expand All @@ -41,45 +45,75 @@
# For the incorporation of other variant types (such as regulatory region variants), you will need to precompute the
# variant score files and include them in the "variant_score_files" dictionary defined above.
VEP_cons_dict = {'5_prime_UTR_variant': 'U',
'5_prime_UTR': 'U',
'5PRIME_UTR': 'U',
'3_prime_UTR_variant': 'U',
'3_prime_UTR': 'U',
'3PRIME_UTR': 'U',
'upstream_gene_variant': 'U',
'upstream': 'U',
'downstream_gene_variant': 'U',
'downstream': 'U',
'DOWNSTREAM': 'U',
'regulatory': 'U',
'UPSTREAM': 'U',
'intron_variant': 'I',
'intron': 'I',
'INTRONIC': 'I',
'splice_acceptor_variant': 'I',
'splice_acceptor': 'I',
'splice_donor_variant': 'I',
'splice_donor': 'I',
'splice_donor_region_variant': 'I',
'splice_region_variant': 'I',
'splice_donor_5th_base_variant': 'I',
'splice_donor_5th_base': 'I',
'splice_polypyrimidine_tract_variant': 'I',
'splice_polypyrimidine_tract': 'I',
'splice': 'I',
'CANONICAL_SPLICE': 'I',
'SPLICE_SITE': 'I',
'synonymous_variant': 'S',
'synonymous': 'S',
'stop_retained_variant': 'C',
'stop_lost': 'C',
'stop_gained': 'C',
'start_lost': 'C',
'start_retained_variant': 'C',
'missense_variant': 'C',
'inframe_deletion': 'C',
'inframe_insertion': 'C',
'frameshift_variant': 'C',
'protein_altering_variant': 'C',
'STOP_GAINED': 'C',
'start_lost': 'M',
'STOP_LOST': 'C',
'start_retained_variant': 'M',
'missense_variant': 'M',
'missense': 'M',
'inframe_deletion': 'M',
'inframe_insertion': 'M',
'frameshift_variant': 'M',
'FRAME_SHIFT': 'M',
'INFRAME': 'M',
'protein_altering_variant': 'M',
'incomplete_terminal_codon_variant': 'C',
'coding_sequence_variant': 'C',
'coding_transcript_variant': 'C',
'NON_SYNONYMOUS': 'C',
'NMD': 'C',
'regulatory_region': 'O',
'transcript_ablation': 'O',
'transcript_amplification': 'O',
'feature_elongation': 'O',
'feature_truncation': 'O',
'mature_miRNA_variant': 'O',
'non_coding_transcript_variant': 'O',
'non_coding_exon': 'O',
'non_coding': 'O',
'TFBS_ablation': 'O',
'TFBS_amplification': 'O',
'TF_binding_site_variant': 'O',
'regulatory_region_ablation': 'O',
'regulatory_region_amplification': 'O',
'regulatory_region_variant': 'O',
'intergenic_variant': 'O',
'intergenic': 'O',
'sequence_variant': 'O',
'NONCODING_CHANGE': 'O',
'': None}

# Dictionary specifying inheritance pattern keywords
Expand All @@ -100,7 +134,7 @@
"SAI_AL": "SpliceAI_acceptor-loss-score",
"SAI_DG": "SpliceAI_donor-gain-score",
"SAI_DL": "SpliceAI_donor-loss-score",
"CADD": "CADD-raw", # CADD score of a mutation
"coding_score": "CADD-raw", # CADD score of a mutation
"ensembl_gene_id": "ensembl_gene_id", # ENSEMBL gene ID
"MAF": "MAF",
"inherited_from": "inherited_from", # Parent annotation
Expand All @@ -119,6 +153,13 @@
# 2nd letter: 'S' for SNP, 'I' for indel
var_annot_list = [('C', 'S'), ('C', 'I'), ('I', 'S'), ('I', 'I')]

# Identifiers used in thresholding of scores of input variants
# see parse_variant_lib/get_score_val function
var_type_to_var_str = {('C', 'S'): 'C',
('C', 'I'): 'CInd',
('I', 'S'): 'I',
('I', 'I'): 'I'}

# output file suffix: total number of DENOVO variants observed across cohort for each of 4 variant types
varcount_sums_DN = "denovo_variant_counts"

Expand Down
Binary file added data/AlphaMissense_MS.txt.gz
Binary file not shown.
File renamed without changes.
File renamed without changes.
Binary file added data/PAI3D_MS.txt.gz
Binary file not shown.
Binary file added data/REVEL_MS.txt.gz
Binary file not shown.
File renamed without changes.
File renamed without changes.
Binary file added data/gene_constraint_scores.txt.gz
Binary file not shown.
80 changes: 61 additions & 19 deletions init_objs_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,18 +32,15 @@ def init_varcount_dict():
# Function that parses a precomputed mutational target file for CADD or SpliceAI score bins
# gene_instances: variant annotation -> ENSEMBL ID -> Gene instance (as specified in stat_lib)
def parse_variant_scores_file(variant_annot,
var_score_file_name,
Gene_inst_dict,
score_thr):

# total_mu is a normalizing value for mutational targets
# After the parsing, all mutational targets are normalized to sum up to unit
total_mu = 0
# Input files are specified in cfg.py
# variant_annot[0]: 'C'/'I' for coding or intronic variant types
# variant_annot[1]: 'S'/'I', for SNP or indel variant types
infile = cfg.variant_scores_files[variant_annot[0]][variant_annot[1]]
with gzip.open(infile, 'rt') as inh:
print(f"Parsing per-gene score-mutation rate distributions from: {infile}")
with gzip.open(var_score_file_name, 'rt') as inh:
print(f"Parsing per-gene score-mutation rate distributions from: {var_score_file_name}")
current_ID = None
for inh_str in inh:
if inh_str.startswith('#'):
Expand Down Expand Up @@ -77,10 +74,11 @@ def parse_variant_scores_file(variant_annot,
current_mu_list.append(eval(inh_str[1]))

# Processing of the final gene
Gene_inst_dict[variant_annot][current_ID] = sl.Gene(current_score_list,
current_mu_list)
if current_score_list:
Gene_inst_dict[variant_annot][current_ID] = sl.Gene(current_score_list,
current_mu_list)

total_mu += Gene_inst_dict[variant_annot][current_ID].mut_targ_arr[0]
total_mu += Gene_inst_dict[variant_annot][current_ID].mut_targ_arr[0]

# Normalization of mutational targets
for ENS_ID, Gene_obj in Gene_inst_dict[variant_annot].items():
Expand All @@ -93,6 +91,7 @@ def parse_variant_scores_file(variant_annot,
# total_mu_dict: variant annotation -> total mutational target of that annotation
def parse_variant_scores_files(score_thr_dict,
consequence_list,
coding_score,
suppress_indels):
# Initialization of gene_instances
# gene_instances:
Expand All @@ -107,12 +106,27 @@ def parse_variant_scores_files(score_thr_dict,
if suppress_indels and variant_annot[1] == 'I':
continue

score_thr = score_thr_dict[variant_annot[0]]
# Separate processing of Coding InDels
if variant_annot == ('C', 'I'):
score_thr = score_thr_dict["CInd"]
else:
score_thr = score_thr_dict[variant_annot[0]]

# Input files are specified in cfg.py
# variant_annot[0]: 'C'/'I' for coding or intronic variant types
# variant_annot[1]: 'S'/'I', for SNP or indel variant types
# coding_score: name of the score for coding SNPs provided under
# --coding_score input parameter
if variant_annot == ('C', 'S'):
var_score_file_name = cfg.variant_scores_files['C']['S'][coding_score]
else:
var_score_file_name = cfg.variant_scores_files[variant_annot[0]][variant_annot[1]]
# A lot of code in this tool has the same structure:
# Function that parses a list of files ->
# function parsing a single file ->
# (sometimes) some functions processing a single line
parse_variant_scores_file(variant_annot,
parse_variant_scores_file(variant_annot,
var_score_file_name,
Gene_inst_dict,
score_thr)

Expand Down Expand Up @@ -153,16 +167,44 @@ def make_ENS2GeneID_dict(forward=True):
# Loads s_het bin enrichment values from a file specified in cfg.py
# Enrichment values are the proportions of exclusively dominant OMIM genes
# in s_het deciles
def load_s_het_bins():
shet_dict = {}
with gzip.open(cfg.shet_table, 'rt') as inh:
def load_gene_score_bins(scoreID = "s_het_R"):
gene_score_dict = {}
with gzip.open(cfg.gene_score_table, 'rt') as inh:
for s in inh:
if s.startswith('#'):
continue
s = s.strip().split()
shet_dict[s[2]] = eval(s[11])
# s_het_dict: ENSEMBL_ID -> s_het enrichment
return shet_dict
if s[0] == 'ensembl_gene_id':
if scoreID not in s:
raise AssertionError(f"ERROR: Gene score ID {scoreID} misspecified")
score_index = s.index(scoreID)
continue

gene_score_dict[s[0]] = eval(s[score_index])
# gene_score_dict: ENSEMBL_ID -> score bin enrichment for dominant disease-
# causing genes
return gene_score_dict

# Write parameters used in the run to an intermediate file
def write_run_info(file_obj, args_obj):
"""
:param file_obj: python file object corresponding to an intermediate output file
:args_obj: argparse arguments object with run specifications supplied by the user
"""
file_obj.write(f"### Information about RaMeDiES run ID {args_obj.o}\n")
file_obj.write("### Do not discard this header when sharing intermediate outputs\n")
file_obj.write(f"### VARIANT ANNOTATIONS: {args_obj.variant_annots}\n")
file_obj.write(f"### INDEL SUPPRESSION: {args_obj.suppress_indels}\n")
file_obj.write(f"### ONLY MISSENSE CODING SNPS: {args_obj.missense_run}\n")
file_obj.write(f"### MAF CUTOFF: {args_obj.MAF}\n")
if 'C' in args_obj.variant_annots:
file_obj.write(f"### CODING SNP SCORE: {args_obj.coding_score}\n")
file_obj.write(f"### CODING SNP SCORE THRESHOLD: {args_obj.C_thr}\n")
if 'I' in args_obj.variant_annots:
file_obj.write(f"### INTRONIC SCORE: SpliceAI\n")
file_obj.write(f"### INTRONIC SCORE THRESHOLD: {args_obj.SAI_thr}\n")
if not args_obj.suppress_indels:
file_obj.write(f"### CODING INDEL SCORE: CADD\n")
file_obj.write(f"### CODING INDEL SCORE THRESHOLD: {args_obj.CI_thr}\n")



# Written on 01.23.2024 by Mikhail Moldovan, HMS DBMI
Loading

0 comments on commit 0bcff99

Please sign in to comment.