initial v2 fixes

hms-dbmi · Aug 13, 2024 · 0bcff99 · 0bcff99
1 parent 360afe9
commit 0bcff99
Show file tree

Hide file tree

Showing 15 changed files with 1,833 additions and 996 deletions.
diff --git a/cfg.py b/cfg.py
@@ -1,28 +1,32 @@
 """
-cfg
+cfg.py
 
-config file for ramediesDN
-
-Before using the program, specify the directory in which the tool is located.
-Directory name must end with "/ramedies". Variable script_directory
+Configuration file for RaMeDiES algorithms
 """
 
-# Change this variable to the directory in which the program is located
-script_directory = "/home/sk758/gitrepos/RaMeDiES"
+import os
+
+# Automatically extract current directory (where this script is)
+script_directory = os.path.dirname(os.path.realpath(__file__))
+script_directory = '/'.join(script_directory.split('/')[:-1])
 
 # Paths to files containing mutation rate-score distributions
 # Change the values in this dictionary to incorporate custom annotations.
-# See the manual for details.
-variant_scores_files = {'C': {'S': f"{script_directory}/data/score_lists_CS.txt.gz",
-                              'I': f"{script_directory}/data/score_lists_CI.txt.gz"},
-                        'I': {'S': f"{script_directory}/data/score_lists_IS.txt.gz",
-                              'I': f"{script_directory}/data/score_lists_II.txt.gz"}}
+# See the GitHub wiki for details.
+
+variant_scores_files = {'C': {'S': {"CADD": f"{script_directory}/data/CADD_CS.txt.gz",
+                                    "AlphaMissense": f"{script_directory}/data/AlphaMissense_MS.txt.gz",
+                                    "REVEL": f"{script_directory}/data/REVEL_MS.txt.gz",
+                                    "PAI3D": f"{script_directory}/data/PAI3D_MS.txt.gz"},
+                              'I': f"{script_directory}/data/CADD_CI.txt.gz"},
+                        'I': {'S': f"{script_directory}/data/SpliceAI_IS.txt.gz",
+                              'I': f"{script_directory}/data/SpliceAI_II.txt.gz"}}
 
 # Path to pseudogene/RNA gene/overlapping gene list
 pseudogenes = f"{script_directory}/data/pseudogenes.txt.gz"
 
-# Path to the table containing s_het values
-shet_table = f"{script_directory}/data/shet_table.txt.gz"
+# Path to the table containing GeneBayes, s_het, and other gene constraint values
+gene_score_table = f"{script_directory}/data/gene_constraint_scores.txt.gz"
 
 # Path to the table with ENSEMBL ID-Gene ID matches
 ens2gene = f"{script_directory}/data/ens2gene.txt.gz"
@@ -41,45 +45,75 @@
 # For the incorporation of other variant types (such as regulatory region variants), you will need to precompute the
 #  variant score files and include them in the "variant_score_files" dictionary defined above. 
 VEP_cons_dict = {'5_prime_UTR_variant': 'U',
+                 '5_prime_UTR': 'U',
+                 '5PRIME_UTR': 'U',
                  '3_prime_UTR_variant': 'U',
+                 '3_prime_UTR': 'U',
+                 '3PRIME_UTR': 'U',
                  'upstream_gene_variant': 'U',
+                 'upstream': 'U',
                  'downstream_gene_variant': 'U',
+                 'downstream': 'U',
+                 'DOWNSTREAM': 'U',
+                 'regulatory': 'U',
+                 'UPSTREAM': 'U',
                  'intron_variant': 'I',
+                 'intron': 'I',
+                 'INTRONIC': 'I',
                  'splice_acceptor_variant': 'I',
+                 'splice_acceptor': 'I',
                  'splice_donor_variant': 'I',
+                 'splice_donor': 'I',
                  'splice_donor_region_variant': 'I',
                  'splice_region_variant': 'I',
                  'splice_donor_5th_base_variant': 'I',
+                 'splice_donor_5th_base': 'I',
                  'splice_polypyrimidine_tract_variant': 'I',
+                 'splice_polypyrimidine_tract': 'I',
+                 'splice': 'I',
+                 'CANONICAL_SPLICE': 'I',
+                 'SPLICE_SITE': 'I',
                  'synonymous_variant': 'S',
+                 'synonymous': 'S',
                  'stop_retained_variant': 'C',
                  'stop_lost': 'C',
                  'stop_gained': 'C',
-                 'start_lost': 'C',
-                 'start_retained_variant': 'C',
-                 'missense_variant': 'C',
-                 'inframe_deletion': 'C',
-                 'inframe_insertion': 'C',
-                 'frameshift_variant': 'C',
-                 'protein_altering_variant': 'C',
+                 'STOP_GAINED': 'C',
+                 'start_lost': 'M',
+                 'STOP_LOST': 'C',
+                 'start_retained_variant': 'M',
+                 'missense_variant': 'M',
+                 'missense': 'M',
+                 'inframe_deletion': 'M',
+                 'inframe_insertion': 'M',
+                 'frameshift_variant': 'M',
+                 'FRAME_SHIFT': 'M',
+                 'INFRAME': 'M',
+                 'protein_altering_variant': 'M',
                  'incomplete_terminal_codon_variant': 'C',
                  'coding_sequence_variant': 'C',
                  'coding_transcript_variant': 'C',
+                 'NON_SYNONYMOUS': 'C',
+                 'NMD': 'C',
                  'regulatory_region': 'O',
                  'transcript_ablation': 'O',
                  'transcript_amplification': 'O',
                  'feature_elongation': 'O',
                  'feature_truncation': 'O',
                  'mature_miRNA_variant': 'O',
                  'non_coding_transcript_variant': 'O',
+                 'non_coding_exon': 'O',
+                 'non_coding': 'O',
                  'TFBS_ablation': 'O',
                  'TFBS_amplification': 'O',
                  'TF_binding_site_variant': 'O',
                  'regulatory_region_ablation': 'O',
                  'regulatory_region_amplification': 'O',
                  'regulatory_region_variant': 'O',
                  'intergenic_variant': 'O',
+                 'intergenic': 'O',
                  'sequence_variant': 'O',
+                 'NONCODING_CHANGE': 'O',
                  '': None}
 
 # Dictionary specifying inheritance pattern keywords 
@@ -100,7 +134,7 @@
                    "SAI_AL": "SpliceAI_acceptor-loss-score",
                    "SAI_DG": "SpliceAI_donor-gain-score",
                    "SAI_DL": "SpliceAI_donor-loss-score",
-                   "CADD": "CADD-raw",  # CADD score of a mutation
+                   "coding_score": "CADD-raw",  # CADD score of a mutation
                    "ensembl_gene_id": "ensembl_gene_id",  # ENSEMBL gene ID
                    "MAF": "MAF",
                    "inherited_from": "inherited_from",  # Parent annotation
@@ -119,6 +153,13 @@
 # 2nd letter: 'S' for SNP, 'I' for indel
 var_annot_list = [('C', 'S'), ('C', 'I'), ('I', 'S'), ('I', 'I')]
 
+# Identifiers used in thresholding of scores of input variants
+#   see parse_variant_lib/get_score_val function
+var_type_to_var_str = {('C', 'S'): 'C',
+                       ('C', 'I'): 'CInd',
+                       ('I', 'S'): 'I',
+                       ('I', 'I'): 'I'}
+
 # output file suffix: total number of DENOVO variants observed across cohort for each of 4 variant types
 varcount_sums_DN = "denovo_variant_counts"
 

diff --git a/data/AlphaMissense_MS.txt.gz b/data/AlphaMissense_MS.txt.gz
diff --git a/data/score_lists_CI.txt.gz → data/CADD_CI.txt.gz b/data/score_lists_CI.txt.gz → data/CADD_CI.txt.gz
diff --git a/data/score_lists_CS.txt.gz → data/CADD_CS.txt.gz b/data/score_lists_CS.txt.gz → data/CADD_CS.txt.gz
diff --git a/data/PAI3D_MS.txt.gz b/data/PAI3D_MS.txt.gz
diff --git a/data/REVEL_MS.txt.gz b/data/REVEL_MS.txt.gz
diff --git a/data/score_lists_II.txt.gz → data/SpliceAI_II.txt.gz b/data/score_lists_II.txt.gz → data/SpliceAI_II.txt.gz
diff --git a/data/score_lists_IS.txt.gz → data/SpliceAI_IS.txt.gz b/data/score_lists_IS.txt.gz → data/SpliceAI_IS.txt.gz
diff --git a/data/gene_constraint_scores.txt.gz b/data/gene_constraint_scores.txt.gz
diff --git a/init_objs_lib.py b/init_objs_lib.py
@@ -32,18 +32,15 @@ def init_varcount_dict():
 # Function that parses a precomputed mutational target file for CADD or SpliceAI score bins
 # gene_instances: variant annotation -> ENSEMBL ID -> Gene instance (as specified in stat_lib)
 def parse_variant_scores_file(variant_annot,
+							  var_score_file_name,
 							  Gene_inst_dict,
 							  score_thr):
 
 	# total_mu is a normalizing value for mutational targets
 	# After the parsing, all mutational targets are normalized to sum up to unit
 	total_mu = 0
-	# Input files are specified in cfg.py
-	# variant_annot[0]: 'C'/'I' for coding or intronic variant types
-	# variant_annot[1]: 'S'/'I', for SNP or indel variant types
-	infile = cfg.variant_scores_files[variant_annot[0]][variant_annot[1]]
-	with gzip.open(infile, 'rt') as inh:
-		print(f"Parsing per-gene score-mutation rate distributions from: {infile}")
+	with gzip.open(var_score_file_name, 'rt') as inh:
+		print(f"Parsing per-gene score-mutation rate distributions from: {var_score_file_name}")
 		current_ID = None
 		for inh_str in inh:
 			if inh_str.startswith('#'):
@@ -77,10 +74,11 @@ def parse_variant_scores_file(variant_annot,
 			current_mu_list.append(eval(inh_str[1]))
 
 	# Processing of the final gene
-	Gene_inst_dict[variant_annot][current_ID] = sl.Gene(current_score_list,
-														current_mu_list)
+	if current_score_list:
+		Gene_inst_dict[variant_annot][current_ID] = sl.Gene(current_score_list,
+															current_mu_list)
 
-	total_mu += Gene_inst_dict[variant_annot][current_ID].mut_targ_arr[0]
+		total_mu += Gene_inst_dict[variant_annot][current_ID].mut_targ_arr[0]
 
 	# Normalization of mutational targets
 	for ENS_ID, Gene_obj in Gene_inst_dict[variant_annot].items():
@@ -93,6 +91,7 @@ def parse_variant_scores_file(variant_annot,
 # total_mu_dict: variant annotation -> total mutational target of that annotation
 def parse_variant_scores_files(score_thr_dict,
 							   consequence_list,
+							   coding_score,
 			  				   suppress_indels):
 	# Initialization of gene_instances
 	# gene_instances:
@@ -107,12 +106,27 @@ def parse_variant_scores_files(score_thr_dict,
 		if suppress_indels and variant_annot[1] == 'I':
 			continue
 
-		score_thr = score_thr_dict[variant_annot[0]]
+		# Separate processing of Coding InDels
+		if variant_annot == ('C', 'I'):
+			score_thr = score_thr_dict["CInd"]
+		else:
+			score_thr = score_thr_dict[variant_annot[0]]
+
+		# Input files are specified in cfg.py
+		# variant_annot[0]: 'C'/'I' for coding or intronic variant types
+		# variant_annot[1]: 'S'/'I', for SNP or indel variant types
+		# coding_score: name of the score for coding SNPs provided under 
+		#	--coding_score input parameter
+		if variant_annot == ('C', 'S'):
+			var_score_file_name = cfg.variant_scores_files['C']['S'][coding_score]
+		else:
+			var_score_file_name = cfg.variant_scores_files[variant_annot[0]][variant_annot[1]]
 		# A lot of code in this tool has the same structure:
 		# Function that parses a list of files -> 
 		#	function parsing a single file ->
 		#	(sometimes) some functions processing a single line
-		parse_variant_scores_file(variant_annot, 
+		parse_variant_scores_file(variant_annot,
+								  var_score_file_name, 
 								  Gene_inst_dict, 
 								  score_thr)
 
@@ -153,16 +167,44 @@ def make_ENS2GeneID_dict(forward=True):
 # Loads s_het bin enrichment values from a file specified in cfg.py
 # Enrichment values are the proportions of exclusively dominant OMIM genes
 #	in s_het deciles
-def load_s_het_bins():
-	shet_dict = {}
-	with gzip.open(cfg.shet_table, 'rt') as inh:
+def load_gene_score_bins(scoreID = "s_het_R"):
+	gene_score_dict = {}
+	with gzip.open(cfg.gene_score_table, 'rt') as inh:
 		for s in inh:
-			if s.startswith('#'):
-				continue
 			s = s.strip().split()
-			shet_dict[s[2]] = eval(s[11])
-	# s_het_dict: ENSEMBL_ID -> s_het enrichment
-	return shet_dict
+			if s[0] == 'ensembl_gene_id':
+				if scoreID not in s:
+					raise AssertionError(f"ERROR: Gene score ID {scoreID} misspecified")
+				score_index = s.index(scoreID)
+				continue
+
+			gene_score_dict[s[0]] = eval(s[score_index])
+	# gene_score_dict: ENSEMBL_ID -> score bin enrichment for dominant disease-
+	#	causing genes
+	return gene_score_dict
+
+# Write parameters used in the run to an intermediate file
+def write_run_info(file_obj, args_obj):
+	"""
+	:param file_obj: python file object corresponding to an intermediate output file
+	:args_obj: argparse arguments object with run specifications supplied by the user
+	"""
+	file_obj.write(f"### Information about RaMeDiES run ID {args_obj.o}\n")
+	file_obj.write("### Do not discard this header when sharing intermediate outputs\n")
+	file_obj.write(f"### VARIANT ANNOTATIONS: {args_obj.variant_annots}\n")
+	file_obj.write(f"### INDEL SUPPRESSION: {args_obj.suppress_indels}\n")
+	file_obj.write(f"### ONLY MISSENSE CODING SNPS: {args_obj.missense_run}\n")
+	file_obj.write(f"### MAF CUTOFF: {args_obj.MAF}\n")
+	if 'C' in args_obj.variant_annots:
+		file_obj.write(f"### CODING SNP SCORE: {args_obj.coding_score}\n")
+		file_obj.write(f"### CODING SNP SCORE THRESHOLD: {args_obj.C_thr}\n")
+	if 'I' in args_obj.variant_annots:
+		file_obj.write(f"### INTRONIC SCORE: SpliceAI\n")
+		file_obj.write(f"### INTRONIC SCORE THRESHOLD: {args_obj.SAI_thr}\n")
+	if not args_obj.suppress_indels:
+		file_obj.write(f"### CODING INDEL SCORE: CADD\n")
+		file_obj.write(f"### CODING INDEL SCORE THRESHOLD: {args_obj.CI_thr}\n")
+
 
 
 # Written on 01.23.2024 by Mikhail Moldovan, HMS DBMI