Skip to content

Commit e14ee3f

Browse files
author
AHBrauT2
committed
Refinement of clustering and testing with all HQ samples done
0 parents  commit e14ee3f

19 files changed

+935
-0
lines changed

README.md

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# StraglrON
2+
Addon for Straglr

requirements.txt

+55
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
asttokens==2.4.1
2+
backcall==0.2.0
3+
biopython==1.81
4+
comm==0.2.0
5+
contourpy==1.1.1
6+
cycler==0.12.1
7+
debugpy==1.8.0
8+
decorator==5.1.1
9+
executing==2.0.1
10+
fonttools==4.44.0
11+
importlib-metadata==6.8.0
12+
importlib-resources==6.1.1
13+
ipykernel==6.26.0
14+
ipython==8.12.3
15+
jedi==0.19.1
16+
joblib==1.3.2
17+
jupyter_client==8.6.0
18+
jupyter_core==5.5.0
19+
kiwisolver==1.4.5
20+
matplotlib==3.7.3
21+
matplotlib-inline==0.1.6
22+
nest-asyncio==1.5.8
23+
numpy==1.24.4
24+
packaging==23.1
25+
pandas==2.0.3
26+
parso==0.8.3
27+
pexpect==4.8.0
28+
pickleshare==0.7.5
29+
Pillow==10.1.0
30+
pip==23.3
31+
platformdirs==4.0.0
32+
prompt-toolkit==3.0.40
33+
psutil==5.9.0
34+
ptyprocess==0.7.0
35+
pure-eval==0.2.2
36+
Pygments==2.16.1
37+
pyparsing==3.1.1
38+
pysam==0.22.0
39+
python-dateutil==2.8.2
40+
pytz==2023.3.post1
41+
pyzmq==25.1.1
42+
scikit-learn==1.3.2
43+
scipy==1.10.1
44+
seaborn==0.13.0
45+
setuptools==68.0.0
46+
six==1.16.0
47+
stack-data==0.6.3
48+
threadpoolctl==3.2.0
49+
tornado==6.3.3
50+
traitlets==5.13.0
51+
typing_extensions==4.7.1
52+
tzdata==2023.3
53+
wcwidth==0.2.5
54+
wheel==0.41.2
55+
zipp==3.11.0

src/.vscode/launch.json

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
{
2+
"version": "0.2.0",
3+
"configurations": [
4+
{
5+
"name": "Python: Current File",
6+
"type": "python",
7+
"request": "launch",
8+
"program": "${file}",
9+
"console": "integratedTerminal",
10+
"args": ["/mnt/storage2/users/ahbraut2/data/snakemake_test/benchmark/straglr_test_new_Loci/og/Sample012_23014LRa.bed","--hist", "/mnt/storage2/users/ahbraut2/data/snakemake_test/benchmark/straglr_test_new_Loci/og/Sample012_23014LRa.tsv","-o", "/mnt/storage2/users/ahbraut2/scripts/TestFolder/Home", "/mnt/storage2/users/ahbraut2/data/Master_Bedfiles/RepeatLoci_straglr_LT_V2.bed" , "/mnt/storage2/users/ahbraut2/data/ONT_Pilot_TNAMSE_Links/Sample012_23014LRa.bam", "--cutoff", "5"]
11+
}
12+
]
13+
}

src/ResultGiverStraglr.py

+134
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
import os
2+
from collections import deque
3+
from pathlib import Path
4+
import argparse
5+
from utils.Structures import Expansion
6+
import utils.ReaderAnalyser as ra
7+
import utils.Visualiser as vis
8+
import utils.extract_repeats as extract_repeats
9+
import shutil
10+
11+
def is_valid_file(parser, arg, type):
12+
"""
13+
Check if the provided file exists and has a .bed or .tsv extension.
14+
"""
15+
if type == "bed":
16+
if not os.path.exists(arg):
17+
parser.error(f"The file '{arg}' does not exist.")
18+
elif not arg.endswith('.bed'):
19+
parser.error(f"The file '{arg}' is not a .bed file.")
20+
else:
21+
return arg
22+
23+
if type == "tsv":
24+
if not os.path.exists(arg):
25+
parser.error(f"The file '{arg}' does not exist.")
26+
elif not arg.endswith('.tsv'):
27+
parser.error(f"The file '{arg}' is not a .tsv file.")
28+
else:
29+
return arg
30+
31+
32+
#Argument parser for command line interface including necessary file and options
33+
34+
parser = argparse.ArgumentParser()
35+
36+
parser.add_argument("path_input_bed", type=lambda x: is_valid_file(parser, x, "bed"), help="Path to input bed file")
37+
parser.add_argument("path_input_tsv", type=lambda x: is_valid_file(parser, x, "tsv"), help="Path to input tsv file")
38+
parser.add_argument("loci_file", type=lambda x: is_valid_file(parser, x, "bed"), help="Path to Loci file used for straglr analysis")
39+
parser.add_argument("-o", "--output", type=str, required=True, help="Path to output folder")
40+
41+
parser.add_argument("--hist", action="store_true", help="Plots histograms of pathogenic expansions")
42+
parser.add_argument("--score", action="store_true", help="Expansion in output file is sorted by normalized size difference score")
43+
44+
parser.add_argument("--altclust", action="store_true", help="Uses Thomas Clustering")
45+
46+
parser.add_argument("--alleles", action="store_true", help="Turns on the allele visualization")
47+
parser.add_argument("--bam", type=str, help="Location of Bam file of interest")
48+
parser.add_argument("-c", "--cutoff", type=int, default=2, help="Sets number of reads cutoff when clustering unimodal or bimodal allele read frequencies")
49+
parser.add_argument("--flank", type=int, default=25, help="flank size. Default:25")
50+
parser.add_argument("--genome", type=str, help="location of reference genome")
51+
52+
args = parser.parse_args()
53+
54+
def outputWriter(output_expansion: "list[Expansion]", sample_id, args):
55+
56+
#Base output here
57+
sample_folder_path = args.output + "/" + sample_id
58+
result_file_path = sample_folder_path + "/" + sample_id + ".txt"
59+
if not (os.path.exists(sample_folder_path)):
60+
os.mkdir(sample_folder_path)
61+
62+
with open(result_file_path, 'w') as f:
63+
64+
print("#chr"+"\t"+"start"+"\t"+"end"+"\t"+"repeat_id"+"\t"+"repeat_unit"+"\t"+"size"+"\t"+"wt_size"+"\t"+"in_pathogenic_range"+"\t"+"size_difference"+"\t"+"allele1_support"+"\t"+"allele2_support"+"\t"+"score")
65+
f.write("#chr\tstart\tend\trepeat_id\trepeat_unit\tsize\twt_size\tin_pathogenic_range\tsize_difference\tallele1_support\tallele2_support\tscore\n")
66+
67+
if args.altclust:
68+
69+
for x in output_expansion:
70+
f.write(x.chr + "\t" + x.start + "\t" + x.end + "\t" + x.repeat_id + "\t" + x.repeat_unit + "\t" + str(x.allele1_size)+'/'+str(x.allele2_size) + "\t" + str(x.wt_size) + "\t" + x.in_pathogenic_range + "\t" + str(x.size_difference) + "\t" + x.allele1_support + "\t" + x.allele2_support+ "\t" + str(x.norm_score)+"\n")
71+
print(x.chr, x.start, x.end, x.repeat_id, x.repeat_unit, (str(x.new_allele1)+'/'+str(x.new_allele2)), x.wt_size, x.new_in_pathogenic_range, x.new_size_difference, x.new_allele1_support, x.new_allele2_support, str(x.new_norm_score))
72+
73+
else:
74+
75+
for x in output_expansion:
76+
f.write(x.chr + "\t" + x.start + "\t" + x.end + "\t" + x.repeat_id + "\t" + x.repeat_unit + "\t" + str(x.allele1_size)+'/'+str(x.allele2_size) + "\t" + str(x.wt_size) + "\t" + x.in_pathogenic_range + "\t" + str(x.size_difference) + "\t" + x.allele1_support + "\t" + x.allele2_support + "\t"+ str(x.norm_score)+"\n")
77+
print(x.chr, x.start, x.end, x.repeat_id, x.repeat_unit, str(x.allele1_size)+'/'+str(x.allele2_size), x.wt_size, x.in_pathogenic_range, x.size_difference, x.allele1_support, x.allele2_support, str(x.norm_score))
78+
79+
80+
#Base output plus Histograms
81+
if args.hist:
82+
plot_folder_path = sample_folder_path + "/plots"
83+
84+
if(os.path.exists(plot_folder_path)):
85+
shutil.rmtree(plot_folder_path)
86+
os.mkdir(plot_folder_path)
87+
88+
if args.altclust:
89+
for x in output_expansion:
90+
vis.plotHistogram(x, plot_folder_path, args.altclust)
91+
92+
else:
93+
for x in output_expansion:
94+
vis.plotHistogram(x, plot_folder_path, args.altclust)
95+
96+
97+
#Base with Allele Visualization
98+
if args.alleles:
99+
repeats_fasta_folder = sample_folder_path + "/" + "repeats"
100+
101+
if(os.path.exists(repeats_fasta_folder)):
102+
shutil.rmtree(repeats_fasta_folder)
103+
os.mkdir(repeats_fasta_folder)
104+
105+
for x in output_expansion:
106+
107+
extract_repeats.fastaMaker(args.path_input_tsv, x.chr + ":"+ x.start + "-" + x.end, args.bam, args.flank, repeats_fasta_folder + "/" + x._title + ".fa")
108+
vis.alleleVisualiser(repeats_fasta_folder + "/" + x._title + ".fa", x.repeat_unit, args.flank, x._title, repeats_fasta_folder, x.chr, x.start, x.end, args.genome)
109+
110+
def main():
111+
112+
loci_dict = ra.lociBedReader(args.loci_file)
113+
expansions = ra.resultBedReader(args.path_input_bed, loci_dict)
114+
sample_id = Path(args.path_input_bed).stem
115+
116+
vis.getHistData(args.path_input_tsv, expansions)
117+
118+
for expansion_object in expansions:
119+
if args.altclust:
120+
ra.newGenotyping(expansion_object, args.cutoff, False)
121+
122+
ra.expansionScorer(expansion_object, args.altclust)
123+
124+
if args.score:
125+
expansions.sort(key=lambda x: float("-Inf") if x.norm_score is None else x.norm_score, reverse=True)
126+
#print("score sorting working as intended")
127+
else:
128+
expansions.sort(key=lambda x: x.chr)
129+
#print("score sorting is turned off")
130+
131+
outputWriter(expansions, sample_id, args)
132+
133+
if __name__ == "__main__":
134+
main()

src/__init__.py

Whitespace-only changes.
2.72 KB
Binary file not shown.

src/__pycache__/locus.cpython-38.pyc

730 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)