-
Notifications
You must be signed in to change notification settings - Fork 119
/
Copy pathpairend_distro.py
executable file
·133 lines (101 loc) · 2.73 KB
/
pairend_distro.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/env python
# (c) 2012 - Ryan M. Layer
# Hall Laboratory
# Quinlan Laboratory
# Department of Computer Science
# Department of Biochemistry and Molecular Genetics
# Department of Public Health Sciences and Center for Public Health Genomics,
# University of Virginia
import sys
import numpy as np
from operator import itemgetter
from optparse import OptionParser
# some constants for sam/bam field ids
SAM_FLAG = 1
SAM_REFNAME = 2
SAM_MATE_REFNAME = 6
SAM_ISIZE = 8
parser = OptionParser()
parser.add_option("-r",
"--read_length",
type="int",
dest="read_length",
help="Read length")
parser.add_option("-X",
dest="X",
type="int",
help="Number of stdevs from mean to extend")
parser.add_option("-N",
dest="N",
type="int",
help="Number to sample")
parser.add_option("-o",
dest="output_file",
help="Output file")
parser.add_option("-m",
dest="mads",
type="int",
default=10,
help="Outlier cutoff in # of median absolute deviations (unscaled, upper only)")
def unscaled_upper_mad(xs):
"""Return a tuple consisting of the median of xs followed by the
unscaled median absolute deviation of the values in xs that lie
above the median.
"""
med = np.median(xs)
return med, np.median(xs[xs > med] - med)
(options, args) = parser.parse_args()
if not options.read_length:
parser.error('Read length not given')
if not options.X:
parser.error('X not given')
if not options.N:
parser.error('N not given')
if not options.output_file:
parser.error('Output file not given')
required = 65
restricted = 384
flag_mask = required | restricted
L = []
c = 0
for l in sys.stdin:
if c >= options.N:
break
A = l.rstrip().split('\t')
flag = int(A[SAM_FLAG])
refname = A[SAM_REFNAME]
mate_refname = A[SAM_MATE_REFNAME]
isize = int(A[SAM_ISIZE])
want = mate_refname == "=" and flag & flag_mask == required and isize >= 0
if want:
c += 1
L.append(isize)
# Remove outliers
L = np.array(L)
L.sort()
med, umad = unscaled_upper_mad(L)
upper_cutoff = med + options.mads * umad
L = L[L < upper_cutoff]
new_len = len(L)
removed = c - new_len
sys.stderr.write("Removed %d outliers with isize >= %d\n" %
(removed, upper_cutoff))
c = new_len
mean = np.mean(L)
stdev = np.std(L)
start = options.read_length
end = int(mean + options.X*stdev)
H = [0] * (end - start + 1)
s = 0
for x in L:
if (x >= start) and (x <= end):
j = int(x - start)
H[j] = H[ int(x - start) ] + 1
s += 1
f = open(options.output_file, 'w')
for i in range(end - start):
o = str(i) + "\t" + str(float(H[i])/float(s)) + "\n"
f.write(o)
f.close()
print('mean:' + str(mean) + '\tstdev:' + str(stdev))