forked from molgenis/ngs-utils
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbam_check_for_eof.py
executable file
·78 lines (62 loc) · 2.54 KB
/
bam_check_for_eof.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/env python
"""
I hacked this script so that it does not add an eof marker, but
only checks if it is present or not.
Usage: python bam_check_for_eof.py bamfile.bam [bamfile2.bam ... bamfileN.bam]
wbkoetsier 20120912
"""
"""Python script to add missing EOF marker to BAM or BGZF files.
BAM files are compressed using BGZF, Blocked GNU Zip Format, which
is a variant of GZIP. Modern BAM files include a special empty
block at the end of the file (EOF) as a marker to help spot when
a dataset has been truncated. This is just a 28 byte BGZF block,
which when decompressed is empty.
Some early tools output valid BAM files without this optional
(but recommended) EOF marker.
This script will add the EOF marker is not already present.
WARNING: If your BAM or BGZF file is truely truncated, this will
not magically fix it. It may hide or obscure the true problem.
WARNING: To avoid excessive data writing, this script modifies
the BAM or BGZF file in situ!
Usage with one or more BAM or BGZF files:
$ ./bam_add_eof.py example1.bam example2.bam ... exampleN.bam
See also: http://samtools.sourceforge.net/
v0.0.0 - Original script
v0.0.1 - Use append mode to add EOF block
https://github.com/peterjc/picobio/blob/master/sambam/bgzf_add_eof.py at 20120912
"""
import os
import sys
def sys_exit(msg, return_code=1):
sys.stderr.write(msg.rstrip() + "\n")
sys.exit(return_code)
def fix_bam(filename):
header = "\x1f\x8b\x08\x04\x00\x00\x00\x00" + \
"\x00\xff\x06\x00\x42\x43\x02\x00"
eof = "\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00BC" + \
"\x02\x00\x1b\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00"
if not os.path.isfile(filename):
sys_exit("Missing file %s" % filename)
size = os.path.getsize(filename)
h = open(filename, "rb") #read only for now
#Check it looks like a BGZF file
#(could still be GZIP'd, in which case the extra block is harmless)
data = h.read(len(header))
if data != header:
sys_exit("File %s is not a BAM file" % filename)
#Check if it has the EOF already
h.seek(size - 28)
data = h.read(28)
h.close()
if data == eof:
sys.stderr.write("EOF present in %s\n" % filename)
else:
sys.stderr.write("No EOF present in %s\n" % filename)
# sys.stderr.write("Adding EOF block to %s\n" % filename)
# h = open(filename, "ab")
# h.write(eof)
# h.close()
if len(sys.argv) == 1:
sys_exit("Takes one or more BGZF/BAM filenames as arguments (edits in place)")
for bam_filename in sys.argv[1:]:
fix_bam(bam_filename)