forked from nf-core/scrnaseq
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patht2g.py
executable file
·102 lines (89 loc) · 3.74 KB
/
t2g.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python
# This was downloaded on 2019-06-23 from https://github.com/bustools/getting_started/releases/
# All credit goes to the original authors from the Kallisto/BUStools team!
# BSD 2-Clause License
#
# Copyright (c) 2017, Nicolas Bray, Harold Pimentel, Páll Melsted and Lior Pachter
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import sys, argparse
def _parse_gtf_attributes(info):
    """Parse the GTF attribute column into a ``{key: value}`` dict.

    Attributes are expected as ``key "value"`` pairs separated by ``"; "``.
    When a key occurs more than once, the last occurrence wins.
    """
    attributes = {}
    for item in info.split("; "):
        item = item.strip()
        sep = item.find(" ")
        if sep == -1:
            # Bare token without a value (or an empty trailing chunk).
            continue
        key = item[:sep]
        start = item.find('"', sep)
        if start == -1:
            # Unquoted value such as `gene_version 7;` -- take the rest of
            # the item instead of slicing with a -1 "not found" index.
            value = item[sep + 1 :].strip().rstrip(";").strip()
        else:
            end = item.find('"', start + 1)
            if end == -1:
                # Unterminated quote: keep everything after the opening quote.
                value = item[start + 1 :].rstrip(";")
            else:
                value = item[start + 1 : end]
        attributes[key] = value
    return attributes


def create_transcript_list(input, use_name=True, use_version=False):
    """Build a transcript -> gene mapping from the lines of a GTF file.

    Only records whose feature column (third column) is ``transcript`` are
    considered. Comment lines (starting with ``#``), blank lines and records
    with fewer than nine tab-separated columns are skipped.

    Args:
        input: Iterable of GTF lines (e.g. an open text file or ``sys.stdin``).
        use_name: If true, records without a ``gene_name`` attribute are
            skipped and the gene name is stored; otherwise the stored gene
            name is ``None``.
        use_version: If true, records lacking ``transcript_version`` or
            ``gene_version`` are skipped, and those versions are appended to
            the ids as ``id.version``.

    Returns:
        dict mapping transcript id to a ``(gene_id, gene_name)`` tuple. The
        first record seen for a transcript id wins.
    """
    r = {}
    for line in input:
        if not line or line[0] == "#":
            continue
        fields = line.strip().split("\t")
        # A GTF record has nine columns; blank or truncated lines would
        # otherwise raise IndexError on the column lookups below.
        if len(fields) < 9 or fields[2] != "transcript":
            continue

        d = _parse_gtf_attributes(fields[8])
        if "transcript_id" not in d or "gene_id" not in d:
            continue

        # Drop any version suffix embedded in the id itself ("ENST01.2");
        # the version is re-attached from the dedicated attributes if asked.
        tid = d["transcript_id"].split(".")[0]
        gid = d["gene_id"].split(".")[0]
        if use_version:
            if "transcript_version" not in d or "gene_version" not in d:
                continue
            tid += "." + d["transcript_version"]
            gid += "." + d["gene_version"]

        gname = None
        if use_name:
            if "gene_name" not in d:
                continue
            gname = d["gene_name"]

        # Keep the first record for each transcript id.
        if tid in r:
            continue
        r[tid] = (gid, gname)
    return r
def print_output(output, r, use_name=True):
    """Write the transcript-to-gene table as tab-separated lines.

    Each entry of ``r`` (transcript id -> ``(gene_id, gene_name)``) becomes
    one line on ``output``: ``tid<TAB>gene_id<TAB>gene_name`` when
    ``use_name`` is true, otherwise ``tid<TAB>gene_id``.
    """
    for transcript_id, gene_info in r.items():
        if use_name:
            row = "%s\t%s\t%s\n" % (transcript_id, gene_info[0], gene_info[1])
        else:
            row = "%s\t%s\n" % (transcript_id, gene_info[0])
        output.write(row)
if __name__ == "__main__":
    # Command-line entry point: read a GTF file from standard input and write
    # the transcript-to-gene table to standard output.
    parser = argparse.ArgumentParser(
        add_help=True,
        description="Creates transcript to gene info from GTF files\nreads from standard input and writes to standard output",
    )
    parser.add_argument(
        "--use_version",
        "-v",
        action="store_true",
        help="Use version numbers in transcript and gene ids",
    )
    parser.add_argument("--skip_gene_names", "-s", action="store_true", help="Do not output gene names")
    args = parser.parse_args()

    use_name = not args.skip_gene_names
    r = create_transcript_list(sys.stdin, use_name=use_name, use_version=args.use_version)
    # Forward use_name so --skip_gene_names produces two columns instead of a
    # third column containing the literal string "None".
    print_output(sys.stdout, r, use_name=use_name)