forked from ucscCancer/cgData
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTrackGenomic.py
156 lines (128 loc) · 6.12 KB
/
TrackGenomic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import CGData
import binascii
import struct
from CGData.SQLUtil import *
CREATE_BED = """
CREATE TABLE %s (
bin smallint unsigned not null,
chrom varchar(255) not null,
chromStart int unsigned not null,
chromEnd int unsigned not null,
name varchar(255) not null,
score int not null,
strand char(1) not null,
thickStart int unsigned not null,
thickEnd int unsigned not null,
reserved int unsigned not null,
blockCount int unsigned not null,
blockSizes longblob not null,
chromStarts longblob not null,
expCount int unsigned not null,
expIds longblob not null,
expScores longblob not null,
INDEX(name(16)),
INDEX(chrom(5),bin)
) engine 'MyISAM';
"""
class TrackGenomic(CGData.CGMergeObject):
DATA_FORM = None
typeSet = {
'clinicalMatrix' : True,
'genomicMatrix' : True,
'sampleMap' : True,
'probeMap' : True
}
format = "bed 15"
def __init__(self):
CGData.CGMergeObject.__init__(self)
def get_name( self ):
return "%s" % ( self.members[ "genomicMatrix" ].get_name() )
def scores(self, row):
return "'%s'" % (','.join( str(a) for a in row ))
def gen_sql_heatmap(self, id_table):
#scan the children
# XXX Handling of sql for children is broken if the child may appear
# as part of multiple merge objects, such as TrackGenomic and TrackClinical.
# A disgusting workaround for clinicalMatrix is to prevent the TrackGenomic from calling
# it for gen_sql.
clinical = self.members.pop("clinicalMatrix")
for line in CGData.CGMergeObject.sql_pass(self, id_table, method="heatmap"):
yield line
self.members["clinicalMatrix"] = clinical
gmatrix = self.members[ 'genomicMatrix' ]
pmap = self.members[ 'probeMap' ].get( assembly="hg18" ) # BUG: hard coded to only producing HG18 tables
if pmap is None:
CGData.error("Missing HG18 %s" % ( self.members[ 'probeMap'].get_name() ))
return
table_base = self.get_name()
CGData.log("Writing Track %s" % (table_base))
clinical_table_base = self.members[ "clinicalMatrix" ].get_name()
yield "INSERT into raDb( name, sampleTable, clinicalTable, columnTable, aliasTable, shortLabel, longLabel, expCount, dataType, platform, profile, security) VALUES ( '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%d', '%s', '%s', '%s', '%s');\n" % \
( "genomic_" + table_base, "sample_" + table_base,
"clinical_" + clinical_table_base, "clinical_" + clinical_table_base + "_colDb",
"genomic_" + table_base + "_alias",
sql_fix(gmatrix.attrs['shortTitle']),
sql_fix(gmatrix.attrs['longTitle']),
len(gmatrix.get_sample_list()),
self.format,
gmatrix.attrs[':dataSubType'],
'localDb',
'public',
)
# write out the sample table
yield "drop table if exists sample_%s;" % ( table_base )
yield """
CREATE TABLE sample_%s (
id int,
sampleName varchar(255)
) engine 'MyISAM';
""" % ( table_base )
from CGData.ClinicalMatrix import sortedSamples
for sample in sortedSamples(gmatrix.get_sample_list()):
yield "INSERT INTO sample_%s VALUES( %d, '%s' );\n" % ( table_base, id_table.get( clinical_table_base + ':sample_id', sample), sample )
yield "drop table if exists genomic_%s_alias;" % ( table_base )
yield """
CREATE TABLE genomic_%s_alias (
name varchar(255),
alias varchar(255)
) engine 'MyISAM';
""" % ( table_base )
for pset in pmap:
for probe in pset:
for alias in probe.aliases:
yield "insert into genomic_%s_alias( name, alias ) values( '%s', '%s' );\n" % (table_base, sql_fix(probe.name), sql_fix(alias))
# write out the BED table
yield "drop table if exists %s;" % ( "genomic_" + table_base )
yield CREATE_BED % ( "genomic_" + table_base + "_tmp")
sample_ids = []
samples = gmatrix.get_sample_list()
# sort samples by sample_id, and retain the sort order for application to the genomic data, below
tmp=sorted(zip(samples, range(len(samples))), cmp=lambda x,y: id_table.get(clinical_table_base + ':sample_id', x[0]) - id_table.get( clinical_table_base + ':sample_id', y[0]))
samples, order = map(lambda t: list(t), zip(*tmp))
for sample in samples:
sample_ids.append( str( id_table.get( clinical_table_base + ':sample_id', sample ) ) )
exp_ids = ','.join( sample_ids )
missingProbeCount = 0
for probe_name in gmatrix.get_probe_list():
# get the genomic data and rearrange to match the sample_id order
tmp = gmatrix.get_row_vals( probe_name )
row = map(lambda i: tmp[order[i]], range(len(tmp)))
pset = pmap.get( probe_name )
if pset is not None:
for probe in pset:
istr = "insert into %s(chrom, chromStart, chromEnd, strand, name, expCount, expIds, expScores) values ( '%s', '%s', '%s', '%s', '%s', '%s', '%s', %s );\n" % \
( "genomic_%s_tmp" % (table_base), probe.chrom, probe.chrom_start, probe.chrom_end, probe.strand, sql_fix(probe_name), len(sample_ids), exp_ids, self.scores(row) )
yield istr
else:
missingProbeCount += 1
yield "create table genomic_%s like genomic_%s_tmp;" % (table_base, table_base)
yield "insert into genomic_%s select * from genomic_%s_tmp order by chrom, chromStart;" % (table_base, table_base)
yield "drop table genomic_%s_tmp;" % table_base
CGData.log("%s Missing probes %d" % (table_base, missingProbeCount))
def unload(self):
for t in self.members:
self.members[t].unload()
class BinaryTrackGenomic(TrackGenomic):
format = 'bed 15b'
def scores(self, row):
return "x'%s'" % (''.join( binascii.hexlify(struct.pack('f', a)) for a in row ))