forked from molgenis/ngs-utils
-
Notifications
You must be signed in to change notification settings - Fork 0
/
convertPCA.py
executable file
·102 lines (84 loc) · 3.22 KB
/
convertPCA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# This script filters a tab-delimited PCA file down to the rows whose first
# column is a selected id, replaces that id with its pseudo id and then
# verifies the written output.
#######################
# Build the command-line parser and read the options
#######################
import optparse

parser = optparse.OptionParser(
    usage='usage: %prog -s subsfile -d pcafile -o outputfile\nChoose option -h for extensive help'
)
# Each entry: (short flag, long flag, help text); all three options take a FILE.
for shortFlag, longFlag, helpText in (
    ('-s', '--subsetFile', 'tab delimited file with two columns defining selectedIds and pseudoIds'),
    ('-d', '--pcaFile', 'the pca file)'),
    ('-o', '--outFile', 'where the output will be written to'),
):
    parser.add_option(shortFlag, longFlag, metavar='FILE', help=helpText)

(options, args) = parser.parse_args()
print(options)

# All three files are mandatory: show the help text and stop when one is missing.
requiredValues = (options.subsetFile, options.pcaFile, options.outFile)
if any(value is None for value in requiredValues):
    parser.print_help()
    exit()
###################################################
# Read the subset file into the id lookup structures:
#   selectedIds - list of the original ids to keep      (field index 1)
#   pseudoIds   - dict: original id -> replacement id   (field index 3)
# NOTE(review): the -s help text speaks of "two columns", but the code reads
# the 2nd and 4th tab-separated fields - confirm the expected file layout.
###################################################
import csv

selectedIds = []
pseudoIds = {}
# 'rU' is the Python 2 universal-newline text mode. The with-block closes the
# handle as soon as the mapping is loaded instead of leaking it.
with open(options.subsetFile, 'rU') as subsetHandle:
    csvReader = csv.reader(subsetHandle, delimiter='\t')
    for row in csvReader:
        if not row:
            # csv yields [] for a blank line (e.g. a trailing newline);
            # there is nothing to map, so skip it instead of crashing.
            continue
        selectedIds.append(row[1])
        pseudoIds[row[1]] = row[3]
##########################################################################################
# Iterate through the pca file:
#   - copy the first (header) row unchanged
#   - keep a later row only if its first column is one of the selected ids
#   - replace that id by its pseudo id before writing the row
##########################################################################################
count = 0
# 'rU' / 'wb' are the Python 2 modes for csv input / output. The with-blocks
# close both handles even when a row fails to convert.
with open(options.pcaFile, 'rU') as pcaHandle:
    with open(options.outFile, 'wb') as f:
        csvReader = csv.reader(pcaHandle, delimiter='\t')
        csvWriter = csv.writer(f, delimiter='\t')
        for row in csvReader:
            if count == 0:
                csvWriter.writerow(row)
            # pseudoIds is keyed by exactly the ids stored in selectedIds, so
            # the dict lookup gives the same answer as scanning the list but
            # in constant time. Blank rows are skipped (no id to test).
            elif row and row[0] in pseudoIds:
                csvWriter.writerow([pseudoIds[row[0]]] + row[1:])
            # debug info
            if count % 1000 == 0:
                print('converting row ' + str(count))
            count = count + 1
##############################################################################################
# QC: every id (first column) in the output file must be a known pseudo id and
# must not be one of the original selected ids. On a violation the output file
# is removed and the script exits with status 1. Afterwards every mapping whose
# pseudo id never reached the output is reported in '<outFile>.missing'.
##############################################################################################
import os
import sys

print('check of output whether all ids are properly converted')
foundPseudoIds = []
expectedPseudoIds = pseudoIds.values()
# Set copy for constant-time membership tests; the list/view is kept for len().
expectedPseudoIdSet = set(expectedPseudoIds)
illegalId = None
count = 0
with open(options.outFile, 'rU') as outHandle:
    csvReader = csv.reader(outHandle, delimiter='\t')
    for row in csvReader:
        # row 0 is the header and carries no id
        if count > 0:
            # pseudoIds is keyed by the original ids, so 'in pseudoIds' is
            # the "still an original id" test.
            if row[0] not in expectedPseudoIdSet or row[0] in pseudoIds:
                illegalId = row[0]
                break
            # remember id so we can count
            foundPseudoIds.append(row[0])
        # debug info
        if count % 1000 == 0:
            print('checked row ' + str(count))
        count = count + 1

if illegalId is not None:
    # The reader handle is closed by now, so the removal also works on
    # platforms that refuse to delete open files.
    os.remove(options.outFile)
    print('conversion FAILED: id \'' + illegalId + '\' not a pseudoId')
    # Non-zero status so calling pipelines can detect the failed conversion.
    sys.exit(1)

print(str(len(foundPseudoIds)) + ' versus ' + str(len(expectedPseudoIds)))

# check for missing
# Always scan the mapping: comparing only the two lengths can hide a missing
# id when the output contains the same pseudo id more than once.
foundPseudoIdSet = set(foundPseudoIds)
with open(options.outFile + '.missing', 'w') as f:
    for key in pseudoIds:
        if pseudoIds[key] not in foundPseudoIdSet:
            f.write('WARNING: mapping ' + key + '=' + pseudoIds[key] + ' not in pca file\n')
            print('WARNING: mapping ' + key + '=' + pseudoIds[key] + ' not in pca file')
print('conversion completed')