-
Notifications
You must be signed in to change notification settings - Fork 2
/
20190331_DiamondPOCP.R
102 lines (92 loc) · 3.77 KB
/
20190331_DiamondPOCP.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# replace blastp with diamond
# ref: https://github.com/bbuchfink/diamond
# install windows version:
# https://github.com/bbuchfink/diamond/releases
# add diamond.exe path to system env path ;"diamond.exe help" check if done
library(dbplyr)
library(seqinr)
if (!dir.exists('Database')) {
dir.create('Database')
}
if (!dir.exists('Result')) {
dir.create('Result')
}
genome.files <- list.files('Rawdata')
# makeblastdb
for (gn in genome.files) {
header.file <- strsplit(gn,'.',fixed = T)[[1]][1]
commond.makedb <- paste0('diamond.exe makedb --in Rawdata/',
gn, ' --db Database/', header.file)
system(commond.makedb)
}
# pair blast
genome.comn <- combn(genome.files,2)
blast.comm1 <- 'diamond.exe blastp -q Rawdata/'
blast.comm2 <- ' -d Database/'
blast.comm3 <- ' -e 1e-5 --id 40 -o Result/'
pocp.vector <- c()
for (i in (1:dim(genome.comn)[2]) ) {
a.genome <- genome.comn[,i][1]
b.genome <- genome.comn[,i][2]
a.header <- strsplit(a.genome,'.',fixed = T)[[1]][1]
b.header <- strsplit(b.genome,'.',fixed = T)[[1]][1]
a.genome.seq <- read.fasta(paste0('Rawdata/', a.genome),'AA')
b.genome.seq <- read.fasta(paste0('Rawdata/', b.genome),'AA')
a.total <- length(a.genome.seq)
b.total <- length(b.genome.seq)
str(a.genome.seq)
str(b.genome.seq)
a.seq.list <- names(a.genome.seq)
b.seq.list <- names(b.genome.seq)
a.seq.length <- c()
for (nm in a.seq.list) {
tmp.len <- length(a.genome.seq[[which(a.seq.list == nm)]])
a.seq.length <- append(a.seq.length, tmp.len)
}
b.seq.length <- c()
for (nm in b.seq.list) {
tmp.len <- length(b.genome.seq[[which(b.seq.list == nm)]])
b.seq.length <- append(b.seq.length, tmp.len)
}
a.seq.df <- data.frame(a.seq.list, a.seq.length)
colnames(a.seq.df) <- c('V1','length')
b.seq.df <- data.frame(b.seq.list, b.seq.length)
colnames(b.seq.df) <- c('V1','length')
print(paste0('-- Blasting: ',a.header,' - VS - ',b.header))
# blast forward
result.forward <- paste0(a.header,'_VS_',b.header,'.tab')
system(paste0(blast.comm1, a.genome,
blast.comm2, b.header,
blast.comm3, result.forward))
df.forward <- read.table(paste0('Result/',result.forward),
header = F,sep = '\t',
stringsAsFactors = F)
df.forward <- df.forward %>% distinct(V1,.keep_all = T)
df.forward <- merge(df.forward, a.seq.df, by = 'V1', all.x = T)
df.forward$align <- df.forward$V4 / df.forward$length
df.forward <- df.forward[which(df.forward$V3 > 40 & df.forward$align > 0.5 & df.forward$V11 < 1e-5),]
C1 <- dim(df.forward)[1]
# blast backward
result.backward <- paste0(b.header,'_VS_',a.header,'.tab')
system(paste0(blast.comm1, b.genome,
blast.comm2, a.header,
blast.comm3, result.backward))
df.backward <- read.table(paste0('Result/',result.backward),
header = F,sep = '\t',
stringsAsFactors = F)
df.backward <- df.backward %>% distinct(V1,.keep_all = T)
df.backward <- merge(df.backward, b.seq.df, by = 'V1', all.x = T)
df.backward$align <- df.backward$V4 / df.backward$length
df.backward <- df.backward[which(df.backward$V3 > 40 & df.backward$align > 0.5 & df.backward$V11 < 1e-5),]
C2 <- dim(df.backward)[1]
pocp <- (C1 + C2)/(a.total + b.total)
pocp.vector <- append(pocp.vector, paste0(a.header,'\t',b.header,'\t',pocp))
print(paste0('-- Pair blast done: ',a.header,' - VS - ',b.header))
print(paste0('-- The POCP : ', pocp))
print('----------------------------------')
}
write(pocp.vector, 'resultPOCP.txt')
unlink("Database", recursive = TRUE)
unlink("Result", recursive = TRUE)
dir.create('Database')
dir.create('Result')