forked from BioinformaticsArchive/blasr
-
Notifications
You must be signed in to change notification settings - Fork 0
/
WordCounter.cpp
111 lines (91 loc) · 2.44 KB
/
WordCounter.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <algorithm>
#include <stdlib.h>
#include "FASTASequence.h"
#include "FASTAReader.h"
#include "DNASequence.h"
#include "tuples/DNATuple.h"
#include "tuples/TupleMetrics.h"
#include "Types.h"
using namespace std;
int main(int argc, char* argv[]) {
FASTAReader reader;
if (argc < 5) {
cout << "usage: wordCounter seqFile tupleSize tupleOutputFile posOutputFile" << endl;
exit(1);
}
string fileName = argv[1];
int tupleSize = atoi(argv[2]);
string tupleListName = argv[3];
string posOutName = argv[4];
TupleMetrics tm;
tm.Initialize(tupleSize);
reader.Init(fileName);
FASTASequence seq;
reader.GetNext(seq);
vector<CountedDNATuple> tupleList;
CountedDNATuple tuple;
DNALength i;
for (i = 0; i < seq.length - tm.tupleSize + 1; i++ ) {
if (tuple.FromStringRL((Nucleotide*) (seq.seq + i), tm)) {
tuple.count = i;
tupleList.push_back(tuple);
}
}
std::sort(tupleList.begin(), tupleList.end());
int t;
int t2;
int numTuples = tupleList.size();
t = t2 = 0;
int numUnique = 0;
while (t < numTuples) {
t2 = t;
t2++;
while (t2 < numTuples and tupleList[t] == tupleList[t2]) {
t2++;
}
++numUnique;
t = t2;
}
ofstream countedTupleListOut;
countedTupleListOut.open(tupleListName.c_str(), ios_base::binary);
ofstream posOut;
posOut.open(posOutName.c_str(), ios_base::binary);
countedTupleListOut.write((const char*) &numUnique, sizeof(int));
countedTupleListOut.write((const char*) &tm.tupleSize, sizeof(int));
posOut.write((const char*) &numUnique, sizeof(int));
//
// Write out the tuple+counts to a file.
//
t = t2 = 0;
CountedDNATuple countedTuple;
int numMultOne = 0;
while (t < numTuples) {
t2 = t;
t2++;
while (t2 < numTuples and tupleList[t] == tupleList[t2]) {
t2++;
}
countedTuple.tuple = tupleList[t].tuple;
countedTuple.count = t2 - t;
if (countedTuple.count == 1) ++numMultOne;
countedTupleListOut.write((const char*) &countedTuple,sizeof(CountedDNATuple));
posOut.write((char*)&countedTuple.count, sizeof(int));
int tc;
for (tc = t; tc < t2; tc++) {
posOut.write((char*) &tupleList[tc].count, sizeof(int));
}
t = t2;
}
//
// Write out the positions of the tuples to a file.
//
posOut.close();
countedTupleListOut.close();
// cout << "found " << numUnique << " distinct " << DNATuple::TupleSize << "-mers." << endl;
cout << numMultOne << endl;
return 0;
}