Skip to content

Commit

Permalink
Merge pull request #8 from schutzekatze/strandlr
Browse files Browse the repository at this point in the history
Add strand information to indexlr
  • Loading branch information
lcoombe authored Jul 31, 2019
2 parents df11fe9 + b211e61 commit 33c9e16
Show file tree
Hide file tree
Showing 7 changed files with 51 additions and 15 deletions.
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@
url="https://github.com/bcgsc/ntJoin",
license="GPLv3",
python_requires=">=3",
install_requires=["python-igraph", "pybedtools"],
install_requires=["python-igraph", "pysam==0.15.2", "pybedtools"],
scripts = ["bin/ntjoin_assemble.py", "bin/read_fasta.py"],
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
"Operating System :: OS Independent",
],
)
)
5 changes: 4 additions & 1 deletion src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,16 @@ clang-format:
# Reformat every .cc file in this directory in place (-i) with clang-format,
# using the style taken from the style file (-style=file).
fix:
for i in *.cc; do clang-format -i -style=file $$i; done

# Aggregate test target: its prerequisites are the individual check-* targets.
# Both the pre-change and post-change prerequisite lists of the diff are shown;
# the second one adds check-physlr-index-pos-strand-fasta.
check: check-physlr-index-fasta check-physlr-index-pos-fasta check-physlr-index-fastq
check: check-physlr-index-fasta check-physlr-index-pos-fasta check-physlr-index-pos-strand-fasta check-physlr-index-fastq

# Run physlr-indexlr on data/mt.fa (16 threads, k=100, w=5) and compare its
# standard output with data/mt.physlr.tsv.good; diff -q only reports whether
# the two differ.
check-physlr-index-fasta: all
./physlr-indexlr -t16 -k100 -w5 data/mt.fa | diff -q - data/mt.physlr.tsv.good

# Same FASTA input and parameters, with --pos so minimizer positions are
# included in the output; compared against data/mt.pos.physlr.tsv.good.
check-physlr-index-pos-fasta: all
./physlr-indexlr -t16 -k100 -w5 --pos data/mt.fa | diff -q - data/mt.pos.physlr.tsv.good

# Same FASTA input and parameters, with --pos and --strand so each minimizer is
# written with its position and strand; compared against
# data/mt.pos.strand.physlr.tsv.good.
check-physlr-index-pos-strand-fasta: all
./physlr-indexlr -t16 -k100 -w5 --pos --strand data/mt.fa | diff -q - data/mt.pos.strand.physlr.tsv.good

# Run physlr-indexlr on the FASTQ file data/tiny.fq (16 threads, k=100, w=5)
# and compare its standard output with data/tiny.physlr.tsv.good.
check-physlr-index-fastq: all
./physlr-indexlr -t16 -k100 -w5 data/tiny.fq | diff -q - data/tiny.physlr.tsv.good
6 changes: 6 additions & 0 deletions src/data/mt.pos.strand.physlr.tsv.good
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
NC_012920.1_1-20
NC_012920.1_1-50
NC_012920.1_1-100
NC_012920.1_1-200 896630588373961382:4:+ 313487172214266647:7:+ 444014926207837182:12:+ 1523991246200341572:16:+ 1798117818543673513:19:+ 2967331913030798801:23:- 2388921502351613390:25:+ 3716646830862062911:29:+ 1735167774963772659:31:- 288063344517841377:32:+ 4102877096637136477:33:- 641502725708262787:38:- 41044066726412578:41:+ 2350527039690878033:44:+ 85461042879685439:48:+ 1623544596540149536:51:+ 199833590405879266:55:+ 711536535718262269:58:- 3135301605243919960:63:+ 6529271729409396546:67:+ 2411583937347593083:69:- 359586185935660863:70:- 1700127031481798181:73:- 176618573644396936:77:+ 2818324382397424587:82:+ 2254253362621705587:84:- 2690978069362398879:86:- 3108890209444811361:88:+ 4205045204998867944:93:- 678915414288388982:94:- 333577863667661999:95:+ 1478537368179398632:98:+
NC_012920.1_201-300
NC_012920.1.301-500 2141316536722319392:1:+ 3565582430772079417:4:+ 2941767525567243686:7:+ 7838669594887132640:12:+ 3477874077767130748:13:+ 2549223782120654761:16:- 2434113068435911923:18:+ 880875233461341501:21:- 69411230565005532:24:- 14406526176968659:29:+ 941706627830351327:32:- 8381363673706244476:35:- 713807191179521022:38:+ 2312796283123655724:40:+ 2591156828349162730:43:+ 2438729844916970308:46:+ 2417185650562727676:50:+ 1759381317048577652:52:+ 430284189424740148:57:+ 5821257845948055929:60:- 4379695023087714913:64:- 1632490710302323532:66:+ 2105929453292490301:70:+ 790596247142288701:74:+ 3722341532378510788:75:- 4601184236397494939:79:- 869485915964788930:82:+ 55823568601820076:86:- 4672055442291179162:91:- 3009586415680029419:95:- 1083798195764192325:98:+
14 changes: 8 additions & 6 deletions src/include/indexlr-minimize.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,19 @@ startsWith(const std::string& s, const char (&prefix)[N])
return s.size() > n && equal(s.begin(), s.begin() + n, prefix);
}

// Record stored for each hashed k-mer of a read. hashKmers() fills it from an
// ntHashIterator: the first hash value, the iterator position and, in the
// HashData version, the iterator strand.
// The pre-change (HashAndPos) and post-change (HashData) lines of the diff
// appear interleaved here.
struct HashAndPos
struct HashData
{
HashAndPos(uint64_t hash, size_t pos)
HashData(uint64_t hash, size_t pos, char strand)
: hash(hash)
, pos(pos)
, strand(strand)
{}
// Hash value of the k-mer; the minimizer selection compares this field.
uint64_t hash;
// Position reported by the hash iterator for this k-mer.
size_t pos;
// Strand character reported by the hash iterator ('+' or '-').
char strand;
};

using HashValues = std::vector<HashAndPos>;
using HashValues = std::vector<HashData>;

// Hash the k-mers of a read using ntHash.
static inline HashValues
Expand All @@ -43,7 +45,7 @@ hashKmers(const std::string& readstr, const size_t k)
}
hashes.reserve(readstr.size() - k + 1);
for (ntHashIterator iter(readstr, 1, k); iter != ntHashIterator::end(); ++iter) {
hashes.push_back(HashAndPos((*iter)[0], iter.pos()));
hashes.push_back(HashData((*iter)[0], iter.pos(), iter.strand()));
}
return hashes;
}
Expand Down Expand Up @@ -95,7 +97,7 @@ getMinimizers(const HashValues& hashes, const unsigned w)
auto rightIt = leftIt + w;
if (i < leftIt - firstIt) {
// Use of operator '<=' returns the minimum that is furthest from left.
minIt = std::min_element(leftIt, rightIt, [](const HashAndPos& a, const HashAndPos& b) {
minIt = std::min_element(leftIt, rightIt, [](const HashData& a, const HashData& b) {
return a.hash <= b.hash;
});
} else if (rightIt[-1].hash <= minIt->hash) {
Expand All @@ -120,4 +122,4 @@ assert_good(const std::ios& stream, const std::string& path)
}
}

#endif
#endif
15 changes: 12 additions & 3 deletions src/include/indexlr-workers.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,12 +80,14 @@ class MinimizeWorker
size_t k,
size_t w,
bool withPositions,
bool withStrands,
bool verbose,
InputWorker& inputWorker,
OutputWorker& outputWorker)
: k(k)
, w(w)
, withPositions(withPositions)
, withStrands(withStrands)
, verbose(verbose)
, inputWorker(inputWorker)
, outputWorker(outputWorker)
Expand All @@ -95,6 +97,7 @@ class MinimizeWorker
: k(worker.k)
, w(worker.w)
, withPositions(worker.withPositions)
, withStrands(worker.withStrands)
, verbose(worker.verbose)
, inputWorker(worker.inputWorker)
, outputWorker(worker.outputWorker)
Expand All @@ -104,6 +107,7 @@ class MinimizeWorker
: k(worker.k)
, w(worker.w)
, withPositions(worker.withPositions)
, withStrands(worker.withStrands)
, verbose(worker.verbose)
, inputWorker(worker.inputWorker)
, outputWorker(worker.outputWorker)
Expand All @@ -122,6 +126,7 @@ class MinimizeWorker
size_t k = 0;
size_t w = 0;
bool withPositions = false;
bool withStrands = false;
bool verbose = false;
InputWorker& inputWorker;
OutputWorker& outputWorker;
Expand Down Expand Up @@ -208,14 +213,15 @@ InputWorker::work()
reads.num = inputNum - 1;
}

buffer.releaseWriteAccess(currentNum);

if (done) {
allRead = true;
buffer.releaseWriteAccess(currentNum);
if (buffer.elements() == 0) {
buffer.close();
}
break;
} else {
buffer.releaseWriteAccess(currentNum);
}
}

Expand Down Expand Up @@ -289,6 +295,9 @@ MinimizeWorker::work()
if (withPositions) {
ss << ':' << m.pos;
}
if (withStrands) {
ss << ':' << m.strand;
}
sep = ' ';
}
ss << '\n';
Expand Down Expand Up @@ -322,4 +331,4 @@ OutputWorker::work()
} while (!inputWorker.allRead || lastWritten != inputWorker.inputNum - 1);
}

#endif
#endif
17 changes: 14 additions & 3 deletions src/physlr-indexlr.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ minimizeReads(
const size_t w,
const size_t t,
const bool withPositions,
const bool withStrands,
const bool verbose)
{
InputWorker inputWorker(ipath);
Expand All @@ -36,7 +37,7 @@ minimizeReads(
outputWorker.start();

auto minimizeWorkers = std::vector<MinimizeWorker>(
t, MinimizeWorker(k, w, withPositions, verbose, inputWorker, outputWorker));
t, MinimizeWorker(k, w, withPositions, withStrands, verbose, inputWorker, outputWorker));
for (auto& worker : minimizeWorkers) {
worker.start();
}
Expand All @@ -62,6 +63,7 @@ printUsage(const std::string& progname)
" -k K use K as k-mer size\n"
" -w W use W as sliding-window size\n"
" --pos include minimizer positions in the output\n"
" --strand include minimizer strand in the output\n"
" -v enable verbose output\n"
" -o FILE write output to FILE, default is stdout\n"
" -t N use N number of threads (default 1, max 5)\n"
Expand All @@ -84,9 +86,11 @@ main(int argc, char* argv[])
bool w_set = false;
bool k_set = false;
static int withPositions = 0;
static int withStrands = 0;
char* end = nullptr;
std::string outfile("/dev/stdout");
static const struct option longopts[] = { { "pos", no_argument, &withPositions, 1 },
{ "strand", no_argument, &withStrands, 1 },
{ "help", no_argument, &help, 1 },
{ nullptr, 0, nullptr, 0 } };
while ((c = getopt_long(argc, argv, "k:w:o:vt:", longopts, &optindex)) != -1) {
Expand Down Expand Up @@ -149,8 +153,15 @@ main(int argc, char* argv[])

for (auto& infile : infiles) {
minimizeReads(
infile == "-" ? "/dev/stdin" : infile, outfile, k, w, t, withPositions, verbose);
infile == "-" ? "/dev/stdin" : infile,
outfile,
k,
w,
t,
withPositions,
withStrands,
verbose);
}

return 0;
}
}
5 changes: 5 additions & 0 deletions src/vendor/ntHashIterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,11 @@ class ntHashIterator
return m_pos;
}

/**
 * get strand of the current k-mer: '+' when m_fhVal is strictly less
 * than m_rhVal, '-' otherwise (equal values also give '-')
 */
char strand() const
{
    const bool fhBelowRh = m_fhVal < m_rhVal;
    if (fhBelowRh) {
        return '+';
    }
    return '-';
}

/** get pointer to hash values for current k-mer */
const uint64_t* operator*() const
{
Expand Down

0 comments on commit 33c9e16

Please sign in to comment.