diff --git a/Makefile b/Makefile index 8dde5e89..884e575f 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,6 @@ # order is important, libraries first -modules = api stats randgen validate mutations fasta alignmentDepth liftover lod maf chain extract analysis phyloP modify assemblyHub +modules = api stats randgen validate mutations fasta alignmentDepth liftover lod maf chain extract analysis phyloP modify assemblyHub synteny + .PHONY: all %.all clean %.clean doxy %.doxy diff --git a/synteny/Makefile b/synteny/Makefile new file mode 100644 index 00000000..97a92af8 --- /dev/null +++ b/synteny/Makefile @@ -0,0 +1,16 @@ +rootPath = ../ +include ../include.mk + +libHeaders = $(wildcard inc/*) +libSourcesAll = $(wildcard impl/*.cpp) +libSources=$(subst impl/main.cpp,,${libSourcesAll}) + +cpp = h5c++ ${h5prefix} + +all : ${binPath}/hal2synteny + +${binPath}/hal2synteny : impl/main.cpp ${libPath}/halLib.a ${libPath}/halLiftover.a ${basicLibsDependencies} + ${cpp} ${cppflags} -std=c++11 -I${sonLibPath} -I${rootPath}/liftover/inc -I${rootPath}/api/inc -Iinc -Iimpl impl/main.cpp ${libPath}/halLib.a ${libPath}/halLiftover.a ${sonLibPath}/sonLib.a -o ${binPath}/hal2synteny + +clean: + rm -f ${binPath}/hal2synteny diff --git a/synteny/impl/hal2psl.cpp b/synteny/impl/hal2psl.cpp new file mode 100644 index 00000000..adef33ea --- /dev/null +++ b/synteny/impl/hal2psl.cpp @@ -0,0 +1,108 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ + + +#include "hal2psl.h" +#include "psl.h" + + +void hal::Hal2Psl::storePslResults(std::vector& pslBlocks){ + + BedList::iterator i = _outBedLines.begin(); + for (; i != _outBedLines.end(); ++i) + { + if (_addExtraColumns == false) + { + i->_extra.clear(); + } + makeUpPsl(i->_psl, i->_blocks, i->_strand, i->_start, i->_chrName, + pslBlocks); + } + +} +std::vector hal::Hal2Psl::convert2psl(hal::AlignmentConstPtr alignment, + const hal::Genome* srcGenome, + const hal::Genome* tgtGenome, + const std::string srcChrom){ + + std::vector pslBlocks; + if (srcGenome->getNumSequences() > 0){ + _srcGenome = srcGenome; + _tgtGenome = tgtGenome; + _coalescenceLimit = NULL; + _traverseDupes = true; + _addExtraColumns = false; + _missedSet.clear(); + _tgtSet.clear(); + _tgtSet.insert(tgtGenome); + hal::SequenceIteratorConstPtr seqIt = srcGenome->getSequenceIterator(); + hal::SequenceIteratorConstPtr seqEnd = srcGenome->getSequenceEndIterator(); + for (; !seqIt->equals(seqEnd); seqIt->toNext()) { + _outBedLines.clear(); + _srcSequence = seqIt->getSequence(); + auto chrName = _srcSequence->getName(); + if (srcChrom != chrName) {continue;} + //else {std::cout << chrName << std::endl;} + _bedLine._start = 0; + _bedLine._end = _srcSequence->getSequenceLength(); + _mappedBlocks.clear(); + _outPSL = true; + visitBegin(); + liftInterval(_mappedBlocks); + if (_mappedBlocks.size()) { + assignBlocksToIntervals(); + } + //_outBedLines.sort(BedLineSrcLess()); + storePslResults(pslBlocks); + cleanResults(); + } + } + return pslBlocks; +} + + +void hal::Hal2Psl::makeUpPsl(const std::vector& vpsl, + const std::vector& blocks, + const char strand, + const hal_index_t start, + const std::string chrName, + std::vector& pslBlocks){ + assert(vpsl.size() == 1); + const hal::PSLInfo& psl = vpsl[0]; + + for (size_t i = 0; i < blocks.size(); ++i) { + PslBlock b = PslBlock(); + b.size = blocks[i]._length; + b.qName = psl._qSeqName; + b.qSize = psl._qSeqSize; + + b.qStart = psl._qBlockStarts[i] - psl._qChromOffset; + b.qEnd = b.qStart + b.size; + if (psl._qStrand == '-'){ + auto posStart = b.qStart; + b.qStart = psl._qSeqSize - posStart - b.size; + b.qEnd = psl._qSeqSize - posStart; + } + + b.tStart = blocks[i]._start + start; + b.tEnd = b.tStart + b.size; + if (strand == '-') + { + auto posStart = b.tStart; + b.tStart = psl._tSeqSize - posStart - b.size; + b.tEnd = psl._tSeqSize - posStart; + } + std::stringstream ss; + ss << psl._qStrand << strand; + ss >> b.strand; + //b.strand = psl._qStrand+'/'+strand; + b.tSize = psl._tSeqSize; + b.qSize = psl._qSeqSize; + b.tName = chrName; + pslBlocks.push_back(b); + } +} + diff --git a/synteny/impl/main.cpp b/synteny/impl/main.cpp new file mode 100644 index 00000000..58097376 --- /dev/null +++ b/synteny/impl/main.cpp @@ -0,0 +1,122 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ + +#include "hal.h" +#include "halCLParserInstance.h" + +#include "psl_io.cpp" +#include "psl_merger.cpp" +#include "hal2psl.cpp" + + +static hal::CLParserPtr initParser() { + auto optionsParser = hal::hdf5CLParserInstance(); + optionsParser->addArgument("halFile", "input psl file from liftover"); + optionsParser->addOption("queryGenome", "source genome", "\"\""); + optionsParser->addOption("targetGenome", "reference genome name", "\"\""); + optionsParser->addArgument("outPslPath", "output psl file ffor synteny blocks"); + optionsParser->addOption("minBlockSize", + "lower bound on synteny block length", + 5000); + optionsParser->addOption("maxAnchorDistance", + "upper bound on distance for syntenic psl blocks", + 5000); + optionsParser->addOption("queryChromosome", + "chromosome to infer synteny", + "\"\""); + return optionsParser; + +} + +const hal::Genome* openGenomeOrThrow(const hal::AlignmentConstPtr& alignment, + const std::string& genomeName) { + const hal::Genome* genome = alignment->openGenome(genomeName); + if (genome == NULL){ + throw hal_exception(std::string("Reference genome, ") + genomeName + + ", not found in alignment"); + } + return genome; +} + +hal::AlignmentConstPtr openAlignmentOrThrow(const std::string& halFile, + const hal::CLParserPtr& optionsParser){ + + auto alignment = hal::openHalAlignmentReadOnly(halFile, optionsParser); + if (alignment->getNumGenomes() == 0){ + throw hal_exception("hal alignment is empty"); + } + return alignment; +} + + void validateInputOrThrow(const std::string& queryGenomeName, + const std::string& targetGenomeName, + const std::string& queryChromosome) { + if (queryGenomeName == "\"\"" || targetGenomeName == "\"\"" || queryChromosome == "\"\""){ + throw hal_exception("--queryGenome and --targetGenome and --queryChromosome must be" + "specified"); + } + + if (queryGenomeName == targetGenomeName){ + throw hal_exception("--queryGenome and --targetGenome must be" + "different"); + } +} + + + +int main(int argc, char *argv[]) { + hal::CLParserPtr optionsParser = initParser(); + std::string halFile; + std::string outPslPath; + std::string queryGenomeName; + std::string targetGenomeName; + std::string queryChromosome; + hal_size_t minBlockSize; + hal_size_t maxAnchorDistance; + try { + optionsParser->parseOptions(argc, argv); + halFile = optionsParser->getArgument("halFile"); + queryGenomeName = optionsParser->getOption("queryGenome"); + targetGenomeName = optionsParser->getOption("targetGenome"); + outPslPath = optionsParser->getArgument("outPslPath"); + minBlockSize = optionsParser->getOption("minBlockSize"); + maxAnchorDistance = optionsParser->getOption("maxAnchorDistance"); + queryChromosome = optionsParser->getOption("queryChromosome"); + optionsParser->setDescription("convert psl alignments into synteny blocks"); + } + catch(std::exception& e) { + std::cerr << e.what() << std::endl; + optionsParser->printUsage(std::cerr); + exit(1); + } + + validateInputOrThrow(queryGenomeName,targetGenomeName,queryChromosome); + try { + auto alignment = openAlignmentOrThrow(halFile, optionsParser); + auto targetGenome = openGenomeOrThrow(alignment, targetGenomeName); + auto queryGenome = openGenomeOrThrow(alignment, queryGenomeName); + + + auto hal2psl = hal::Hal2Psl(); + //std::cout << "reading hal " << std::endl; + auto blocks = hal2psl.convert2psl(alignment, queryGenome, + targetGenome, queryChromosome); + + + //std::cout << "merging " << blocks.size() << " blocks"<< std::endl; + auto merged_blocks = dag_merge(blocks, + minBlockSize, maxAnchorDistance); + //std::cout << "writing psl " << std::endl; + psl_io::write_psl(merged_blocks, outPslPath); + } + catch(std::exception& e) + { + std::cerr << "Exception caught: " << e.what() << std::endl; + return 1; + } + return 0; + +} diff --git a/synteny/impl/psl_io.cpp b/synteny/impl/psl_io.cpp new file mode 100644 index 00000000..88f8d8b0 --- /dev/null +++ b/synteny/impl/psl_io.cpp @@ -0,0 +1,102 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ +#include "psl_io.h" + +namespace psl_io { +std::vector split(const std::string &s, char delim) { + std::stringstream ss(s); + std::string item; + std::vector elems; + while (std::getline(ss, item, delim)) { + elems.push_back(std::move(item)); + } + return elems; +} + + +std::vector get_blocks_set(const std::string psl) { + std::ifstream input(psl); + std::vector blocks; + for (std::string line; std::getline(input, line);) { + auto cur_b = Psl(line).get_blocks(); + blocks.insert(blocks.end(), cur_b.begin(), cur_b.end()); + } + return blocks; + } + +std::vector get_qInserts(const std::vector& blocks) { + std::vector result; + for (int i=0; i < int(blocks.size())-1; ++i) { + auto dif = blocks[i+1].qStart - blocks[i].qEnd; + if (dif > 0) { + result.push_back(dif); + } + } + return result; +} + +std::vector get_tInserts(const std::vector& blocks) { + std::vector result; + for (int i=0; i < int(blocks.size())-1; ++i) { + auto dif = blocks[i+1].tStart - blocks[i].tEnd; + if (dif > 0) { + result.push_back(dif); + //std::cout << blocks[i+1].tStart << " " << blocks[i].tEnd << " " << dif << std::endl; + } + } + return result; +} + + Psl construct_psl(std::vector blocks) { + auto psl = Psl(); + psl.match = std::accumulate(blocks.begin(), blocks.end(), 0, + [] (int total, PslBlock item) + { return total + item.qEnd - item.qStart; }); + psl.misMatch = 0; + psl.repMatch = 0; + psl.nCount = 0; + auto qInserts = get_qInserts(blocks); + psl.qNumInsert = qInserts.size(); + psl.qBaseInsert = std::accumulate(qInserts.begin(), qInserts.end(), 0); + auto tInserts = get_tInserts(blocks); + psl.tNumInsert = tInserts.size(); + psl.tBaseInsert = std::accumulate(tInserts.begin(), tInserts.end(), 0); + psl.qName = blocks[0].qName; + psl.qSize = blocks[0].qSize; + psl.qStart = blocks[0].qStart; + psl.qEnd = blocks.back().qEnd; + psl.tName = blocks[0].tName; + psl.tSize = blocks[0].tSize; + psl.strand = blocks[0].strand; + psl.blockCount = blocks.size(); + psl.blocks = blocks; + //if (psl.strand == "++") { + psl.tStart = blocks.front().tStart; + psl.tEnd = blocks.back().tEnd; + /*} + else if (psl.strand == "+-") { + psl.tEnd = psl.tSize - blocks[0].tStart; + psl.tStart = psl.tSize - blocks.back().tEnd; + //??for (auto b: psl.blocks) { + // b.tStart = psl.tSize - b.tStart; + //} + //std::reverse(psl.blocks.begin(),psl.blocks.end()); + + }*/ + return psl; + } + + void write_psl(const std::vector >& merged_blocks, + const std::string& outFilePath) { + std::ofstream ofs; + ofs.open (outFilePath, std::ofstream::out); + for (auto path : merged_blocks){ + auto psl = construct_psl(path); + ofs << psl << std::endl; + } + ofs.close(); + } + } diff --git a/synteny/impl/psl_merger.cpp b/synteny/impl/psl_merger.cpp new file mode 100644 index 00000000..779389ad --- /dev/null +++ b/synteny/impl/psl_merger.cpp @@ -0,0 +1,161 @@ +#include "psl_merger.h" + + +// Assumes a.start < b.start +bool are_syntenic(const PslBlock& a, const PslBlock& b){ + return a.qEnd <= b.qStart && + a.tEnd <= b.tStart && + a.tName == b.tName && + a.strand == b.strand; +} + +bool is_not_overlapping_ordered_pair(const PslBlock& a, const PslBlock& b, + const hal_size_t threshold) { + return are_syntenic(a, b) && + 0 <= b.qStart - a.qEnd && + b.qStart - a.qEnd < threshold && + 0 <= b.tStart - a.tEnd && + b.tStart - a.tEnd < threshold; +} + +std::vector get_next(const int pos, + const std::vector& queryGroup, + const hal_size_t maxAnchorDistance) { + std::vector f; + for (auto i = pos + 1; i < (int) queryGroup.size(); ++i) { + if (is_not_overlapping_ordered_pair(queryGroup[pos], queryGroup[i], + maxAnchorDistance)) { + if (f.empty()) f.push_back(i); + else { + if (not f.empty() && + is_not_overlapping_ordered_pair(queryGroup[f[0]], + queryGroup[i], maxAnchorDistance)) { + return f; + } + else { + f.push_back(i); + } + } + } + } + + return f; +} + + +// *dag* is represented as a map: vertex -> its possible next vertices; +// *hidden_vertices* is a set of vertices that are already in paths; +// Weight of an edge equals length of the next psl block; +// Weight of a vertex equals estimated weight: +// w_j < w_i + w_e(ij) => w_j must be updated +// Also keeps track of how we came to this state: +// (prev_vertex, weight) +std::map > + weigh_dag(const std::vector& group, + std::map >& dag, + const std::set& hiddenVertices, + const hal_size_t maxAnchorDistance){ + std::map > weightedDag; + for (int i = 0; i < (int) group.size(); ++i) { + if (hiddenVertices.count(i)) continue; + std::vector nexts; + if (not dag.count(i)) { + nexts = get_next(i, group, maxAnchorDistance); + dag[i] = nexts; + } + else { + nexts = dag[i]; + } + // If this vertex was never visited then its weight equals to its size. + // Intuition: otherwise we will never count its size + if (not weightedDag.count(i)) + weightedDag[i] = std::make_pair(-1, group[i].size); + for (auto j: nexts){ + if (hiddenVertices.count(j)) continue; + auto alternativeWeight = weightedDag[i].second + group[j].size; + if (not weightedDag.count(j) or + weightedDag[j].second < alternativeWeight) + // Remembers previous vertex itself and + // its corresponding weight: + // previous vertex w_i + weight of the next edge + weightedDag[j] = std::make_pair(i, alternativeWeight); + } + } + return weightedDag; +} + +int get_maxed_vertex(const std::map >& weightedDag) { + hal_size_t maxWeight = weightedDag.cbegin()->second.second; + int maxPos = weightedDag.cbegin()->first; + for (auto it = weightedDag.cbegin(); it != weightedDag.cend(); ++it) { + if (it->second.second >= maxWeight) { + maxWeight = it->second.second; + maxPos = it->first; + } + } + return maxPos; +} + +std::vector traceback(std::map >& weightedDag, + std::set& hiddenVertices, + const std::vector& group) { + // Chooses the path of the heaviest weight + auto startVertex = get_maxed_vertex(weightedDag); + std::vector path = {startVertex}; + auto prevVertex = weightedDag[startVertex].first; + while (prevVertex != -1) { + path.push_back(prevVertex); + prevVertex = weightedDag[prevVertex].first; + } + hiddenVertices.insert(path.begin(), path.end()); + std::vector pslBlockPath; + std::transform(path.begin(), path.end(), std::back_inserter(pslBlockPath), + [group] (int x) -> PslBlock {return group[x];}); + std::reverse(pslBlockPath.begin(),pslBlockPath.end()); + return pslBlockPath; +} +struct { + bool operator()(PslBlock a, PslBlock b) const + { + if (a.qStart < b.qStart ) + return true; + else if (a.qStart == b.qStart) { + return a.tStart <= b.tStart; + } + return false; + } +} qStartLess; + +std::vector > dag_merge(const std::vector& blocks, + const hal_size_t minBlockBreath, + const hal_size_t maxAnchorDistance){ + std::map > blocksByQName; + for (auto block : blocks) blocksByQName[block.qName].push_back(block); + std::vector > paths; + for (auto pairs : blocksByQName) { + std::vector group = pairs.second; + std::string qName = pairs.first; + std::map > dag; + std::set hiddenVertices; + std::sort(group.begin(), group.end(), qStartLess); + while (hiddenVertices.size() != group.size()){ + auto weightedDag = weigh_dag(group, dag, hiddenVertices, + maxAnchorDistance); + auto path = traceback(weightedDag, hiddenVertices, group); + if (path.empty()) { + break; + } + auto qLen = path.back().qEnd - path[0].qStart; + auto tLen = path.back().tEnd - path[0].tStart; + if (qLen >= minBlockBreath && tLen >= minBlockBreath){ + //if (qLen >= minBlockBreath && tLen >= minBlockBreath){ + paths.push_back(path); + } + } + } + return paths; + +} + + + diff --git a/synteny/inc/hal2psl.h b/synteny/inc/hal2psl.h new file mode 100644 index 00000000..ac2006d7 --- /dev/null +++ b/synteny/inc/hal2psl.h @@ -0,0 +1,41 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ + +/* + * File: hal2psl.h + * Author: admin + * + * Created on July 11, 2018, 12:15 PM + */ + +#ifndef HAL2PSL_H +#define HAL2PSL_H + +#include "halBedLine.h" +#include "halBlockLiftover.h" + +namespace hal { +class Hal2Psl : public hal::BlockLiftover { + + void storePslResults(std::vector& pslBlocks); + void makeUpPsl(const std::vector& vpsl, + const std::vector& blocks, + const char strand, + const hal_index_t start, + const std::string chrName, + std::vector& pslBlocks); + + public: + + Hal2Psl(){} + std::vector convert2psl(hal::AlignmentConstPtr alignment, + const hal::Genome* srcGenome, + const hal::Genome* tgtGenome, + const std::string srcChrom); +}; +} +#endif /* HAL_MERGER_H */ + diff --git a/synteny/inc/psl.h b/synteny/inc/psl.h new file mode 100644 index 00000000..52611e1b --- /dev/null +++ b/synteny/inc/psl.h @@ -0,0 +1,201 @@ +#ifndef PSL_H +#define PSL_H + +#include +#include +#include +#include + +struct PslBlock { + hal_size_t qStart; + hal_size_t qEnd; + hal_size_t tStart; + hal_size_t tEnd; + hal_size_t size; + hal_size_t tSize; + hal_size_t qSize; + std::string qName; + std::string tName; + std::string tSeq; + std::string qSeq; + std::string strand; + PslBlock(hal_size_t qStart, hal_size_t tStart, hal_size_t size, std::string strand, + const std::string& qName, const std::string& tName, + int qSize, int tSize, + const std::string& qSeq = "", + const std::string& tSeq = ""){ + this->qStart = qStart; + this->qSize = qSize; + this->qEnd = qStart + size; + this->tStart = tStart; + this->tEnd = tStart + size; + this->tSize = tSize; + this->strand = strand; + this->size = size; + this->qName = qName; + this->tName = tName; + this->tSeq = tSeq; + this->qSeq = qSeq; + } + PslBlock() {} + + + + //void initBlock(ColumnIteratorConstPtr col); +}; + + +class Psl { + public: + int match; + int misMatch; + int repMatch; + int nCount; + int qNumInsert; + int qBaseInsert; + int tNumInsert; + int tBaseInsert; + std::string strand; + std::string qName; + hal_size_t qSize; + hal_size_t qStart; + hal_size_t qEnd; + std::string tName; + hal_size_t tSize; + hal_size_t tStart; + hal_size_t tEnd; + int blockCount; + bool haveSeqs; + std::vector blocks; + + private: + std::vector split(const std::string &s, char delim) { + std::stringstream ss(s); + std::string item; + std::vector elems; + while (std::getline(ss, item, delim)) { + elems.push_back(std::move(item)); + } + return elems; + } + + std::vector intArraySplit(const std::string& commaStr){ + std::vector ints; + for (auto s : split(commaStr,',')) + ints.push_back(stoi(s)); + return ints; + } + + void parseBlocks(const std::string& blockSizesStr, + const std::string& qStartsStr, + const std::string& tStartsStr, + hal_size_t qSize, hal_size_t tSize, + const std::string& qSeqsStr, + const std::string& tSeqsStr) { + auto blockSizes = intArraySplit(blockSizesStr); + auto qStarts = intArraySplit(qStartsStr); + auto tStarts = intArraySplit(tStartsStr); + auto haveSeqs = (not qSeqsStr.empty()); + std::vector qSeqs; + std::vector tSeqs; + if (haveSeqs){ + qSeqs = split(qSeqsStr,','); + tSeqs = split(tSeqsStr,','); + } + for (int i = 0; i < this->blockCount; ++i){ + this->blocks.push_back(PslBlock( qStarts[i], tStarts[i], + blockSizes[i], strand, + qName, tName, qSize, tSize, + (haveSeqs ? qSeqs[i] : ""), + (haveSeqs ? tSeqs[i] : ""))); + } + } + + void parse(std::vector row) { + match = stoi(row[0]); + misMatch = stoi(row[1]); + repMatch = stoi(row[2]); + nCount = stoi(row[3]); + qNumInsert = stoi(row[4]); + qBaseInsert = stoi(row[5]); + tNumInsert = stoi(row[6]); + tBaseInsert = stoi(row[7]); + strand = row[8]; + qName = row[9]; + qSize = (hal_size_t)stoi(row[10]); + qStart = (hal_size_t)stoi(row[11]); + qEnd = (hal_size_t)stoi(row[12]); + tName = row[13]; + tSize = (hal_size_t)stoi(row[14]); + tStart = (hal_size_t)stoi(row[15]); + tEnd = (hal_size_t)stoi(row[16]); + blockCount = stoi(row[17]); + haveSeqs = row.size() > 21; + parseBlocks(row[18], row[19], row[20], qSize, tSize, + (haveSeqs ? row[21] : ""), + (haveSeqs ? row[22] : "")); + } + + std::string vector_join(std::vector v, std::string sep) const { + std::string s = ""; + for (int i = 0; i < (int) v.size(); ++i) { + s += v[i]; + s += (i < int(v.size())-1 ? sep : ""); + } + return s; + } + public: + Psl(const std::string& line) { + auto v = split(line,'\t'); + if (not (v.size() == 1) or line[0] == '#'){ + parse(v); + } + } + + Psl() {} + + std::vector get_blocks() { + return blocks; + } + + friend std::ostream& operator<<(std::ostream &strm, const Psl &a) ; + +}; + + std::ostream& operator<<(std::ostream &strm, const Psl &a) { + + std::vector v; + v.push_back(std::to_string(a.match)); + v.push_back(std::to_string(a.misMatch)); + v.push_back(std::to_string(a.repMatch)); + v.push_back(std::to_string(a.nCount)); + v.push_back(std::to_string(a.qNumInsert)); + v.push_back(std::to_string(a.qBaseInsert)); + v.push_back(std::to_string(a.tNumInsert)); + v.push_back(std::to_string(a.tBaseInsert)); + v.push_back(a.strand); + v.push_back(a.qName); + v.push_back(std::to_string(a.qSize)); + v.push_back(std::to_string(a.qStart)); + v.push_back(std::to_string(a.qEnd)); + v.push_back(a.tName); + v.push_back(std::to_string(a.tSize)); + v.push_back(std::to_string(a.tStart)); + v.push_back(std::to_string(a.tEnd)); + v.push_back(std::to_string(a.blockCount)); + std::vector ints; + std::vector qstarts; + std::vector tstarts; + for (auto b: a.blocks){ + ints.push_back(std::to_string(b.size)); + qstarts.push_back(std::to_string(b.qStart)); + tstarts.push_back(std::to_string(b.tStart)); + } + v.push_back(a.vector_join(ints, ",")+","); + v.push_back(a.vector_join(qstarts, ",")+","); + v.push_back(a.vector_join(tstarts, ",")+","); + //TODO: add qseqs and tseqs! + return strm << a.vector_join(v, "\t"); + } + +#endif /*PSL_H*/ diff --git a/synteny/inc/psl_io.h b/synteny/inc/psl_io.h new file mode 100644 index 00000000..b4ef89fd --- /dev/null +++ b/synteny/inc/psl_io.h @@ -0,0 +1,32 @@ +#ifndef PSL_IO_H +#define PSL_IO_H + +#include +#include +#include +#include +#include + +#include "psl.h" + + +namespace psl_io { +std::vector split(const std::string &s, char delim); + + +std::vector get_blocks_set(const std::string psl); + +std::vector get_qInserts(const std::vector& blocks); + +std::vector get_tInserts(const std::vector& blocks); + + Psl construct_psl(std::vector blocks); + + void write_psl(const std::vector >& merged_blocks, + const std::string& outFilePath); + + } + +#endif /* PSL_IO_H */ + + diff --git a/synteny/inc/psl_merger.h b/synteny/inc/psl_merger.h new file mode 100644 index 00000000..9ef65e42 --- /dev/null +++ b/synteny/inc/psl_merger.h @@ -0,0 +1,57 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ + +/* + * File: psl_merger.h + * Author: admin + * + * Created on July 6, 2018, 1:13 PM + */ + +#ifndef PSL_MERGER_H +#define PSL_MERGER_H + +#include +#include +#include +#include +#include +#include +#include + +#include "psl.h" +bool are_syntenic(const PslBlock& a, const PslBlock& b); + +bool is_not_overlapping_ordered_pair(const PslBlock& a, const PslBlock& b, + const hal_size_t threshold=5000); + +std::vector get_next(const int pos, + const std::vector& queryGroup, + const hal_size_t maxAnchorDistance=5000); + + +std::map > + weigh_dag(const std::vector& group, + std::map >& dag, + const std::set& hiddenVertices, + const hal_size_t maxAnchorDistance); + +int get_maxed_vertex(const std::map >& weightedDag); + +std::vector traceback(std::map >& weightedDag, + std::set& hiddenVertices, + const std::vector& group) ; + + +std::vector > + dag_merge(const std::vector& blocks, + const hal_size_t minBlockBreath, + const hal_size_t maxAnchorDistance); + + + +#endif /* PSL_MERGER_H */ +